@Override
  public <E> Dataset<E> create(String name, DatasetDescriptor descriptor) {

    Preconditions.checkArgument(name != null, "Name cannot be null");
    Preconditions.checkArgument(descriptor != null, "Descriptor cannot be null");
    Preconditions.checkArgument(
        descriptor.getLocation() == null,
        "Descriptor location cannot be set; it is assigned by the MetadataProvider");

    DatasetDescriptor newDescriptor = metadataProvider.create(name, descriptor);
    newDescriptor = addRepositoryUri(newDescriptor);

    final URI location = newDescriptor.getLocation();
    if (location == null) {
      throw new DatasetRepositoryException(
          "[BUG] MetadataProvider did not assign a location to dataset:" + name);
    }

    ensureExists(newDescriptor, conf);

    logger.debug(
        "Created dataset:{} schema:{} datasetPath:{}",
        new Object[] {name, newDescriptor.getSchema(), location.toString()});

    return new FileSystemDataset.Builder()
        .name(name)
        .configuration(conf)
        .descriptor(newDescriptor)
        .partitionKey(
            newDescriptor.isPartitioned()
                ? org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey()
                : null)
        .partitionListener(getPartitionListener())
        .build();
  }
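A minimal usage sketch of create(): the repository URI, dataset name, and schema below are illustrative, and DatasetRepositories.open() is assumed as the entry point. The descriptor deliberately carries no location, since the precondition above leaves location assignment to the MetadataProvider.

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetRepositories;
import org.kitesdk.data.DatasetRepository;

public class CreateExample {
  public static void main(String[] args) {
    // Illustrative file-based repository URI.
    DatasetRepository repo = DatasetRepositories.open("repo:file:/tmp/example");

    // A simple Avro record schema for the new dataset.
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"User\",\"fields\":"
            + "[{\"name\":\"username\",\"type\":\"string\"}]}");

    // No location is set on the descriptor; the MetadataProvider assigns one.
    Dataset<GenericRecord> users = repo.create(
        "users", new DatasetDescriptor.Builder().schema(schema).build());
  }
}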
  /**
   * Creates, if necessary, the location for {@code descriptor}.
   *
   * @param descriptor A DatasetDescriptor whose location should exist
   * @param conf A Configuration
   */
  static void ensureExists(DatasetDescriptor descriptor, Configuration conf) {
    Preconditions.checkArgument(
        descriptor.getLocation() != null,
        "Cannot get FileSystem for a descriptor with no location");
    final Path dataPath = new Path(descriptor.getLocation());

    final FileSystem fs = fsForPath(dataPath, conf);

    try {
      if (!fs.exists(dataPath)) {
        fs.mkdirs(dataPath);
      }
    } catch (IOException ex) {
      throw new DatasetRepositoryException("Cannot access data location", ex);
    }
  }
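The helper amounts to an idempotent mkdirs on the descriptor's location. A standalone sketch of the same check with the plain Hadoop FileSystem API (the path is illustrative):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class EnsureExistsSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path dataPath = new Path("file:/tmp/example/users");

    // Same lookup that fsForPath() wraps: resolve the FileSystem from the path.
    FileSystem fs = dataPath.getFileSystem(conf);

    // Create the data directory only if it is missing.
    if (!fs.exists(dataPath)) {
      fs.mkdirs(dataPath);
    }
  }
}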
    public Builder descriptor(DatasetDescriptor descriptor) {
      Preconditions.checkArgument(
          descriptor.getLocation() != null, "Dataset location cannot be null");

      this.descriptor = descriptor;

      return this;
    }
  @Override
  public boolean delete(String name) {
    Preconditions.checkArgument(name != null, "Name cannot be null");

    logger.debug("Deleting dataset:{}", name);

    DatasetDescriptor descriptor;
    try {
      descriptor = metadataProvider.load(name);
      descriptor = addRepositoryUri(descriptor);
    } catch (DatasetNotFoundException ex) {
      return false;
    }

    boolean changed;
    try {
      // don't care about the return value here -- if it already doesn't exist
      // we still need to delete the data directory
      changed = metadataProvider.delete(name);
    } catch (MetadataProviderException ex) {
      throw new DatasetRepositoryException("Failed to delete descriptor for name:" + name, ex);
    }

    final Path dataLocation = new Path(descriptor.getLocation());
    final FileSystem fs = fsForPath(dataLocation, conf);

    try {
      if (fs.exists(dataLocation)) {
        if (fs.delete(dataLocation, true)) {
          changed = true;
        } else {
          throw new DatasetRepositoryException(
              "Failed to delete dataset name:" + name + " location:" + dataLocation);
        }
      }
    } catch (IOException e) {
      throw new DatasetRepositoryException(
          "Internal failure when removing location:" + dataLocation, e);
    }

    return changed;
  }
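A usage sketch of delete(), again assuming DatasetRepositories.open() and an illustrative repository URI: the first call removes both descriptor and data, while the second returns false because the descriptor can no longer be loaded.

import org.kitesdk.data.DatasetRepositories;
import org.kitesdk.data.DatasetRepository;

public class DeleteExample {
  public static void main(String[] args) {
    DatasetRepository repo = DatasetRepositories.open("repo:file:/tmp/example");

    boolean removed = repo.delete("users"); // true if metadata or data was removed
    boolean again = repo.delete("users");   // false: nothing left to load or delete
    System.out.println(removed + " " + again);
  }
}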
    public <E> FileSystemDataset<E> build() {
      Preconditions.checkState(this.name != null, "No dataset name defined");
      Preconditions.checkState(this.descriptor != null, "No dataset descriptor defined");
      Preconditions.checkState(
          (conf != null) || (fileSystem != null), "Configuration or FileSystem must be set");

      this.directory = new Path(descriptor.getLocation());

      if (fileSystem == null) {
        try {
          this.fileSystem = directory.getFileSystem(conf);
        } catch (IOException ex) {
          throw new DatasetException("Cannot access FileSystem", ex);
        }
      }

      Path absoluteDirectory = fileSystem.makeQualified(directory);
      return new FileSystemDataset<E>(
          fileSystem, absoluteDirectory, name, descriptor, partitionKey, partitionListener);
    }
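A sketch of driving the Builder directly. Unlike create(), the descriptor handed to the Builder must already carry a location, as the descriptor() setter above enforces. FileSystemDataset.Builder is an internal class, so this assumes the sketch lives in the same package as FileSystemDataset; the schema, path, and the location(String) setter on DatasetDescriptor.Builder are likewise assumptions for illustration.

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.kitesdk.data.DatasetDescriptor;

public class BuilderSketch {
  public static FileSystemDataset<Object> buildUsers(Configuration conf) {
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"User\",\"fields\":"
            + "[{\"name\":\"username\",\"type\":\"string\"}]}");

    // The location must be non-null for descriptor(), and build() resolves the
    // FileSystem from it when only a Configuration is supplied.
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(schema)
        .location("file:/tmp/example/users")
        .build();

    FileSystemDataset<Object> users = new FileSystemDataset.Builder()
        .name("users")
        .configuration(conf)
        .descriptor(descriptor)
        .build();
    return users;
  }
}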
 /**
  * Checks that the {@code existing} {@link DatasetDescriptor} can be replaced by {@code updated}.
  *
  * @param existing the current {@code DatasetDescriptor} for a dataset
  * @param updated a new {@code DatasetDescriptor} for the same dataset
  */
 public static void checkUpdate(DatasetDescriptor existing, DatasetDescriptor updated) {
   checkNotChanged("location", existing.getLocation(), updated.getLocation());
   checkCompatible(existing, updated);
 }
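A sketch of a change checkUpdate() should accept: neither descriptor sets a location, so checkNotChanged() passes, and the new schema only adds a field with a default value, the canonical backward-compatible evolution. The enclosing class of checkUpdate is not shown above, so the call is written as if statically imported from it; the schemas are illustrative.

import org.apache.avro.Schema;
import org.kitesdk.data.DatasetDescriptor;

public class CheckUpdateSketch {
  public static void main(String[] args) {
    Schema oldSchema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"User\",\"fields\":"
            + "[{\"name\":\"username\",\"type\":\"string\"}]}");
    Schema newSchema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
            + "{\"name\":\"username\",\"type\":\"string\"},"
            + "{\"name\":\"email\",\"type\":\"string\",\"default\":\"\"}]}");

    DatasetDescriptor existing = new DatasetDescriptor.Builder().schema(oldSchema).build();
    DatasetDescriptor updated = new DatasetDescriptor.Builder().schema(newSchema).build();

    // Assumes a static import of checkUpdate from its (unshown) enclosing class.
    checkUpdate(existing, updated); // expected to pass: location unchanged, schema compatible
  }
}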
  @Override
  public <E> Dataset<E> update(String name, DatasetDescriptor descriptor) {
    Preconditions.checkArgument(name != null, "Dataset name cannot be null");
    Preconditions.checkArgument(descriptor != null, "DatasetDescriptor cannot be null");

    DatasetDescriptor oldDescriptor = metadataProvider.load(name);

    // oldDescriptor is valid if load didn't throw DatasetNotFoundException

    if (!oldDescriptor.getFormat().equals(descriptor.getFormat())) {
      throw new DatasetRepositoryException(
          "Cannot change dataset format from "
              + oldDescriptor.getFormat()
              + " to "
              + descriptor.getFormat());
    }

    final URI oldLocation = oldDescriptor.getLocation();
    if ((oldLocation != null) && !(oldLocation.equals(descriptor.getLocation()))) {
      throw new DatasetRepositoryException("Cannot change the dataset's location");
    }

    if (oldDescriptor.isPartitioned() != descriptor.isPartitioned()) {
      throw new DatasetRepositoryException(
          "Cannot change an unpartitioned dataset to partitioned or vice versa.");
    } else if (oldDescriptor.isPartitioned()
        && descriptor.isPartitioned()
        && !oldDescriptor.getPartitionStrategy().equals(descriptor.getPartitionStrategy())) {
      throw new DatasetRepositoryException(
          "Cannot change partition strategy from "
              + oldDescriptor.getPartitionStrategy()
              + " to "
              + descriptor.getPartitionStrategy());
    }

    // check can read records written with old schema using new schema
    final Schema oldSchema = oldDescriptor.getSchema();
    final Schema newSchema = descriptor.getSchema();
    if (!SchemaValidationUtil.canRead(oldSchema, newSchema)) {
      throw new IncompatibleSchemaException(
          "New schema cannot read data written using old schema. New schema: "
              + newSchema.toString(true)
              + "\nOld schema: "
              + oldSchema.toString(true));
    }

    DatasetDescriptor updatedDescriptor = metadataProvider.update(name, descriptor);
    updatedDescriptor = addRepositoryUri(updatedDescriptor);

    logger.debug(
        "Updated dataset:{} schema:{} datasetPath:{}",
        new Object[] {
          name, updatedDescriptor.getSchema(), updatedDescriptor.getLocation().toString()
        });

    return new FileSystemDataset.Builder()
        .name(name)
        .configuration(conf)
        .descriptor(updatedDescriptor)
        .partitionKey(
            updatedDescriptor.isPartitioned()
                ? org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey()
                : null)
        .partitionListener(getPartitionListener())
        .build();
  }
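A usage sketch of update() that satisfies the checks above: the current descriptor is copied (assuming DatasetDescriptor.Builder has a copy constructor) so format, location, and partitioning stay the same, and only the schema evolves by adding a field with a default value, which keeps the new schema able to read records written with the old one. The repository URI, dataset name, and schema are illustrative.

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetRepositories;
import org.kitesdk.data.DatasetRepository;

public class UpdateExample {
  public static void main(String[] args) {
    DatasetRepository repo = DatasetRepositories.open("repo:file:/tmp/example");

    // Load the current descriptor so location, format, and partitioning carry over.
    DatasetDescriptor current = repo.load("users").getDescriptor();

    // The evolved schema adds a field with a default: a backward-compatible change.
    Schema evolved = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
            + "{\"name\":\"username\",\"type\":\"string\"},"
            + "{\"name\":\"email\",\"type\":\"string\",\"default\":\"\"}]}");

    Dataset<GenericRecord> users = repo.update(
        "users", new DatasetDescriptor.Builder(current).schema(evolved).build());
  }
}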