@Override
  public <E> Dataset<E> create(String name, DatasetDescriptor descriptor) {
    // Validate the arguments; the caller must not pick a location, because the
    // metadata provider is expected to assign one.
    Preconditions.checkArgument(name != null, "Name can not be null");
    Preconditions.checkArgument(descriptor != null, "Descriptor can not be null");
    Preconditions.checkArgument(
        descriptor.getLocation() == null,
        "Descriptor location cannot be set; it is assigned by the MetadataProvider");

    // Register the metadata first, then decorate the stored descriptor.
    final DatasetDescriptor created =
        addRepositoryUri(metadataProvider.create(name, descriptor));

    final URI dataUri = created.getLocation();
    if (dataUri == null) {
      throw new DatasetRepositoryException(
          "[BUG] MetadataProvider did not assign a location to dataset:" + name);
    }

    // Make sure the data location is present before handing out the dataset.
    ensureExists(created, conf);

    logger.debug(
        "Created dataset:{} schema:{} datasetPath:{}",
        new Object[] {name, created.getSchema(), dataUri.toString()});

    final FileSystemDataset.Builder builder =
        new FileSystemDataset.Builder().name(name).configuration(conf).descriptor(created);

    // Only partitioned datasets receive a (fresh) partition key.
    return builder
        .partitionKey(
            created.isPartitioned()
                ? org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey()
                : null)
        .partitionListener(getPartitionListener())
        .build();
  }
  /**
   * Loads the dataset registered under {@code name}.
   *
   * <p>The descriptor is read from the metadata provider, passed through
   * {@code addRepositoryUri}, and used to build a {@code FileSystemDataset}.
   *
   * @param name the dataset name; must not be null
   * @return the loaded dataset
   * @throws IllegalArgumentException if {@code name} is null
   */
  @Override
  public <E> Dataset<E> load(String name) {
    Preconditions.checkArgument(name != null, "Name can not be null");

    logger.debug("Loading dataset:{}", name);

    final DatasetDescriptor loaded = addRepositoryUri(metadataProvider.load(name));

    final FileSystemDataset.Builder builder =
        new FileSystemDataset.Builder().name(name).configuration(conf).descriptor(loaded);

    // Only partitioned datasets receive a (fresh) partition key.
    final FileSystemDataset<E> dataset =
        builder
            .partitionKey(
                loaded.isPartitioned()
                    ? org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey()
                    : null)
            .partitionListener(getPartitionListener())
            .build();

    logger.debug("Loaded dataset:{}", dataset);

    return dataset;
  }
  /**
   * Creates a writer for a view of a partitioned dataset.
   *
   * <p>The number of cached writers is the partition strategy's cardinality, capped at 10.
   *
   * @param view the view to write to; its dataset must be partitioned
   * @throws IllegalArgumentException if the view's dataset is not partitioned
   */
  public PartitionedDatasetWriter(FileSystemView<E> view) {
    final DatasetDescriptor viewDescriptor = view.getDataset().getDescriptor();
    Preconditions.checkArgument(
        viewDescriptor.isPartitioned(), "Dataset " + view.getDataset() + " is not partitioned");

    this.view = view;
    this.state = ReaderWriterState.NEW;
    this.partitionStrategy = viewDescriptor.getPartitionStrategy();

    // never keep more than 10 writers, and no more than there can be partitions
    final int cardinality = partitionStrategy.getCardinality();
    this.maxWriters = cardinality < 10 ? cardinality : 10;

    this.reusedKey = new StorageKey(partitionStrategy);
  }
Example #4
0
 private GetStorageKey(View<E> view) {
   DatasetDescriptor descriptor = view.getDataset().getDescriptor();
   // get serializable versions of transient objects
   this.strategyString = descriptor.getPartitionStrategy().toString(false /* no white space */);
   this.schemaString = descriptor.getSchema().toString(false /* no white space */);
   this.type = view.getType();
   if (view instanceof AbstractRefinableView) {
     this.constraints = ((AbstractRefinableView) view).getConstraints().toQueryMap();
   } else {
     this.constraints = null;
   }
 }
Example #5
0
 /**
  * Restructures {@code collection} so that it can be written efficiently to {@code view}.
  *
  * <p>For a partitioned dataset, entities are keyed by their storage key and grouped, so that
  * all entities destined for one partition are handled by the same writer. When
  * {@code numWriters} is positive it is passed to the grouping step; otherwise the default
  * grouping is used.
  *
  * <p>For an unpartitioned dataset, this delegates to {@code partition(collection, numWriters)}.
  *
  * @param collection a collection of entities
  * @param view a {@link View} of a dataset to partition the collection for
  * @param numWriters the number of writers that should be used
  * @param <E> the type of entities in the collection and underlying dataset
  * @return an equivalent collection of entities partitioned for the view
  * @see #partition(PCollection, View)
  * @since 0.16.0
  */
 public static <E> PCollection<E> partition(
     PCollection<E> collection, View<E> view, int numWriters) {
   final DatasetDescriptor viewDescriptor = view.getDataset().getDescriptor();
   if (!viewDescriptor.isPartitioned()) {
     return partition(collection, numWriters);
   }

   // key every entity by its storage key, then group on that key
   final GetStorageKey<E> keyFn = new GetStorageKey<E>(view);
   final PTable<GenericData.Record, E> keyed =
       collection.by(keyFn, Avros.generics(keyFn.schema()));

   final PGroupedTable<GenericData.Record, E> groupedByKey;
   if (numWriters > 0) {
     groupedByKey = keyed.groupByKey(numWriters);
   } else {
     groupedByKey = keyed.groupByKey();
   }

   return groupedByKey.ungroup().values();
 }
Example #6
0
  /**
   * Creates a writer in the {@code NEW} state for the given directory.
   *
   * <p>The writer works on its own copy of the file system's configuration, into which every
   * custom property of the descriptor is copied.
   *
   * @param fs the file system to write to; must not be null
   * @param path the destination directory; must not be null
   * @param descriptor the descriptor of the dataset being written; must not be null
   * @throws NullPointerException if any argument is null
   */
  private FileSystemWriter(FileSystem fs, Path path, DatasetDescriptor descriptor) {
    this.fs = Preconditions.checkNotNull(fs, "File system is not defined");
    this.directory = Preconditions.checkNotNull(path, "Destination directory is not defined");
    this.descriptor = Preconditions.checkNotNull(descriptor, "Descriptor is not defined");
    this.state = ReaderWriterState.NEW;

    // copy file format settings from custom properties to a private Configuration
    this.conf = new Configuration(fs.getConf());
    for (String key : descriptor.listProperties()) {
      conf.set(key, descriptor.getProperty(key));
    }
  }
Example #7
0
 /**
  * Picks the partitioned writer implementation for the view's format.
  *
  * <p>Avro and CSV use the incremental writer. Parquet uses it only when the
  * non-durable Parquet property is disabled. Everything else gets the non-durable writer.
  *
  * @param view the view to write to
  * @return a new partitioned writer for {@code view}
  */
 static <E> PartitionedDatasetWriter<E, ?> newWriter(FileSystemView<E> view) {
   final DatasetDescriptor viewDescriptor = view.getDataset().getDescriptor();
   final Format fmt = viewDescriptor.getFormat();

   final boolean incremental;
   if (Formats.PARQUET.equals(fmt)) {
     // by default, Parquet is not durable
     incremental =
         DescriptorUtil.isDisabled(FileSystemProperties.NON_DURABLE_PARQUET_PROP, viewDescriptor);
   } else {
     incremental = Formats.AVRO.equals(fmt) || Formats.CSV.equals(fmt);
   }

   if (incremental) {
     return new IncrementalPartitionedDatasetWriter<E>(view);
   }
   return new NonDurablePartitionedDatasetWriter<E>(view);
 }
  /**
   * Creates the location of {@code descriptor} if it does not exist yet.
   *
   * @param descriptor A DatasetDescriptor whose location must be set
   * @param conf A Configuration used to resolve the file system
   * @throws IllegalArgumentException if the descriptor has no location
   * @throws DatasetRepositoryException if the location cannot be checked or created
   */
  static void ensureExists(DatasetDescriptor descriptor, Configuration conf) {
    Preconditions.checkArgument(
        descriptor.getLocation() != null,
        "Cannot get FileSystem for a descriptor with no location");

    final Path location = new Path(descriptor.getLocation());
    final FileSystem fileSystem = fsForPath(location, conf);

    try {
      if (fileSystem.exists(location)) {
        // nothing to do
        return;
      }
      fileSystem.mkdirs(location);
    } catch (IOException ex) {
      throw new DatasetRepositoryException("Cannot access data location", ex);
    }
  }
Example #9
0
  /**
   * Opens this writer: creates the target directory, chooses the final and temporary file
   * paths, and opens an appender on the temporary path.
   *
   * <p>The writer must be in the {@code NEW} state. On success it moves to {@code OPEN}. If
   * creating the directory or opening the appender fails with an {@link IOException}, it moves
   * to {@code ERROR} and a {@code DatasetIOException} is thrown.
   *
   * @throws IllegalStateException if the writer is not in the {@code NEW} state
   */
  @Override
  public final void initialize() {
    Preconditions.checkState(
        state.equals(ReaderWriterState.NEW), "Unable to open a writer from state:%s", state);

    // ensure the directory exists
    try {
      fs.mkdirs(directory);
    } catch (IOException ex) {
      this.state = ReaderWriterState.ERROR;
      throw new DatasetIOException("Failed to create path " + directory, ex);
    }

    // initialize paths: the appender is opened on tempPath, which is derived from finalPath
    this.finalPath = new Path(directory, uniqueFilename(descriptor.getFormat()));
    this.tempPath = tempFilename(finalPath);
    this.appender = newAppender(tempPath);

    try {
      appender.open();
    } catch (IOException e) {
      this.state = ReaderWriterState.ERROR;
      throw new DatasetIOException("Failed to open appender " + appender, e);
    }

    // reset the counter for the newly opened appender
    this.count = 0;

    LOG.debug("Opened appender {} for {}", appender, finalPath);

    // only mark the writer open once every step above has succeeded
    this.state = ReaderWriterState.OPEN;
  }
Example #10
0
    /**
     * Sets the descriptor of the dataset being built.
     *
     * @param descriptor a descriptor with a non-null location
     * @return this builder, for chaining
     * @throws IllegalArgumentException if the descriptor's location is null
     */
    public Builder descriptor(DatasetDescriptor descriptor) {
      final boolean hasLocation = descriptor.getLocation() != null;
      Preconditions.checkArgument(hasLocation, "Dataset location cannot be null");
      this.descriptor = descriptor;
      return this;
    }
Example #11
0
 /**
  * Creates the appender matching the descriptor's format for the given temporary path.
  *
  * <p>Parquet gets the durable appender only when the non-durable Parquet property is
  * disabled. CSV is accepted only when the allow-CSV property is enabled. For any other
  * format the writer moves to {@code ERROR} and an {@code UnknownFormatException} is thrown.
  *
  * @param temp the temporary file the appender writes to
  * @return a new, unopened appender
  */
 @VisibleForTesting
 @SuppressWarnings("unchecked")
 <E> FileAppender<E> newAppender(Path temp) {
   final Format fmt = descriptor.getFormat();

   if (Formats.PARQUET.equals(fmt)) {
     // by default, Parquet is not durable
     final boolean durable =
         DescriptorUtil.isDisabled(FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor);
     if (durable) {
       return (FileAppender<E>)
           new DurableParquetAppender(
               fs, temp, descriptor.getSchema(), conf, descriptor.getCompressionType());
     }
     return (FileAppender<E>)
         new ParquetAppender(
             fs, temp, descriptor.getSchema(), conf, descriptor.getCompressionType());
   }

   if (Formats.AVRO.equals(fmt)) {
     return new AvroAppender<E>(fs, temp, descriptor.getSchema(), descriptor.getCompressionType());
   }

   if (Formats.CSV.equals(fmt)
       && DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor)) {
     return new CSVAppender<E>(fs, temp, descriptor);
   }

   // no appender is available for this format
   this.state = ReaderWriterState.ERROR;
   throw new UnknownFormatException("Unknown format " + descriptor);
 }
Example #12
0
  /**
   * Opens this writer: validates the format, creates the target directory, chooses the final
   * and temporary file paths, and opens an appender on the temporary path.
   *
   * <p>The writer must be in the {@code NEW} state. On success it moves to {@code OPEN}. A
   * failure in directory creation, path initialization or appender opening moves it to
   * {@code ERROR}: runtime failures are rethrown as {@code DatasetOperationException} and
   * {@link IOException}s as {@code DatasetIOException}. A failed format check happens before
   * any of that and leaves the state unchanged.
   *
   * @throws IllegalStateException if the writer is not in the {@code NEW} state
   */
  @Override
  public final void initialize() {
    Preconditions.checkState(
        state.equals(ReaderWriterState.NEW), "Unable to open a writer from state:%s", state);

    ValidationException.check(
        isSupportedFormat(descriptor), "Not a supported format: %s", descriptor.getFormat());

    // ensure the directory exists
    try {
      fs.mkdirs(directory);
    } catch (RuntimeException e) {
      this.state = ReaderWriterState.ERROR;
      throw new DatasetOperationException(e, "Failed to create path %s", directory);
    } catch (IOException ex) {
      this.state = ReaderWriterState.ERROR;
      throw new DatasetIOException("Failed to create path " + directory, ex);
    }

    // initialize paths
    try {
      this.finalPath = new Path(directory, uniqueFilename(descriptor.getFormat()));
      this.tempPath = tempFilename(finalPath);
    } catch (RuntimeException e) {
      this.state = ReaderWriterState.ERROR;
      throw new DatasetOperationException(e, "Failed to initialize file paths under %s", directory);
    }

    // NOTE(review): if newAppender itself throws, `appender` in the messages below still holds
    // its previous value (presumably null for a NEW writer) -- confirm this is acceptable
    try {
      this.appender = newAppender(tempPath);
      appender.open();
    } catch (RuntimeException e) {
      this.state = ReaderWriterState.ERROR;
      throw new DatasetOperationException(e, "Failed to open appender %s", appender);
    } catch (IOException e) {
      this.state = ReaderWriterState.ERROR;
      throw new DatasetIOException("Failed to open appender " + appender, e);
    }

    // reset the counter for the newly opened appender
    this.count = 0;

    LOG.info("Opened output appender {} for {}", appender, finalPath);

    // only mark the writer open once every step above has succeeded
    this.state = ReaderWriterState.OPEN;
  }
Example #13
0
  /**
   * Creates a dataset backed by {@code directory} on {@code fileSystem}.
   *
   * <p>The partition strategy is taken from the descriptor when it is partitioned and is
   * {@code null} otherwise. An unbounded view over this dataset is created as well.
   *
   * @param fileSystem the file system holding the data
   * @param directory the dataset's directory
   * @param name the dataset name
   * @param descriptor the dataset descriptor
   * @param partitionListener an optional listener for partition events
   */
  FileSystemDataset(
      FileSystem fileSystem,
      Path directory,
      String name,
      DatasetDescriptor descriptor,
      @Nullable PartitionListener partitionListener) {

    this.name = name;
    this.fileSystem = fileSystem;
    this.directory = directory;
    this.descriptor = descriptor;
    this.partitionListener = partitionListener;

    if (descriptor.isPartitioned()) {
      this.partitionStrategy = descriptor.getPartitionStrategy();
    } else {
      this.partitionStrategy = null;
    }

    this.convert = new PathConversion();

    // remove this.partitionKey for 0.13.0
    this.partitionKey = null;

    // created last so the view is handed a dataset whose other fields are already set
    this.unbounded = new FileSystemView<E>(this);
  }
  /**
   * Deletes the dataset registered under {@code name}: first its metadata, then its data
   * location (recursively).
   *
   * @param name the dataset name; must not be null
   * @return {@code false} if no such dataset is registered; otherwise {@code true} if the
   *     metadata provider reported a deletion or the data location was removed
   * @throws IllegalArgumentException if {@code name} is null
   * @throws DatasetRepositoryException if the metadata or the data location cannot be deleted
   */
  @Override
  public boolean delete(String name) {
    Preconditions.checkArgument(name != null, "Name can not be null");

    logger.debug("Deleting dataset:{}", name);

    // the descriptor is needed to find the data location, so load it before deleting metadata
    DatasetDescriptor descriptor;
    try {
      descriptor = metadataProvider.load(name);
      descriptor = addRepositoryUri(descriptor);
    } catch (DatasetNotFoundException ex) {
      return false;
    }

    boolean changed;
    try {
      // even if the metadata was already gone, the data directory still has to be
      // deleted below, so a false result here does not end the operation
      changed = metadataProvider.delete(name);
    } catch (MetadataProviderException ex) {
      throw new DatasetRepositoryException("Failed to delete descriptor for name:" + name, ex);
    }

    final Path dataLocation = new Path(descriptor.getLocation());
    final FileSystem fs = fsForPath(dataLocation, conf);

    try {
      if (fs.exists(dataLocation)) {
        if (fs.delete(dataLocation, true)) {
          changed = true;
        } else {
          throw new DatasetRepositoryException(
              "Failed to delete dataset name:" + name + " location:" + dataLocation);
        }
      }
    } catch (IOException e) {
      // keep the underlying I/O failure as the cause so it is not lost to callers and logs
      throw new DatasetRepositoryException(
          "Internal failure when removing location:" + dataLocation, e);
    }

    return changed;
  }
Example #15
0
  /**
   * Opens this partitioned writer by validating the dataset format and building the cache of
   * per-partition writers.
   *
   * <p>The cache holds at most {@code maxWriters} entries and is given a
   * {@code DatasetWriterCloser} as its removal listener. The writer must be in the
   * {@code NEW} state and ends in the {@code OPEN} state.
   *
   * @throws IllegalStateException if the writer is not in the {@code NEW} state
   */
  @Override
  public void initialize() {
    Preconditions.checkState(
        state.equals(ReaderWriterState.NEW), "Unable to open a writer from state:%s", state);

    final DatasetDescriptor viewDescriptor = view.getDataset().getDescriptor();
    final boolean supported = FileSystemWriter.isSupportedFormat(viewDescriptor);
    ValidationException.check(supported, "Not a supported format: %s", viewDescriptor.getFormat());

    LOG.debug("Opening partitioned dataset writer w/strategy:{}", partitionStrategy);

    this.cachedWriters =
        CacheBuilder.newBuilder()
            .maximumSize(maxWriters)
            .removalListener(new DatasetWriterCloser<E>())
            .build(createCacheLoader());

    this.state = ReaderWriterState.OPEN;
  }
Example #16
0
 /**
  * Creates the appender matching the descriptor's format for the given temporary path.
  *
  * <p>Parquet gets the non-durable appender only when the non-durable Parquet property is
  * enabled. CSV is accepted only when the allow-CSV property is enabled. For any other format
  * the writer moves to {@code ERROR} and a {@code DatasetWriterException} is thrown.
  *
  * @param temp the temporary file the appender writes to
  * @return a new appender
  */
 @SuppressWarnings("unchecked")
 private <E> FileAppender<E> newAppender(Path temp) {
   final Format fmt = descriptor.getFormat();

   if (Formats.PARQUET.equals(fmt)) {
     // by default, guarantee durability with the more costly writer
     final boolean nonDurable =
         DescriptorUtil.isEnabled(FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor);
     if (nonDurable) {
       return (FileAppender<E>) new ParquetAppender(fs, temp, descriptor.getSchema(), conf, true);
     }
     return (FileAppender<E>)
         new DurableParquetAppender(fs, temp, descriptor.getSchema(), conf, true);
   }

   if (Formats.AVRO.equals(fmt)) {
     return new AvroAppender<E>(fs, temp, descriptor.getSchema(), true);
   }

   if (Formats.CSV.equals(fmt)
       && DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor)) {
     return new CSVAppender<E>(fs, temp, descriptor);
   }

   // no appender is available for this format
   this.state = ReaderWriterState.ERROR;
   throw new DatasetWriterException("Unknown format " + descriptor);
 }
Example #17
0
  /**
   * Creates a writer for a view of a partitioned dataset.
   *
   * <p>The writer cache size starts at {@code DEFAULT_WRITER_CACHE_SIZE}. If the descriptor
   * sets the writer-cache-size property, that value is used when it parses as an integer; a
   * non-integer value is logged and the default is kept. Without the property, the size is
   * capped by the partition strategy's cardinality when that cardinality is known.
   *
   * @param view the view to write to; its dataset must be partitioned
   * @throws IllegalArgumentException if the view's dataset is not partitioned
   */
  private PartitionedDatasetWriter(FileSystemView<E> view) {
    final DatasetDescriptor viewDescriptor = view.getDataset().getDescriptor();
    Preconditions.checkArgument(
        viewDescriptor.isPartitioned(), "Dataset " + view.getDataset() + " is not partitioned");

    this.view = view;
    this.partitionStrategy = viewDescriptor.getPartitionStrategy();

    int cacheSize = DEFAULT_WRITER_CACHE_SIZE;
    if (viewDescriptor.hasProperty(FileSystemProperties.WRITER_CACHE_SIZE_PROP)) {
      // an explicit setting wins over the cardinality-based cap
      final String rawSize =
          viewDescriptor.getProperty(FileSystemProperties.WRITER_CACHE_SIZE_PROP);
      try {
        cacheSize = Integer.parseInt(rawSize);
      } catch (NumberFormatException e) {
        LOG.warn(
            "Not an integer: " + FileSystemProperties.WRITER_CACHE_SIZE_PROP + "=" + rawSize);
      }
    } else {
      final int cardinality = partitionStrategy.getCardinality();
      if (cardinality != FieldPartitioner.UNKNOWN_CARDINALITY) {
        cacheSize = Math.min(cacheSize, cardinality);
      }
    }
    this.maxWriters = cacheSize;

    this.state = ReaderWriterState.NEW;
    this.reusedKey = new StorageKey(partitionStrategy);
    this.accessor = view.getAccessor();
    this.provided = view.getProvidedValues();
  }
Example #18
0
  /**
   * Precondition-style validation that the DatasetDescriptor is compatible.
   *
   * <p>The schema is validated with {@code checkSchema}. For a partitioned descriptor, every
   * partition name must pass {@code isCompatibleName} and must not repeat a schema field
   * name or an earlier partition name.
   *
   * @param descriptor a {@link DatasetDescriptor}
   * @throws NullPointerException if {@code descriptor} is null
   * @throws IllegalStateException if a partition name is incompatible or duplicated
   */
  public static void checkDescriptor(DatasetDescriptor descriptor) {
    Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");

    final Schema recordSchema = descriptor.getSchema();
    checkSchema(recordSchema);

    if (!descriptor.isPartitioned()) {
      return;
    }

    // marked as [BUG] because this is checked in DatasetDescriptor
    Preconditions.checkArgument(
        recordSchema.getType() == Schema.Type.RECORD,
        "[BUG] Partitioned datasets must have record schemas");

    // start with the data field names, then add partition names as they are seen
    final Set<String> seenNames = Sets.newHashSet();
    for (Schema.Field dataField : recordSchema.getFields()) {
      seenNames.add(dataField.name());
    }

    final List<String> badNames = Lists.newArrayList();
    final List<String> repeatedNames = Lists.newArrayList();
    for (FieldPartitioner partitioner : descriptor.getPartitionStrategy().getFieldPartitioners()) {
      final String partitionName = partitioner.getName();
      if (!isCompatibleName(partitionName)) {
        badNames.add(partitionName);
        continue;
      }
      // add() returns false when the name is already taken
      if (!seenNames.add(partitionName)) {
        repeatedNames.add(partitionName);
      }
    }

    Preconditions.checkState(
        badNames.isEmpty(),
        "Hive incompatible: partition names are not alphanumeric (plus '_'): %s",
        Joiner.on(", ").join(badNames));
    Preconditions.checkState(
        repeatedNames.isEmpty(),
        "Hive incompatible: partition names duplicate data fields: %s",
        Joiner.on(", ").join(repeatedNames));
  }
Example #19
0
 /**
  * Picks the writer implementation for the descriptor's format.
  *
  * <p>Avro and CSV use the incremental writer. Parquet uses it only when the non-durable
  * Parquet property is disabled. Everything else gets a plain {@code FileSystemWriter}.
  *
  * @param fs the file system to write to
  * @param path the destination directory
  * @param descriptor the descriptor of the dataset being written
  * @return a new writer
  */
 static <E> FileSystemWriter<E> newWriter(FileSystem fs, Path path, DatasetDescriptor descriptor) {
   final Format fmt = descriptor.getFormat();

   final boolean incremental;
   if (Formats.PARQUET.equals(fmt)) {
     // by default, Parquet is not durable
     incremental =
         DescriptorUtil.isDisabled(FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor);
   } else {
     incremental = Formats.AVRO.equals(fmt) || Formats.CSV.equals(fmt);
   }

   if (incremental) {
     return new IncrementalWriter<E>(fs, path, descriptor);
   }
   return new FileSystemWriter<E>(fs, path, descriptor);
 }
Example #20
0
  /**
   * Checks that the {@code existing} {@link DatasetDescriptor} is compatible with {@code test}.
   *
   * <p>The format, the partitioned flag and (for partitioned datasets) the partition strategy
   * must be unchanged, and the schema of {@code test} must be able to read data written with
   * the schema of {@code existing}.
   *
   * @param existing the current {@code DatasetDescriptor} for a dataset
   * @param test a new {@code DatasetDescriptor} for the same dataset
   * @throws IncompatibleSchemaException if the new schema cannot read the existing data
   */
  public static void checkCompatible(DatasetDescriptor existing, DatasetDescriptor test) {
    checkNotChanged("format", existing.getFormat(), test.getFormat());
    checkNotChanged("partitioning", existing.isPartitioned(), test.isPartitioned());

    // the strategies only need comparing when there is partitioning to compare
    if (existing.isPartitioned()) {
      checkNotChanged(
          "partition strategy", existing.getPartitionStrategy(), test.getPartitionStrategy());
    }

    // check can read records written with old schema using new schema
    final Schema writerSchema = existing.getSchema();
    final Schema readerSchema = test.getSchema();
    if (SchemaValidationUtil.canRead(writerSchema, readerSchema)) {
      return;
    }

    throw new IncompatibleSchemaException(
        "Schema cannot read data written using existing schema. Schema: "
            + readerSchema.toString(true)
            + "\nExisting schema: "
            + writerSchema.toString(true));
  }
Example #21
0
  /**
   * Returns the partition of this dataset identified by {@code key}.
   *
   * <p>If the partition directory is missing it is created when {@code allowCreate} is true
   * (notifying the partition listener, if any); otherwise {@code null} is returned.
   *
   * @param key the key of the partition
   * @param allowCreate whether a missing partition directory may be created
   * @return a dataset for the partition, or {@code null} if it is missing and may not be created
   * @throws IllegalStateException if this dataset is not partitioned
   * @throws DatasetException if the partition directory cannot be checked or created
   */
  @Override
  @Nullable
  @Deprecated
  public Dataset<E> getPartition(PartitionKey key, boolean allowCreate) {
    Preconditions.checkState(
        descriptor.isPartitioned(),
        "Attempt to get a partition on a non-partitioned dataset (name:%s)",
        name);

    logger.debug("Loading partition for key {}, allowCreate:{}", new Object[] {key, allowCreate});

    final Path partitionDirectory = fileSystem.makeQualified(toDirectoryName(directory, key));

    try {
      if (!fileSystem.exists(partitionDirectory)) {
        if (!allowCreate) {
          return null;
        }
        fileSystem.mkdirs(partitionDirectory);
        if (partitionListener != null) {
          partitionListener.partitionAdded(name, toRelativeDirectory(key).toString());
        }
      }
    } catch (IOException e) {
      throw new DatasetException(
          "Unable to locate or create dataset partition directory " + partitionDirectory, e);
    }

    // the partition's own strategy is the sub-strategy at the key's depth
    final PartitionStrategy remainingStrategy =
        Accessor.getDefault().getSubpartitionStrategy(partitionStrategy, key.getLength());

    final DatasetDescriptor partitionDescriptor =
        new DatasetDescriptor.Builder(descriptor)
            .location(partitionDirectory)
            .partitionStrategy(remainingStrategy)
            .build();

    return new FileSystemDataset.Builder()
        .name(name)
        .fileSystem(fileSystem)
        .descriptor(partitionDescriptor)
        .partitionKey(key)
        .partitionListener(partitionListener)
        .build();
  }
Example #22
0
    /**
     * Builds the dataset from the values collected so far.
     *
     * <p>The directory is taken from the descriptor's location. When no file system was set,
     * it is resolved from that directory and the configuration.
     *
     * @return a new {@code FileSystemDataset}
     * @throws IllegalStateException if the name or descriptor is missing, or if neither a
     *     configuration nor a file system was set
     * @throws DatasetException if the file system cannot be resolved
     */
    public <E> FileSystemDataset<E> build() {
      Preconditions.checkState(this.name != null, "No dataset name defined");
      Preconditions.checkState(this.descriptor != null, "No dataset descriptor defined");
      Preconditions.checkState(
          !(conf == null && fileSystem == null), "Configuration or FileSystem must be set");

      this.directory = new Path(descriptor.getLocation());

      // fall back to the file system that owns the directory
      if (fileSystem == null) {
        try {
          this.fileSystem = directory.getFileSystem(conf);
        } catch (IOException ex) {
          throw new DatasetException("Cannot access FileSystem", ex);
        }
      }

      return new FileSystemDataset<E>(
          fileSystem,
          fileSystem.makeQualified(directory),
          name,
          descriptor,
          partitionKey,
          partitionListener);
    }
Example #23
0
  /**
   * Lists the first-level partitions of this dataset, one dataset per non-hidden entry of the
   * dataset directory.
   *
   * @return the partitions found under the dataset directory
   * @throws IllegalStateException if this dataset is not partitioned
   * @throws DatasetException if the dataset directory cannot be listed
   */
  @Override
  @Deprecated
  public Iterable<Dataset<E>> getPartitions() {
    Preconditions.checkState(
        descriptor.isPartitioned(),
        "Attempt to get partitions on a non-partitioned dataset (name:%s)",
        name);

    final FileStatus[] entries;
    try {
      entries = fileSystem.listStatus(directory, PathFilters.notHidden());
    } catch (IOException e) {
      throw new DatasetException(
          "Unable to list partition directory for directory " + directory, e);
    }

    final List<Dataset<E>> result = Lists.newArrayList();
    for (FileStatus entry : entries) {
      final Path partitionPath = fileSystem.makeQualified(entry.getPath());
      final PartitionKey partitionKeyForPath = fromDirectoryName(partitionPath);

      // each listed directory is one level below this dataset
      final PartitionStrategy remainingStrategy =
          Accessor.getDefault().getSubpartitionStrategy(partitionStrategy, 1);

      final DatasetDescriptor partitionDescriptor =
          new DatasetDescriptor.Builder(descriptor)
              .location(partitionPath)
              .partitionStrategy(remainingStrategy)
              .build();

      result.add(
          new FileSystemDataset.Builder()
              .name(name)
              .fileSystem(fileSystem)
              .descriptor(partitionDescriptor)
              .partitionKey(partitionKeyForPath)
              .partitionListener(partitionListener)
              .<E>build());
    }

    return result;
  }
Example #24
0
  /**
   * Recursively deletes the directory of the partition identified by {@code key}.
   *
   * @param key the key of the partition to drop; must not be null
   * @throws IllegalStateException if this dataset is not partitioned
   * @throws IllegalArgumentException if {@code key} is null
   * @throws DatasetException if the file system reports that nothing was deleted, or if the
   *     delete fails with an I/O error
   */
  @Override
  @Deprecated
  public void dropPartition(PartitionKey key) {
    Preconditions.checkState(
        descriptor.isPartitioned(),
        "Attempt to drop a partition on a non-partitioned dataset (name:%s)",
        name);
    Preconditions.checkArgument(key != null, "Partition key may not be null");

    logger.debug("Dropping partition with key:{} dataset:{}", key, name);

    final Path partitionDirectory = toDirectoryName(directory, key);

    final boolean deleted;
    try {
      deleted = fileSystem.delete(partitionDirectory, true);
    } catch (IOException e) {
      throw new DatasetException(
          "Unable to locate or drop dataset partition directory " + partitionDirectory, e);
    }

    if (!deleted) {
      throw new DatasetException(
          "Partition directory " + partitionDirectory + " for key " + key + " does not exist");
    }
  }
Example #25
0
 /**
  * Verifies that {@code updated} may replace {@code existing} as the descriptor of a dataset.
  *
  * <p>The location must be unchanged, and the pair must also pass
  * {@code checkCompatible(existing, updated)}.
  *
  * @param existing the current {@code DatasetDescriptor} for a dataset
  * @param updated a new {@code DatasetDescriptor} for the same dataset
  */
 public static void checkUpdate(DatasetDescriptor existing, DatasetDescriptor updated) {
   // a dataset's data may not move as part of an update
   checkNotChanged(
       "location",
       existing.getLocation(),
       updated.getLocation());

   // everything else follows the general compatibility rules
   checkCompatible(existing, updated);
 }
Example #26
0
 /**
  * Tells whether this writer can handle the descriptor's format.
  *
  * @param descriptor the descriptor to inspect
  * @return true if the format is in {@code SUPPORTED_FORMATS}, or if it is CSV and the
  *     allow-CSV property is enabled on the descriptor
  */
 static boolean isSupportedFormat(DatasetDescriptor descriptor) {
   final Format fmt = descriptor.getFormat();
   if (SUPPORTED_FORMATS.contains(fmt)) {
     return true;
   }
   // CSV is opt-in
   return Formats.CSV.equals(fmt)
       && DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor);
 }
  /**
   * Replaces the descriptor of the dataset registered under {@code name}.
   *
   * <p>The update is rejected if it changes the format, the location, whether the dataset is
   * partitioned, or the partition strategy, or if the new schema cannot read data written with
   * the old schema.
   *
   * @param name the dataset name; must not be null
   * @param descriptor the new descriptor; must not be null
   * @return the dataset built from the updated descriptor
   * @throws IllegalArgumentException if {@code name} or {@code descriptor} is null
   * @throws DatasetRepositoryException if format, location or partitioning would change
   * @throws IncompatibleSchemaException if the new schema cannot read existing data
   */
  @Override
  public <E> Dataset<E> update(String name, DatasetDescriptor descriptor) {
    Preconditions.checkArgument(name != null, "Dataset name cannot be null");
    Preconditions.checkArgument(descriptor != null, "DatasetDescriptor cannot be null");

    // load is relied on to throw when the dataset does not exist, so oldDescriptor is
    // valid from here on
    DatasetDescriptor oldDescriptor = metadataProvider.load(name);

    if (!oldDescriptor.getFormat().equals(descriptor.getFormat())) {
      throw new DatasetRepositoryException(
          "Cannot change dataset format from "
              + oldDescriptor.getFormat()
              + " to "
              + descriptor.getFormat());
    }

    final URI oldLocation = oldDescriptor.getLocation();
    if ((oldLocation != null) && !(oldLocation.equals(descriptor.getLocation()))) {
      throw new DatasetRepositoryException("Cannot change the dataset's location");
    }

    if (oldDescriptor.isPartitioned() != descriptor.isPartitioned()) {
      throw new DatasetRepositoryException(
          "Cannot change an unpartitioned dataset to " + " partitioned or vice versa.");
    } else if (oldDescriptor.isPartitioned()
        && !oldDescriptor.getPartitionStrategy().equals(descriptor.getPartitionStrategy())) {
      // both descriptors are partitioned here, because the flags were just found equal
      throw new DatasetRepositoryException(
          "Cannot change partition strategy from "
              + oldDescriptor.getPartitionStrategy()
              + " to "
              + descriptor.getPartitionStrategy());
    }

    // check can read records written with old schema using new schema
    final Schema oldSchema = oldDescriptor.getSchema();
    final Schema newSchema = descriptor.getSchema();
    if (!SchemaValidationUtil.canRead(oldSchema, newSchema)) {
      throw new IncompatibleSchemaException(
          "New schema cannot read data "
              + "written using "
              + "old schema. New schema: "
              + newSchema.toString(true)
              + "\nOld schema: "
              + oldSchema.toString(true));
    }

    DatasetDescriptor updatedDescriptor = metadataProvider.update(name, descriptor);
    updatedDescriptor = addRepositoryUri(updatedDescriptor);

    // pass the location itself rather than calling toString() on it: the logger renders it
    // the same way, and a missing location no longer fails here with a NullPointerException
    logger.debug(
        "Updated dataset:{} schema:{} datasetPath:{}",
        new Object[] {name, updatedDescriptor.getSchema(), updatedDescriptor.getLocation()});

    return new FileSystemDataset.Builder()
        .name(name)
        .configuration(conf)
        .descriptor(updatedDescriptor)
        .partitionKey(
            updatedDescriptor.isPartitioned()
                ? org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey()
                : null)
        .partitionListener(getPartitionListener())
        .build();
  }
Example #28
0
  /**
   * Merges the data files of {@code update} into this dataset by renaming each file to the
   * same relative path under this dataset's directory.
   *
   * <p>Both datasets must have the same format, the same partitioning (including the partition
   * strategy) and equal schemas. For a partitioned dataset with a partition listener, the
   * listener is notified once per distinct destination directory.
   *
   * @param update the dataset whose files are moved into this one
   * @throws DatasetRepositoryException if format, partitioning or schema differ
   * @throws DatasetException if a rename reports failure
   * @throws DatasetIOException if the file system fails with an I/O error
   */
  @Override
  public void merge(FileSystemDataset<E> update) {
    DatasetDescriptor updateDescriptor = update.getDescriptor();

    if (!updateDescriptor.getFormat().equals(descriptor.getFormat())) {
      throw new DatasetRepositoryException(
          "Cannot merge dataset format "
              + updateDescriptor.getFormat()
              + " with format "
              + descriptor.getFormat());
    }

    if (updateDescriptor.isPartitioned() != descriptor.isPartitioned()) {
      throw new DatasetRepositoryException(
          "Cannot merge an unpartitioned dataset with a " + " partitioned one or vice versa.");
    } else if (updateDescriptor.isPartitioned()
        && descriptor.isPartitioned()
        && !updateDescriptor.getPartitionStrategy().equals(descriptor.getPartitionStrategy())) {
      throw new DatasetRepositoryException(
          "Cannot merge dataset partition strategy "
              + updateDescriptor.getPartitionStrategy()
              + " with "
              + descriptor.getPartitionStrategy());
    }

    if (!updateDescriptor.getSchema().equals(descriptor.getSchema())) {
      // report the schemas themselves; the formats are already known to be equal here
      throw new DatasetRepositoryException(
          "Cannot merge dataset schema "
              + updateDescriptor.getSchema()
              + " with schema "
              + descriptor.getSchema());
    }

    // destination directories already reported to the listener
    Set<String> addedPartitions = Sets.newHashSet();
    for (Path path : update.pathIterator()) {
      // keep the file's path relative to its dataset root when moving it under this one
      URI relativePath = update.getDirectory().toUri().relativize(path.toUri());
      Path newPath = new Path(directory, new Path(relativePath));
      Path newPartitionDirectory = newPath.getParent();
      try {
        if (!fileSystem.exists(newPartitionDirectory)) {
          fileSystem.mkdirs(newPartitionDirectory);
        }
        logger.debug("Renaming {} to {}", path, newPath);
        boolean renameOk = fileSystem.rename(path, newPath);
        if (!renameOk) {
          throw new DatasetException(
              "Dataset merge failed during rename of " + path + " to " + newPath);
        }
      } catch (IOException e) {
        throw new DatasetIOException("Dataset merge failed", e);
      }
      if (descriptor.isPartitioned() && partitionListener != null) {
        String partition = newPartitionDirectory.toString();
        if (!addedPartitions.contains(partition)) {
          partitionListener.partitionAdded(name, partition);
          addedPartitions.add(partition);
        }
      }
    }
  }