/**
 * Registers a new dataset called {@code name} with the metadata provider, makes sure its data
 * location exists, and returns a {@code FileSystemDataset} bound to that location.
 *
 * @param name the dataset name; must not be null
 * @param descriptor the descriptor to register; must not be null and must not carry a location,
 *     because the location is assigned by the MetadataProvider
 * @return the dataset built from the descriptor returned by the metadata provider
 * @throws DatasetRepositoryException if the metadata provider returns a descriptor without a
 *     location
 */
@Override
public <E> Dataset<E> create(String name, DatasetDescriptor descriptor) {
  Preconditions.checkArgument(name != null, "Name can not be null");
  Preconditions.checkArgument(descriptor != null, "Descriptor can not be null");
  Preconditions.checkArgument(
      descriptor.getLocation() == null,
      "Descriptor location cannot be set; it is assigned by the MetadataProvider");

  // Register the metadata first, then decorate the stored descriptor with the repository URI.
  final DatasetDescriptor created = addRepositoryUri(metadataProvider.create(name, descriptor));

  final URI dataLocation = created.getLocation();
  if (dataLocation == null) {
    throw new DatasetRepositoryException(
        "[BUG] MetadataProvider did not assign a location to dataset:" + name);
  }

  ensureExists(created, conf);

  logger.debug(
      "Created dataset:{} schema:{} datasetPath:{}",
      new Object[] {name, created.getSchema(), dataLocation.toString()});

  // Only partitioned datasets get a (fresh) partition key; otherwise the key is left null.
  return new FileSystemDataset.Builder()
      .name(name)
      .configuration(conf)
      .descriptor(created)
      .partitionKey(
          created.isPartitioned()
              ? org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey()
              : null)
      .partitionListener(getPartitionListener())
      .build();
}
/**
 * Creates the location of {@code descriptor} if it does not exist yet.
 *
 * @param descriptor a DatasetDescriptor whose location must be set
 * @param conf a Configuration, passed to {@code fsForPath} to obtain the FileSystem
 * @throws DatasetRepositoryException if checking for or creating the location fails with an
 *     IOException
 */
static void ensureExists(DatasetDescriptor descriptor, Configuration conf) {
  Preconditions.checkArgument(
      descriptor.getLocation() != null,
      "Cannot get FileSystem for a descriptor with no location");

  final Path location = new Path(descriptor.getLocation());
  final FileSystem fileSystem = fsForPath(location, conf);

  try {
    if (fileSystem.exists(location)) {
      // Already present: nothing to create.
      return;
    }
    fileSystem.mkdirs(location);
  } catch (IOException ioe) {
    throw new DatasetRepositoryException("Cannot access data location", ioe);
  }
}
/**
 * Sets the descriptor the dataset will be built from.
 *
 * @param descriptor a DatasetDescriptor with a non-null location
 * @return this builder, for method chaining
 */
public Builder descriptor(DatasetDescriptor descriptor) {
  final boolean hasLocation = descriptor.getLocation() != null;
  Preconditions.checkArgument(hasLocation, "Dataset location cannot be null");

  this.descriptor = descriptor;
  return this;
}
@Override public boolean delete(String name) { Preconditions.checkArgument(name != null, "Name can not be null"); logger.debug("Deleting dataset:{}", name); DatasetDescriptor descriptor; try { descriptor = metadataProvider.load(name); descriptor = addRepositoryUri(descriptor); } catch (DatasetNotFoundException ex) { return false; } boolean changed; try { // don't care about the return value here -- if it already doesn't exist // we still need to delete the data directory changed = metadataProvider.delete(name); } catch (MetadataProviderException ex) { throw new DatasetRepositoryException("Failed to delete descriptor for name:" + name, ex); } final Path dataLocation = new Path(descriptor.getLocation()); final FileSystem fs = fsForPath(dataLocation, conf); try { if (fs.exists(dataLocation)) { if (fs.delete(dataLocation, true)) { changed = true; } else { throw new DatasetRepositoryException( "Failed to delete dataset name:" + name + " location:" + dataLocation); } } } catch (IOException e) { throw new DatasetRepositoryException( "Internal failure when removing location:" + dataLocation); } return changed; }
/**
 * Builds the {@code FileSystemDataset} from the values collected by this builder.
 *
 * <p>The dataset directory is taken from the descriptor's location. When no FileSystem was
 * supplied, one is obtained from that directory using the configuration.
 *
 * @return a new FileSystemDataset rooted at the qualified dataset directory
 * @throws DatasetException if the FileSystem cannot be obtained because of an IOException
 */
public <E> FileSystemDataset<E> build() {
  Preconditions.checkState(this.name != null, "No dataset name defined");
  Preconditions.checkState(this.descriptor != null, "No dataset descriptor defined");
  Preconditions.checkState(
      !(conf == null && fileSystem == null), "Configuration or FileSystem must be set");

  this.directory = new Path(descriptor.getLocation());

  if (this.fileSystem == null) {
    // No explicit FileSystem was given: resolve it from the directory and the configuration.
    try {
      this.fileSystem = this.directory.getFileSystem(conf);
    } catch (IOException ioe) {
      throw new DatasetException("Cannot access FileSystem", ioe);
    }
  }

  final Path qualifiedDirectory = this.fileSystem.makeQualified(this.directory);
  return new FileSystemDataset<E>(
      this.fileSystem, qualifiedDirectory, name, descriptor, partitionKey, partitionListener);
}
/**
 * Verifies that {@code updated} is an acceptable replacement for {@code existing}: the location
 * is passed to {@code checkNotChanged}, then both descriptors are passed to
 * {@code checkCompatible}.
 *
 * @param existing the current {@code DatasetDescriptor} for a dataset
 * @param updated a new {@code DatasetDescriptor} for the same dataset
 */
public static void checkUpdate(DatasetDescriptor existing, DatasetDescriptor updated) {
  final java.net.URI currentLocation = existing.getLocation();
  final java.net.URI proposedLocation = updated.getLocation();

  checkNotChanged("location", currentLocation, proposedLocation);
  checkCompatible(existing, updated);
}
@Override public <E> Dataset<E> update(String name, DatasetDescriptor descriptor) { Preconditions.checkArgument(name != null, "Dataset name cannot be null"); Preconditions.checkArgument(descriptor != null, "DatasetDescriptro cannot be null"); DatasetDescriptor oldDescriptor = metadataProvider.load(name); // oldDescriptor is valid if load didn't throw NoSuchDatasetException if (!oldDescriptor.getFormat().equals(descriptor.getFormat())) { throw new DatasetRepositoryException( "Cannot change dataset format from " + oldDescriptor.getFormat() + " to " + descriptor.getFormat()); } final URI oldLocation = oldDescriptor.getLocation(); if ((oldLocation != null) && !(oldLocation.equals(descriptor.getLocation()))) { throw new DatasetRepositoryException("Cannot change the dataset's location"); } if (oldDescriptor.isPartitioned() != descriptor.isPartitioned()) { throw new DatasetRepositoryException( "Cannot change an unpartitioned dataset to " + " partitioned or vice versa."); } else if (oldDescriptor.isPartitioned() && descriptor.isPartitioned() && !oldDescriptor.getPartitionStrategy().equals(descriptor.getPartitionStrategy())) { throw new DatasetRepositoryException( "Cannot change partition strategy from " + oldDescriptor.getPartitionStrategy() + " to " + descriptor.getPartitionStrategy()); } // check can read records written with old schema using new schema final Schema oldSchema = oldDescriptor.getSchema(); final Schema newSchema = descriptor.getSchema(); if (!SchemaValidationUtil.canRead(oldSchema, newSchema)) { throw new IncompatibleSchemaException( "New schema cannot read data " + "written using " + "old schema. 
New schema: " + newSchema.toString(true) + "\nOld schema: " + oldSchema.toString(true)); } DatasetDescriptor updatedDescriptor = metadataProvider.update(name, descriptor); updatedDescriptor = addRepositoryUri(updatedDescriptor); logger.debug( "Updated dataset:{} schema:{} datasetPath:{}", new Object[] { name, updatedDescriptor.getSchema(), updatedDescriptor.getLocation().toString() }); return new FileSystemDataset.Builder() .name(name) .configuration(conf) .descriptor(updatedDescriptor) .partitionKey( updatedDescriptor.isPartitioned() ? org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey() : null) .partitionListener(getPartitionListener()) .build(); }