/**
 * Loads an existing dataset by name.
 *
 * <p>The descriptor is fetched from the metadata provider, passed through
 * {@code addRepositoryUri}, and used to build a {@code FileSystemDataset}. A new partition key
 * is supplied to the builder only when the descriptor is partitioned; otherwise the builder
 * receives {@code null}.
 *
 * @param name the dataset name; must not be {@code null}
 * @return the dataset built from the stored descriptor
 * @throws IllegalArgumentException if {@code name} is {@code null}
 */
@Override
public <E> Dataset<E> load(String name) {
  Preconditions.checkArgument(name != null, "Name can not be null");

  logger.debug("Loading dataset:{}", name);

  final DatasetDescriptor resolved = addRepositoryUri(metadataProvider.load(name));

  final FileSystemDataset<E> loaded =
      new FileSystemDataset.Builder()
          .name(name)
          .configuration(conf)
          .descriptor(resolved)
          .partitionKey(
              resolved.isPartitioned()
                  ? org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey()
                  : null)
          .partitionListener(getPartitionListener())
          .build();

  logger.debug("Loaded dataset:{}", loaded);

  return loaded;
}
/**
 * Creates a new dataset with the given name and descriptor.
 *
 * <p>The caller must not set a location on the descriptor; the metadata provider is expected
 * to assign one. After the metadata is created, {@code ensureExists} is invoked for the new
 * descriptor and a {@code FileSystemDataset} is built from it.
 *
 * @param name the dataset name; must not be {@code null}
 * @param descriptor the descriptor to register; must not be {@code null} and must not carry a
 *     location
 * @return the newly created dataset
 * @throws IllegalArgumentException if an argument is {@code null} or the descriptor already has
 *     a location
 * @throws DatasetRepositoryException if the metadata provider returns a descriptor without a
 *     location
 */
@Override
public <E> Dataset<E> create(String name, DatasetDescriptor descriptor) {
  Preconditions.checkArgument(name != null, "Name can not be null");
  Preconditions.checkArgument(descriptor != null, "Descriptor can not be null");
  Preconditions.checkArgument(
      descriptor.getLocation() == null,
      "Descriptor location cannot be set; it is assigned by the MetadataProvider");

  final DatasetDescriptor created =
      addRepositoryUri(metadataProvider.create(name, descriptor));

  final URI location = created.getLocation();
  if (location == null) {
    // The provider contract is to assign a location; a missing one is an internal error.
    throw new DatasetRepositoryException(
        "[BUG] MetadataProvider did not assign a location to dataset:" + name);
  }

  ensureExists(created, conf);

  final Object[] logArgs = {name, created.getSchema(), location.toString()};
  logger.debug("Created dataset:{} schema:{} datasetPath:{}", logArgs);

  return new FileSystemDataset.Builder()
      .name(name)
      .configuration(conf)
      .descriptor(created)
      .partitionKey(
          created.isPartitioned()
              ? org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey()
              : null)
      .partitionListener(getPartitionListener())
      .build();
}
/**
 * Builds the {@link org.kitesdk.data.PartitionKey} that corresponds to a partition directory,
 * given as a {@link URI}, inside a filesystem dataset.
 *
 * <p>The path is qualified against the dataset's filesystem and relativized against the dataset
 * directory. Each remaining path segment must have the form {@code name=value}, where
 * {@code name} matches the field partitioner at the same position. A path with fewer segments
 * than there are field partitioners is accepted and yields a key with that many values. The
 * partition directory is not required to exist.
 *
 * @param dataset the filesystem dataset
 * @param partitionPath a directory path where the partition data is stored
 * @return a partition key representing the partition at the given path
 * @throws IllegalStateException if the dataset is not partitioned
 * @throws IllegalArgumentException if the dataset is not a {@code FileSystemDataset}, or the
 *     path is not a valid partition path for it
 * @since 0.4.0
 */
@SuppressWarnings("deprecation")
public static PartitionKey partitionKeyForPath(Dataset dataset, URI partitionPath) {
  Preconditions.checkState(
      dataset.getDescriptor().isPartitioned(),
      "Attempt to get a partition on a non-partitioned dataset (name:%s)",
      dataset.getName());
  Preconditions.checkArgument(
      dataset instanceof FileSystemDataset, "Dataset is not a FileSystemDataset");

  final FileSystemDataset fsDataset = (FileSystemDataset) dataset;
  final FileSystem fs = fsDataset.getFileSystem();
  final URI partitionUri = fs.makeQualified(new Path(partitionPath)).toUri();
  final URI directoryUri = fsDataset.getDirectory().toUri();

  // relativize() hands back its argument unchanged when it is not under the base URI.
  final URI relative = directoryUri.relativize(partitionUri);
  if (relative.equals(partitionUri)) {
    throw new IllegalArgumentException(
        String.format(
            "Partition URI %s has different root directory to dataset (directory: %s).",
            partitionUri, directoryUri));
  }

  // Materialise the split once so the segment count and the segments come from one pass.
  final List<String> segments = Lists.newArrayList(Splitter.on('/').split(relative.getPath()));
  final PartitionStrategy strategy = dataset.getDescriptor().getPartitionStrategy();
  final List<FieldPartitioner> partitioners = strategy.getFieldPartitioners();

  if (segments.size() > partitioners.size()) {
    throw new IllegalArgumentException(
        String.format(
            "Too many partition directories for %s (%s), expecting %s.",
            partitionUri, segments.size(), partitioners.size()));
  }

  final List<Object> keyValues = Lists.newArrayList();
  for (int index = 0; index < segments.size(); index++) {
    final List<String> nameAndValue =
        Lists.newArrayList(Splitter.on('=').split(segments.get(index)));
    final String fieldName = nameAndValue.get(0);
    final FieldPartitioner partitioner = partitioners.get(index);

    if (!fieldName.equals(partitioner.getName())) {
      throw new IllegalArgumentException(
          String.format(
              "Unrecognized partition name '%s' in partition %s, expecting '%s'.",
              fieldName, partitionUri, partitioner.getName()));
    }
    if (nameAndValue.size() < 2) {
      throw new IllegalArgumentException(
          String.format(
              "Missing partition value for '%s' in partition %s.", fieldName, partitionUri));
    }

    // Only the text between the first and second '=' is used as the value.
    keyValues.add(partitioner.valueFromString(nameAndValue.get(1)));
  }

  return org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey(keyValues.toArray());
}
/**
 * Builds the partition key for a directory directly beneath this dataset.
 *
 * <p>The key holds the values of this dataset's own partition key, if it has one, followed by
 * the value converted from the directory's name using the first field partitioner of the
 * partition strategy.
 *
 * @param dir the partition directory; only its final name component is read
 * @return a partition key with the inherited values plus the value for {@code dir}
 */
@SuppressWarnings("unchecked")
PartitionKey fromDirectoryName(Path dir) {
  final FieldPartitioner firstPartitioner = partitionStrategy.getFieldPartitioners().get(0);

  final List<Object> keyValues = Lists.newArrayList();
  if (partitionKey != null) {
    keyValues.addAll(partitionKey.getValues());
  }

  final String directoryName = dir.getName();
  keyValues.add(convert.valueForDirname(firstPartitioner, directoryName));

  final Object[] asArray = keyValues.toArray();
  return Accessor.getDefault().newPartitionKey(asArray);
}
/**
 * Returns the dataset view for the partition identified by {@code key}.
 *
 * <p>If the partition directory does not exist it is created when {@code allowCreate} is
 * {@code true}, and the partition listener (if any) is notified; otherwise {@code null} is
 * returned. The returned dataset is located at the partition directory and uses the
 * sub-partition strategy starting at the depth of {@code key}.
 *
 * @param key the partition key identifying the partition
 * @param allowCreate whether a missing partition directory may be created
 * @return the partition dataset, or {@code null} if the partition directory does not exist and
 *     {@code allowCreate} is {@code false}
 * @throws IllegalStateException if this dataset is not partitioned
 * @throws DatasetException if the partition directory cannot be checked or created
 */
@Override
@Nullable
@Deprecated
public Dataset<E> getPartition(PartitionKey key, boolean allowCreate) {
  Preconditions.checkState(
      descriptor.isPartitioned(),
      "Attempt to get a partition on a non-partitioned dataset (name:%s)",
      name);

  logger.debug("Loading partition for key {}, allowCreate:{}", key, allowCreate);

  Path partitionDirectory = fileSystem.makeQualified(toDirectoryName(directory, key));

  try {
    if (!fileSystem.exists(partitionDirectory)) {
      if (!allowCreate) {
        return null;
      }
      // mkdirs reports some failures through its return value rather than an exception.
      // Treat false as a failure so the listener is never told about a partition that was
      // not created; the catch below converts it to the usual DatasetException.
      if (!fileSystem.mkdirs(partitionDirectory)) {
        throw new IOException("mkdirs returned false for " + partitionDirectory);
      }
      if (partitionListener != null) {
        partitionListener.partitionAdded(name, toRelativeDirectory(key).toString());
      }
    }
  } catch (IOException e) {
    throw new DatasetException(
        "Unable to locate or create dataset partition directory " + partitionDirectory, e);
  }

  int partitionDepth = key.getLength();
  PartitionStrategy subpartitionStrategy =
      Accessor.getDefault().getSubpartitionStrategy(partitionStrategy, partitionDepth);

  return new FileSystemDataset.Builder()
      .name(name)
      .fileSystem(fileSystem)
      .descriptor(
          new DatasetDescriptor.Builder(descriptor)
              .location(partitionDirectory)
              .partitionStrategy(subpartitionStrategy)
              .build())
      .partitionKey(key)
      .partitionListener(partitionListener)
      .build();
}
/**
 * Returns one dataset per entry listed directly under this dataset's directory.
 *
 * <p>Entries are listed through the filesystem with the {@code PathFilters.notHidden()} filter.
 * Each entry becomes a dataset located at that path, keyed by
 * {@link #fromDirectoryName(Path)} and using the sub-partition strategy at depth 1.
 *
 * @return the partition datasets, in the order the filesystem listed them
 * @throws IllegalStateException if this dataset is not partitioned
 * @throws DatasetException if the directory cannot be listed
 */
@Override
@Deprecated
public Iterable<Dataset<E>> getPartitions() {
  Preconditions.checkState(
      descriptor.isPartitioned(),
      "Attempt to get partitions on a non-partitioned dataset (name:%s)",
      name);

  final List<Dataset<E>> result = Lists.newArrayList();

  final FileStatus[] entries;
  try {
    entries = fileSystem.listStatus(directory, PathFilters.notHidden());
  } catch (IOException e) {
    throw new DatasetException(
        "Unable to list partition directory for directory " + directory, e);
  }

  for (int index = 0; index < entries.length; index++) {
    final Path qualified = fileSystem.makeQualified(entries[index].getPath());
    final PartitionKey childKey = fromDirectoryName(qualified);
    final PartitionStrategy childStrategy =
        Accessor.getDefault().getSubpartitionStrategy(partitionStrategy, 1);

    result.add(
        new FileSystemDataset.Builder()
            .name(name)
            .fileSystem(fileSystem)
            .descriptor(
                new DatasetDescriptor.Builder(descriptor)
                    .location(qualified)
                    .partitionStrategy(childStrategy)
                    .build())
            .partitionKey(childKey)
            .partitionListener(partitionListener)
            .<E>build());
  }

  return result;
}
@Override public <E> Dataset<E> update(String name, DatasetDescriptor descriptor) { Preconditions.checkArgument(name != null, "Dataset name cannot be null"); Preconditions.checkArgument(descriptor != null, "DatasetDescriptro cannot be null"); DatasetDescriptor oldDescriptor = metadataProvider.load(name); // oldDescriptor is valid if load didn't throw NoSuchDatasetException if (!oldDescriptor.getFormat().equals(descriptor.getFormat())) { throw new DatasetRepositoryException( "Cannot change dataset format from " + oldDescriptor.getFormat() + " to " + descriptor.getFormat()); } final URI oldLocation = oldDescriptor.getLocation(); if ((oldLocation != null) && !(oldLocation.equals(descriptor.getLocation()))) { throw new DatasetRepositoryException("Cannot change the dataset's location"); } if (oldDescriptor.isPartitioned() != descriptor.isPartitioned()) { throw new DatasetRepositoryException( "Cannot change an unpartitioned dataset to " + " partitioned or vice versa."); } else if (oldDescriptor.isPartitioned() && descriptor.isPartitioned() && !oldDescriptor.getPartitionStrategy().equals(descriptor.getPartitionStrategy())) { throw new DatasetRepositoryException( "Cannot change partition strategy from " + oldDescriptor.getPartitionStrategy() + " to " + descriptor.getPartitionStrategy()); } // check can read records written with old schema using new schema final Schema oldSchema = oldDescriptor.getSchema(); final Schema newSchema = descriptor.getSchema(); if (!SchemaValidationUtil.canRead(oldSchema, newSchema)) { throw new IncompatibleSchemaException( "New schema cannot read data " + "written using " + "old schema. 
New schema: " + newSchema.toString(true) + "\nOld schema: " + oldSchema.toString(true)); } DatasetDescriptor updatedDescriptor = metadataProvider.update(name, descriptor); updatedDescriptor = addRepositoryUri(updatedDescriptor); logger.debug( "Updated dataset:{} schema:{} datasetPath:{}", new Object[] { name, updatedDescriptor.getSchema(), updatedDescriptor.getLocation().toString() }); return new FileSystemDataset.Builder() .name(name) .configuration(conf) .descriptor(updatedDescriptor) .partitionKey( updatedDescriptor.isPartitioned() ? org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey() : null) .partitionListener(getPartitionListener()) .build(); }