/**
 * Merges the data files of {@code update} into this dataset by renaming each file
 * into the corresponding location under this dataset's directory.
 *
 * <p>Before any file is moved, the descriptor of {@code update} is checked against
 * this dataset's descriptor: the format, the partitioned/unpartitioned state, the
 * partition strategy (when both are partitioned) and the schema must all be equal.
 *
 * <p>For a partitioned dataset with a non-null partition listener, the listener is
 * notified once per distinct destination partition directory.
 *
 * @param update the dataset whose files are moved into this dataset
 * @throws DatasetRepositoryException if the format, partitioning, partition strategy
 *     or schema of {@code update} differs from this dataset's
 * @throws DatasetException if the file system reports that a rename did not succeed
 * @throws DatasetIOException if the file system throws an {@link IOException}
 */
@Override
public void merge(FileSystemDataset<E> update) {
  DatasetDescriptor updateDescriptor = update.getDescriptor();

  if (!updateDescriptor.getFormat().equals(descriptor.getFormat())) {
    throw new DatasetRepositoryException(
        "Cannot merge dataset format " + updateDescriptor.getFormat()
            + " with format " + descriptor.getFormat());
  }

  if (updateDescriptor.isPartitioned() != descriptor.isPartitioned()) {
    throw new DatasetRepositoryException(
        "Cannot merge an unpartitioned dataset with a "
            + " partitioned one or vice versa.");
  } else if (updateDescriptor.isPartitioned() && descriptor.isPartitioned()
      && !updateDescriptor.getPartitionStrategy().equals(descriptor.getPartitionStrategy())) {
    throw new DatasetRepositoryException(
        "Cannot merge dataset partition strategy " + updateDescriptor.getPartitionStrategy()
            + " with " + descriptor.getPartitionStrategy());
  }

  if (!updateDescriptor.getSchema().equals(descriptor.getSchema())) {
    // Report the schemas themselves; reporting the formats here would print two
    // identical values (the formats were already verified equal above).
    throw new DatasetRepositoryException(
        "Cannot merge dataset schema " + updateDescriptor.getSchema()
            + " with schema " + descriptor.getSchema());
  }

  // Partition directories the listener has already been told about during this merge.
  Set<String> addedPartitions = Sets.newHashSet();

  for (Path path : update.pathIterator()) {
    // Re-root the file: take its path relative to the update's directory and
    // resolve that against this dataset's directory.
    URI relativePath = update.getDirectory().toUri().relativize(path.toUri());
    Path newPath = new Path(directory, new Path(relativePath));
    Path newPartitionDirectory = newPath.getParent();

    try {
      if (!fileSystem.exists(newPartitionDirectory)) {
        fileSystem.mkdirs(newPartitionDirectory);
      }
      logger.debug("Renaming {} to {}", path, newPath);
      boolean renameOk = fileSystem.rename(path, newPath);
      if (!renameOk) {
        throw new DatasetException(
            "Dataset merge failed during rename of " + path + " to " + newPath);
      }
    } catch (IOException e) {
      throw new DatasetIOException("Dataset merge failed", e);
    }

    if (descriptor.isPartitioned() && partitionListener != null) {
      String partition = newPartitionDirectory.toString();
      // Set.add returns true only the first time a partition is seen.
      if (addedPartitions.add(partition)) {
        partitionListener.partitionAdded(name, partition);
      }
    }
  }
}
/**
 * Builds the {@link org.kitesdk.data.PartitionKey} that corresponds to a partition
 * directory, given as a {@link URI}, inside a filesystem dataset. The partition itself
 * does not need to exist; only the shape of the path is examined.
 *
 * <p>The path is qualified against the dataset's file system and relativized against
 * the dataset directory. Each remaining path segment must have the form
 * {@code name=value}, where {@code name} matches the field partitioner at the same
 * position in the dataset's partition strategy.
 *
 * @param dataset the filesystem dataset
 * @param partitionPath a directory path where the partition data is stored
 * @return a partition key representing the partition at the given path
 * @throws IllegalArgumentException if the dataset is not a {@code FileSystemDataset},
 *     or if the path is not under the dataset directory, has more segments than the
 *     strategy has field partitioners, names an unexpected field, or lacks a value
 * @since 0.4.0
 */
@SuppressWarnings("deprecation")
public static PartitionKey partitionKeyForPath(Dataset dataset, URI partitionPath) {
  Preconditions.checkState(
      dataset.getDescriptor().isPartitioned(),
      "Attempt to get a partition on a non-partitioned dataset (name:%s)",
      dataset.getName());
  Preconditions.checkArgument(
      dataset instanceof FileSystemDataset, "Dataset is not a FileSystemDataset");

  FileSystemDataset fsDataset = (FileSystemDataset) dataset;
  FileSystem fs = fsDataset.getFileSystem();

  URI partitionUri = fs.makeQualified(new Path(partitionPath)).toUri();
  URI directoryUri = fsDataset.getDirectory().toUri();
  URI relativizedUri = directoryUri.relativize(partitionUri);

  // URI.relativize hands back its argument when it cannot be relativized against
  // the base, which means the partition does not live under the dataset directory.
  if (relativizedUri.equals(partitionUri)) {
    String message = String.format(
        "Partition URI %s has different " + "root directory to dataset (directory: %s).",
        partitionUri, directoryUri);
    throw new IllegalArgumentException(message);
  }

  Iterable<String> parts = Splitter.on('/').split(relativizedUri.getPath());
  PartitionStrategy partitionStrategy = dataset.getDescriptor().getPartitionStrategy();
  List<FieldPartitioner> fieldPartitioners = partitionStrategy.getFieldPartitioners();

  int partCount = Iterables.size(parts);
  int partitionerCount = fieldPartitioners.size();
  if (partCount > partitionerCount) {
    String message = String.format(
        "Too many partition directories " + "for %s (%s), expecting %s.",
        partitionUri, partCount, partitionerCount);
    throw new IllegalArgumentException(message);
  }

  List<Object> values = Lists.newArrayList();
  int position = 0;
  for (Iterator<String> segments = parts.iterator(); segments.hasNext(); position++) {
    String segment = segments.next();
    Iterator<String> nameAndValue = Splitter.on('=').split(segment).iterator();

    String fieldName = nameAndValue.next();
    FieldPartitioner fp = fieldPartitioners.get(position);
    String expectedName = fp.getName();

    // The field name is validated before checking that a value is present.
    if (!fieldName.equals(expectedName)) {
      String message = String.format(
          "Unrecognized partition name " + "'%s' in partition %s, expecting '%s'.",
          fieldName, partitionUri, fp.getName());
      throw new IllegalArgumentException(message);
    }
    if (!nameAndValue.hasNext()) {
      String message = String.format(
          "Missing partition value for " + "'%s' in partition %s.",
          fieldName, partitionUri);
      throw new IllegalArgumentException(message);
    }

    String stringValue = nameAndValue.next();
    values.add(fp.valueFromString(stringValue));
  }

  Object[] keyValues = values.toArray(new Object[values.size()]);
  return org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey(keyValues);
}
/**
 * Creates and opens a {@code FileSystemWriter} for the partition identified by
 * {@code key}.
 *
 * <p>The key is converted to a relative partition path, which is resolved against the
 * dataset directory to form the writer's target. If the dataset has a partition
 * listener, it is notified of the partition before the writer is opened.
 *
 * @param key the storage key of the partition to write to
 * @return an opened writer targeting the partition's directory
 * @throws IllegalStateException if the view's dataset is not a {@code FileSystemDataset}
 * @throws Exception declared by the overridden method's signature
 */
@Override
public DatasetWriter<E> load(StorageKey key) throws Exception {
  Object viewDataset = view.getDataset();
  Preconditions.checkState(
      viewDataset instanceof FileSystemDataset,
      "FileSystemWriters cannot create writer for " + viewDataset);
  FileSystemDataset dataset = (FileSystemDataset) viewDataset;

  // Relative partition path for the key, and its absolute location in the dataset.
  Path partition = convert.fromKey(key);
  Path partitionDirectory = new Path(dataset.getDirectory(), partition);

  DatasetWriter<E> writer =
      new FileSystemWriter<E>(
          dataset.getFileSystem(), partitionDirectory, dataset.getDescriptor());

  // The listener receives the relative partition path, not the absolute directory.
  PartitionListener listener = dataset.getPartitionListener();
  if (listener != null) {
    listener.partitionAdded(dataset.getName(), partition.toString());
  }

  writer.open();
  return writer;
}