@Override @SuppressWarnings({"unchecked", "deprecation"}) protected void populateAvroHeaders(Map<String, String> hdrs, Schema schema, Object message) { if (!initialized) { // initialize here rather than in activateOptions to avoid initialization // cycle in Configuration and log4j try { URI datasetUri = new URIBuilder(datasetRepositoryUri, datasetName).build(); Dataset dataset = Datasets.load(datasetUri); if (dataset.getDescriptor().isPartitioned()) { partitionStrategy = dataset.getDescriptor().getPartitionStrategy(); } URL schemaUrl = dataset.getDescriptor().getSchemaUrl(); if (schemaUrl != null) { setAvroSchemaUrl(schemaUrl.toExternalForm()); } } catch (Exception e) { throw new FlumeException(e); } finally { initialized = true; } } super.populateAvroHeaders(hdrs, schema, message); if (partitionStrategy != null) { key = PartitionKey.partitionKeyForEntity(partitionStrategy, message, key); int i = 0; for (FieldPartitioner fp : partitionStrategy.getFieldPartitioners()) { hdrs.put(PARTITION_PREFIX + fp.getName(), fp.valueToString(key.get(i++))); } } }
/**
 * Creates a partition strategy from an ordered list of field partitioners.
 *
 * <p>The list is copied into an immutable list, and a second immutable index maps each
 * partitioner's name to the partitioner itself. Guava's {@code ImmutableMap.Builder} rejects
 * duplicate keys when building, so partitioner names are expected to be unique.
 *
 * @param partitioners the field partitioners, in partitioning order
 */
PartitionStrategy(List<FieldPartitioner> partitioners) {
  this.fieldPartitioners = ImmutableList.copyOf(partitioners);

  ImmutableMap.Builder<String, FieldPartitioner> byName = ImmutableMap.builder();
  for (FieldPartitioner partitioner : this.fieldPartitioners) {
    byName.put(partitioner.getName(), partitioner);
  }
  this.partitionerMap = byName.build();
}
/**
 * Registers a field partitioner and records its name as taken.
 *
 * <p>The name is first validated through {@code ValidationException.check} against the names
 * already recorded; the partitioner is appended only after that check.
 *
 * @param fp the field partitioner to append
 */
private void add(FieldPartitioner fp) {
  final String partitionName = fp.getName();
  final boolean nameIsFree = !names.contains(partitionName);
  ValidationException.check(
      nameIsFree,
      "Partition name %s conflicts with an existing field or partition name",
      partitionName);
  fieldPartitioners.add(fp);
  names.add(partitionName);
}
/**
 * Builds the {@link org.kitesdk.data.PartitionKey} that corresponds to a partition's filesystem
 * path, given as a {@link URI}. The partition does not have to exist.
 *
 * <p>The path is qualified against the dataset's file system and relativized against the
 * dataset directory. Each remaining path segment must have the form {@code name=value}, where
 * the names match the dataset's field partitioners in order. A path with fewer segments than
 * there are field partitioners is accepted.
 *
 * @param dataset the filesystem dataset
 * @param partitionPath a directory path where the partition data is stored
 * @return a partition key representing the partition at the given path
 * @throws IllegalStateException if the dataset is not partitioned
 * @throws IllegalArgumentException if the dataset is not a {@code FileSystemDataset}, or if the
 *     path is not under the dataset directory, has too many segments, names an unexpected
 *     partition, or is missing a partition value
 * @since 0.4.0
 */
@SuppressWarnings("deprecation")
public static PartitionKey partitionKeyForPath(Dataset dataset, URI partitionPath) {
  Preconditions.checkState(
      dataset.getDescriptor().isPartitioned(),
      "Attempt to get a partition on a non-partitioned dataset (name:%s)",
      dataset.getName());
  Preconditions.checkArgument(
      dataset instanceof FileSystemDataset, "Dataset is not a FileSystemDataset");

  FileSystemDataset fsDataset = (FileSystemDataset) dataset;
  URI partitionUri =
      fsDataset.getFileSystem().makeQualified(new Path(partitionPath)).toUri();
  URI directoryUri = fsDataset.getDirectory().toUri();

  // URI.relativize hands back its argument unchanged when it cannot be relativized.
  URI relativeUri = directoryUri.relativize(partitionUri);
  if (relativeUri.equals(partitionUri)) {
    throw new IllegalArgumentException(
        String.format(
            "Partition URI %s has different root directory to dataset (directory: %s).",
            partitionUri, directoryUri));
  }

  // Materialize the path segments once so they can be counted and indexed.
  List<String> segments = Lists.newArrayList(Splitter.on('/').split(relativeUri.getPath()));
  List<FieldPartitioner> partitioners =
      dataset.getDescriptor().getPartitionStrategy().getFieldPartitioners();
  if (segments.size() > partitioners.size()) {
    throw new IllegalArgumentException(
        String.format(
            "Too many partition directories for %s (%s), expecting %s.",
            partitionUri, segments.size(), partitioners.size()));
  }

  Splitter nameValueSplitter = Splitter.on('=');
  List<Object> keyValues = Lists.newArrayList();
  for (int index = 0; index < segments.size(); index++) {
    Iterator<String> nameAndValue = nameValueSplitter.split(segments.get(index)).iterator();
    String partitionName = nameAndValue.next();
    FieldPartitioner partitioner = partitioners.get(index);

    if (!partitionName.equals(partitioner.getName())) {
      throw new IllegalArgumentException(
          String.format(
              "Unrecognized partition name '%s' in partition %s, expecting '%s'.",
              partitionName, partitionUri, partitioner.getName()));
    }
    if (!nameAndValue.hasNext()) {
      throw new IllegalArgumentException(
          String.format(
              "Missing partition value for '%s' in partition %s.",
              partitionName, partitionUri));
    }

    keyValues.add(partitioner.valueFromString(nameAndValue.next()));
  }

  return org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey(keyValues.toArray());
}
/**
 * Return the cardinality produced by the contained field partitioners.
 *
 * <p>This can be used to aid in calculating resource usage during certain operations. For
 * example, when writing data to a partitioned dataset, you can use this method to estimate (or
 * discover exactly, depending on the partition functions) how many leaf partitions exist.
 *
 * <p><strong>Warning:</strong> This method is allowed to lie and should be treated only as a
 * hint. Some partition functions are fixed (for example, hash modulo number of buckets), while
 * others are open-ended (for example, discrete value) and depend on the input data.
 *
 * <p>The result is the product of the individual partitioner cardinalities. If any partitioner
 * reports {@link FieldPartitioner#UNKNOWN_CARDINALITY}, that value is returned. If the product
 * exceeds {@link Integer#MAX_VALUE}, the result saturates at {@code Integer.MAX_VALUE} instead
 * of silently wrapping around.
 *
 * @return The estimated (or possibly concrete) number of leaf partitions.
 */
public int getCardinality() {
  // Accumulate in a long: two int factors always fit, so overflow can be detected and capped.
  long product = 1L;
  for (FieldPartitioner fieldPartitioner : fieldPartitioners) {
    int fieldCardinality = fieldPartitioner.getCardinality();
    if (fieldCardinality == FieldPartitioner.UNKNOWN_CARDINALITY) {
      return FieldPartitioner.UNKNOWN_CARDINALITY;
    }
    product *= fieldCardinality;
    if (product > Integer.MAX_VALUE) {
      // Keep looping after saturation: a later partitioner may still be unknown.
      product = Integer.MAX_VALUE;
    }
  }
  return (int) product;
}