Example #1
0
 /**
  * Adds the Avro headers supplied by the superclass and, when a partition strategy has been
  * loaded for the target dataset, one extra header per field partitioner.
  *
  * <p>Dataset metadata is looked up on the first call rather than in activateOptions, to avoid an
  * initialization cycle in Configuration and log4j.
  *
  * @param hdrs header map to populate
  * @param schema Avro schema passed through to the superclass
  * @param message entity from which the partition key is derived
  */
 @Override
 @SuppressWarnings({"unchecked", "deprecation"})
 protected void populateAvroHeaders(Map<String, String> hdrs, Schema schema, Object message) {
   if (!initialized) {
     loadDatasetSettings();
   }

   super.populateAvroHeaders(hdrs, schema, message);

   if (partitionStrategy == null) {
     // Nothing was loaded for partitioning, so no partition headers are added.
     return;
   }

   // The key field is handed back in as the last argument on every call.
   key = PartitionKey.partitionKeyForEntity(partitionStrategy, message, key);
   int position = 0;
   for (FieldPartitioner partitioner : partitionStrategy.getFieldPartitioners()) {
     String headerName = PARTITION_PREFIX + partitioner.getName();
     hdrs.put(headerName, partitioner.valueToString(key.get(position)));
     position++;
   }
 }

 /**
  * Loads the dataset once and records its partition strategy and schema URL, when present.
  *
  * <p>The initialized flag is set whether or not loading succeeds, so the lookup is attempted
  * only once. Any failure is rethrown wrapped in a FlumeException.
  */
 @SuppressWarnings("deprecation")
 private void loadDatasetSettings() {
   try {
     URI datasetUri = new URIBuilder(datasetRepositoryUri, datasetName).build();
     Dataset dataset = Datasets.load(datasetUri);
     if (dataset.getDescriptor().isPartitioned()) {
       partitionStrategy = dataset.getDescriptor().getPartitionStrategy();
     }
     URL schemaUrl = dataset.getDescriptor().getSchemaUrl();
     if (schemaUrl != null) {
       setAvroSchemaUrl(schemaUrl.toExternalForm());
     }
   } catch (Exception e) {
     throw new FlumeException(e);
   } finally {
     initialized = true;
   }
 }
Example #2
0
 /**
  * Builds a partition strategy from the given field partitioners.
  *
  * <p>The partitioners are copied into an immutable list, and a second immutable lookup keyed by
  * each partitioner's name is built from that copy.
  *
  * @param partitioners the field partitioners that make up this strategy
  */
 PartitionStrategy(List<FieldPartitioner> partitioners) {
   ImmutableList<FieldPartitioner> ordered = ImmutableList.copyOf(partitioners);
   this.fieldPartitioners = ordered;

   ImmutableMap.Builder<String, FieldPartitioner> byName = ImmutableMap.builder();
   for (FieldPartitioner partitioner : ordered) {
     byName.put(partitioner.getName(), partitioner);
   }
   this.partitionerMap = byName.build();
 }
Example #3
0
 /**
  * Registers a field partitioner, after validating that its name has not been recorded already.
  *
  * @param fp the field partitioner to append
  */
 private void add(FieldPartitioner fp) {
   String partitionName = fp.getName();
   boolean nameIsFree = !names.contains(partitionName);
   ValidationException.check(
       nameIsFree,
       "Partition name %s conflicts with an existing field or partition name",
       partitionName);

   // Record both the partitioner and its name so later additions are checked against it.
   fieldPartitioners.add(fp);
   names.add(partitionName);
 }
  /**
   * Builds the {@link org.kitesdk.data.PartitionKey} that corresponds to a partition directory,
   * given as a {@link URI}, inside a filesystem dataset. The path is only parsed, so the
   * partition itself does not have to exist.
   *
   * <p>The path is qualified against the dataset's file system and relativized against the
   * dataset directory. Each remaining {@code name=value} segment is then matched, in order,
   * against the dataset's field partitioners.
   *
   * @param dataset the filesystem dataset
   * @param partitionPath a directory path where the partition data is stored
   * @return a partition key representing the partition at the given path
   * @throws IllegalStateException if the dataset is not partitioned
   * @throws IllegalArgumentException if the dataset is not a FileSystemDataset, or the path is
   *     not a valid partition of it
   * @since 0.4.0
   */
  @SuppressWarnings("deprecation")
  public static PartitionKey partitionKeyForPath(Dataset dataset, URI partitionPath) {
    Preconditions.checkState(
        dataset.getDescriptor().isPartitioned(),
        "Attempt to get a partition on a non-partitioned dataset (name:%s)",
        dataset.getName());
    Preconditions.checkArgument(
        dataset instanceof FileSystemDataset, "Dataset is not a FileSystemDataset");

    FileSystemDataset fsDataset = (FileSystemDataset) dataset;
    FileSystem fs = fsDataset.getFileSystem();
    URI partitionUri = fs.makeQualified(new Path(partitionPath)).toUri();
    URI directoryUri = fsDataset.getDirectory().toUri();

    // relativize hands back its argument unchanged when it cannot be made relative.
    URI relativizedUri = directoryUri.relativize(partitionUri);
    if (relativizedUri.equals(partitionUri)) {
      throw new IllegalArgumentException(
          String.format(
              "Partition URI %s has different root directory to dataset (directory: %s).",
              partitionUri, directoryUri));
    }

    // Materialise the path segments once so they can be counted and indexed.
    List<String> segments = Lists.newArrayList(Splitter.on('/').split(relativizedUri.getPath()));

    PartitionStrategy partitionStrategy = dataset.getDescriptor().getPartitionStrategy();
    List<FieldPartitioner> fieldPartitioners = partitionStrategy.getFieldPartitioners();
    int segmentCount = segments.size();
    if (segmentCount > fieldPartitioners.size()) {
      throw new IllegalArgumentException(
          String.format(
              "Too many partition directories for %s (%s), expecting %s.",
              partitionUri, segmentCount, fieldPartitioners.size()));
    }

    Splitter nameValueSplitter = Splitter.on('=');
    Object[] values = new Object[segmentCount];
    for (int index = 0; index < segmentCount; index++) {
      List<String> nameAndValue = Lists.newArrayList(nameValueSplitter.split(segments.get(index)));
      String fieldName = nameAndValue.get(0);
      FieldPartitioner fp = fieldPartitioners.get(index);

      // The segment name must match the partitioner at the same position.
      if (!fieldName.equals(fp.getName())) {
        throw new IllegalArgumentException(
            String.format(
                "Unrecognized partition name '%s' in partition %s, expecting '%s'.",
                fieldName, partitionUri, fp.getName()));
      }
      if (nameAndValue.size() < 2) {
        throw new IllegalArgumentException(
            String.format(
                "Missing partition value for '%s' in partition %s.", fieldName, partitionUri));
      }

      // Only the text between the first and second '=' is used as the value.
      values[index] = fp.valueFromString(nameAndValue.get(1));
    }

    return org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey(values);
  }
Example #5
0
 /**
  * Return the cardinality produced by the contained field partitioners.
  *
  * <p>This can be used to aid in calculating resource usage during certain operations. For
  * example, when writing data to a partitioned dataset, you can use this method to estimate (or
  * discover exactly, depending on the partition functions) how many leaf partitions exist.
  *
  * <p><strong>Warning:</strong> This method is allowed to lie and should be treated only as a
  * hint. Some partition functions are fixed (for example, hash modulo number of buckets), while
  * others are open-ended (for example, discrete value) and depend on the input data.
  *
  * <p>The result is the product of the individual cardinalities. If that product does not fit in
  * an {@code int} it is capped at {@link Integer#MAX_VALUE} instead of wrapping around.
  *
  * @return The estimated (or possibly concrete) number of leaf partitions, or {@link
  *     FieldPartitioner#UNKNOWN_CARDINALITY} if any field partitioner reports an unknown
  *     cardinality.
  */
 public int getCardinality() {
   // Accumulate in a long: both operands stay within int range, so the
   // multiplication below can never overflow before it is clamped.
   long product = 1L;
   for (FieldPartitioner fieldPartitioner : fieldPartitioners) {
     int fieldCardinality = fieldPartitioner.getCardinality();
     if (fieldCardinality == FieldPartitioner.UNKNOWN_CARDINALITY) {
       return FieldPartitioner.UNKNOWN_CARDINALITY;
     }
     // Saturate rather than wrap; a wrapped value would be meaningless and
     // could even collide with the UNKNOWN_CARDINALITY sentinel.
     long next = product * fieldCardinality;
     product = Math.max((long) Integer.MIN_VALUE, Math.min((long) Integer.MAX_VALUE, next));
   }
   return (int) product;
 }