Beispiel #1
0
  /**
   * Deletes every view or dataset listed in {@code targets}.
   *
   * <p>Each target is handled according to its form: a view URI is loaded and emptied with
   * {@code deleteAll()}, a dataset URI is removed through {@code Datasets.delete}, and anything
   * else is treated as a dataset name in {@code namespace} and removed through the repository.
   *
   * @return {@code 0} once all targets have been processed
   * @throws IllegalArgumentException if no targets were given, or if a loaded view's URI does not
   *     match the requested view URI
   * @throws IOException declared by the command contract
   */
  @Override
  public int run() throws IOException {
    // Validate the arguments before touching any repository.
    if (targets == null || targets.isEmpty()) {
      throw new IllegalArgumentException("No views or datasets were specified.");
    }

    // Resolved on first use: only plain dataset names need the repository, so URI-only
    // invocations never trigger the lookup.
    DatasetRepository repo = null;

    for (String uriOrName : targets) {
      if (isViewUri(uriOrName)) {
        View view = Datasets.load(uriOrName);
        // Guard against deleting data from a view other than the one requested.
        Preconditions.checkArgument(
            viewMatches(view.getUri(), uriOrName),
            "Resolved view does not match requested view: " + view.getUri());
        view.deleteAll();
      } else if (isDatasetUri(uriOrName)) {
        Datasets.delete(uriOrName);
      } else {
        if (repo == null) {
          repo = getDatasetRepository();
        }
        repo.delete(namespace, uriOrName);
      }
      console.debug("Deleted {}", uriOrName);
    }

    return 0;
  }
  /**
   * Verifies that a partition whose value contains an escaped slash ({@code %2F}) is resolved
   * correctly, both when the partition location is given as a {@link URI} and when it is given
   * as a {@code Path}.
   */
  @Test
  public void testEscapedURIs() {
    final String datasetUri = "dataset:file:/tmp/datasets/string_partitioned";
    final String partitionLocation = "file:/tmp/datasets/string_partitioned/d_copy=test%2F-0";
    final URI expectedLocation = URI.create(partitionLocation);
    final URI expectedRelativeLocation = URI.create("d_copy=test%2F-0");

    // start from a clean slate in case an earlier run left the dataset behind
    Datasets.delete(datasetUri);

    try {
      // build a new dataset with a string partition field
      DatasetDescriptor descriptor =
          new DatasetDescriptor.Builder()
              .partitionStrategy(
                  new PartitionStrategy.Builder().identity("data", "d_copy").build())
              .schema(TestRecord.class)
              .build();

      FileSystemDataset<TestRecord> d =
          Datasets.create(datasetUri, descriptor, TestRecord.class);

      writeTestRecords(d);

      // partition addressed by URI
      FileSystemPartitionView<TestRecord> partition =
          d.getPartitionView(URI.create(partitionLocation));
      Assert.assertEquals(
          "Should accept escaped full URI", expectedLocation, partition.getLocation());
      Assert.assertEquals(
          "Should have correctly escaped relative URI",
          expectedRelativeLocation,
          partition.getRelativeLocation());
      Assert.assertEquals(
          "Should have correctly escaped constraints",
          d.unbounded.getConstraints().with("d_copy", "test/-0"),
          partition.getConstraints());

      // partition addressed by Path
      partition = d.getPartitionView(new Path(partitionLocation));
      Assert.assertEquals(
          "Should accept escaped full Path", expectedLocation, partition.getLocation());
      Assert.assertEquals(
          "Should have correctly escaped relative URI",
          expectedRelativeLocation,
          partition.getRelativeLocation());
      Assert.assertEquals(
          "Should have correctly escaped constraints",
          d.unbounded.getConstraints().with("d_copy", "test/-0"),
          partition.getConstraints());
    } finally {
      // always remove the dataset, even when an assertion above fails
      Datasets.delete(datasetUri);
    }
  }
Beispiel #3
0
  /**
   * Loads the {@link Dataset} or {@link View} identified by a {@link URI}.
   *
   * <p>The URI scheme must be {@code dataset:} or {@code view:}; everything after the scheme is
   * interpreted by the dataset implementation registered for it.
   *
   * <p>A dataset URI yields the unfiltered dataset. A view URI yields a {@code View} built from
   * the loaded dataset and the options carried by the URI.
   *
   * @param uri a {@code Dataset} or {@code View} URI
   * @param type a Java class that represents an entity in the dataset
   * @param <E> the entity type used by readers and writers of the result
   * @param <V> the type of {@code View} expected
   * @return a {@code View} for the given URI
   * @throws DatasetNotFoundException if there is no dataset for the given URI
   * @throws NullPointerException if any arguments are {@code null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
   */
  @SuppressWarnings("unchecked")
  public static <E, V extends View<E>> V load(URI uri, Class<E> type) {
    final String scheme = uri.getScheme();
    final boolean viewRequested = URIBuilder.VIEW_SCHEME.equals(scheme);
    Preconditions.checkArgument(
        viewRequested || URIBuilder.DATASET_SCHEME.equals(scheme),
        "Not a dataset or view URI: " + uri);
    Preconditions.checkNotNull(
        type,
        "The entity type can't be null, use Object.class to have the type"
            + " determined by the schema.");

    // Strip the dataset:/view: scheme and resolve the remainder to a repository plus options.
    URI repositoryUri = URI.create(uri.getRawSchemeSpecificPart());
    Pair<DatasetRepository, Map<String, String>> match =
        Registration.lookupDatasetUri(repositoryUri);
    DatasetRepository repository = match.first();
    Map<String, String> options = match.second();

    String namespace = options.get(URIBuilder.NAMESPACE_OPTION);
    String name = options.get(URIBuilder.DATASET_NAME_OPTION);
    Dataset<E> loaded = repository.load(namespace, name, type);

    if (!viewRequested) {
      // a plain dataset URI: hand back the dataset itself
      return (V) loaded;
    }
    return Datasets.<E, V>view(loaded, options);
  }
  /**
   * Deletes the {@code users} dataset from Hive.
   *
   * @param args command-line arguments (not used)
   * @return {@code 0} if {@code Datasets.delete} reported success, {@code 1} otherwise
   */
  @Override
  public int run(String[] args) throws Exception {
    // Delete the users dataset
    if (Datasets.delete("dataset:hive?dataset=users")) {
      return 0;
    }
    return 1;
  }
Beispiel #5
0
  /**
   * Creates a new, empty {@link Dataset} for a dataset or view URI. Use a {@code DatasetWriter}
   * to populate it afterwards.
   *
   * <p>The URI scheme must be {@code dataset:} or {@code view:}; everything after the scheme is
   * interpreted by the dataset implementation registered for it. For a view URI the underlying
   * dataset is created and a view of it is returned.
   *
   * <p>When the descriptor has no location and the URI carries a {@code location} option, the
   * descriptor is rebuilt with that location before the dataset is created.
   *
   * @param uri a {@code Dataset} or {@code View} URI
   * @param descriptor the descriptor for the dataset to create
   * @param type a Java class that represents an entity in the dataset
   * @param <E> the entity type used by readers and writers of the result
   * @param <V> the type of {@code Dataset} or {@code View} expected
   * @return a newly created {@code Dataset} responsible for the given URI
   * @throws NullPointerException if {@code uri}, {@code descriptor}, or {@code type} is {@code
   *     null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
   * @throws DatasetExistsException if a {@code Dataset} for the given URI already exists
   * @throws IncompatibleSchemaException if the schema is not compatible with existing datasets
   *     with shared storage (for example, in the same HBase table)
   */
  @SuppressWarnings("unchecked")
  public static <E, V extends View<E>> V create(
      URI uri, DatasetDescriptor descriptor, Class<E> type) {
    final String scheme = uri.getScheme();
    final boolean viewRequested = URIBuilder.VIEW_SCHEME.equals(scheme);
    Preconditions.checkArgument(
        viewRequested || URIBuilder.DATASET_SCHEME.equals(scheme),
        "Not a dataset or view URI: " + uri);
    Preconditions.checkNotNull(
        type,
        "The entity type can't be null, use Object.class to have the type"
            + " determined by the schema.");

    // Strip the dataset:/view: scheme and resolve the remainder to a repository plus options.
    URI repositoryUri = URI.create(uri.getRawSchemeSpecificPart());
    Pair<DatasetRepository, Map<String, String>> match =
        Registration.lookupDatasetUri(repositoryUri);
    DatasetRepository repository = match.first();
    Map<String, String> options = match.second();

    // A location given in the URI only applies when the descriptor does not set one itself.
    DatasetDescriptor effective = descriptor;
    if (effective.getLocation() == null && options.containsKey("location")) {
      effective =
          new DatasetDescriptor.Builder(effective).location(options.get("location")).build();
    }

    String namespace = options.get(URIBuilder.NAMESPACE_OPTION);
    String name = options.get(URIBuilder.DATASET_NAME_OPTION);
    Dataset<E> created = repository.create(namespace, name, effective, type);

    if (!viewRequested) {
      return (V) created;
    }
    return Datasets.<E, V>view(created, options);
  }
Beispiel #6
0
 /**
  * Adds partition headers for {@code message} on top of the headers written by the superclass.
  *
  * <p>On the first call the target dataset is loaded to pick up its partition strategy (when the
  * descriptor is partitioned) and its schema URL (when one is set). {@code initialized} is set in
  * a {@code finally} block, so this lookup is attempted only once, even if it fails.
  *
  * @param hdrs the header map to populate
  * @param schema passed through to the superclass
  * @param message the entity from which the partition key is computed
  * @throws FlumeException if loading the dataset or reading its descriptor fails
  */
 @Override
 @SuppressWarnings({"unchecked", "deprecation"})
 protected void populateAvroHeaders(Map<String, String> hdrs, Schema schema, Object message) {
   if (!initialized) {
     // initialize here rather than in activateOptions to avoid initialization
     // cycle in Configuration and log4j
     try {
       URI targetUri = new URIBuilder(datasetRepositoryUri, datasetName).build();
       Dataset target = Datasets.load(targetUri);
       if (target.getDescriptor().isPartitioned()) {
         partitionStrategy = target.getDescriptor().getPartitionStrategy();
       }
       URL declaredSchema = target.getDescriptor().getSchemaUrl();
       if (declaredSchema != null) {
         setAvroSchemaUrl(declaredSchema.toExternalForm());
       }
     } catch (Exception cause) {
       throw new FlumeException(cause);
     } finally {
       initialized = true;
     }
   }

   super.populateAvroHeaders(hdrs, schema, message);

   if (partitionStrategy != null) {
     // reuse the key instance between calls
     key = PartitionKey.partitionKeyForEntity(partitionStrategy, message, key);
     int position = 0;
     for (FieldPartitioner partitioner : partitionStrategy.getFieldPartitioners()) {
       String header = PARTITION_PREFIX + partitioner.getName();
       hdrs.put(header, partitioner.valueToString(key.get(position)));
       position++;
     }
   }
 }
Beispiel #7
0
  /**
   * Drops the {@code events} dataset stored under {@code hdfs:/tmp/data/events}.
   *
   * @param args command-line arguments (not used)
   * @return {@code 0} if {@code Datasets.delete} reported success, {@code 1} otherwise
   */
  @Override
  public int run(String[] args) throws Exception {
    // Drop the events dataset
    if (Datasets.delete("dataset:hdfs:/tmp/data/events")) {
      return 0;
    }
    return 1;
  }
  /**
   * Checks that destroying a failed job deletes each temporary dataset URI exactly once.
   */
  @Test
  public void testDestroyForFailedJob() {
    // setup
    // NOTE(review): the second constructor argument (false) presumably marks the job as
    // unsuccessful - confirm against DestroyerContext.
    DestroyerContext context = new DestroyerContext(null, false, null, user);
    // Stub the static lookup so the destroyer sees the expected temporary dataset URIs.
    when(KiteDatasetExecutor.listTemporaryDatasetUris(toJobConfig.toJobConfig.uri))
        .thenReturn(expectedUris);
    // Stub the static delete to report success for every expected URI.
    for (String uri : expectedUris) {
      when(Datasets.delete(uri)).thenReturn(true);
    }

    // exercise
    destroyer.destroy(context, linkConfig, toJobConfig);

    // verify
    for (String uri : expectedUris) {
      // verifyStatic must immediately precede the static call it verifies; the call below is
      // the verification target, not a real delete (presumably the PowerMock idiom - confirm).
      verifyStatic(times(1));
      Datasets.delete(uri);
    }
  }
  /**
   * Creates the {@code events} dataset in Hive using the Avro schema in {@code event.avsc}.
   *
   * @param args command-line arguments (not used)
   * @return always {@code 0}
   */
  @Override
  public int run(String[] args) throws Exception {
    // Create a dataset of events with the Avro schema
    DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder();
    DatasetDescriptor eventsDescriptor = builder.schemaUri("resource:event.avsc").build();

    Datasets.create("dataset:hive:/tmp/data/default/events", eventsDescriptor);
    return 0;
  }
  /**
   * Recreates the unpartitioned and the hash-partitioned test datasets and fills both with the
   * test records.
   */
  @Before
  public void createTestDatasets() {
    final String unpartitionedUri = "dataset:file:/tmp/datasets/unpartitioned";
    final String partitionedUri = "dataset:file:/tmp/datasets/partitioned";

    // remove anything left over from a previous run
    Datasets.delete(unpartitionedUri);
    Datasets.delete(partitionedUri);

    DatasetDescriptor plain = new DatasetDescriptor.Builder().schema(TestRecord.class).build();
    unpartitioned = Datasets.create(unpartitionedUri, plain, TestRecord.class);

    // same schema, but hashed on "id" into 4 buckets
    PartitionStrategy byIdHash = new PartitionStrategy.Builder().hash("id", 4).build();
    DatasetDescriptor hashed =
        new DatasetDescriptor.Builder(plain).partitionStrategy(byIdHash).build();
    partitioned = Datasets.create(partitionedUri, hashed, TestRecord.class);

    writeTestRecords(unpartitioned);
    writeTestRecords(partitioned);
  }
  /**
   * Updates the descriptor of a single dataset.
   *
   * <p>The new descriptor starts from the dataset's current descriptor and is then overridden by
   * the schema file, the partition strategy file and the properties given to the command, where
   * present. A dataset or view URI is updated through {@code Datasets.update}; any other value is
   * treated as a dataset name in {@code namespace} and updated through the repository.
   *
   * @return {@code 0} once the dataset has been updated
   * @throws IllegalArgumentException unless exactly one dataset was specified
   * @throws IOException declared by the command contract
   */
  @Override
  public int run() throws IOException {
    final boolean exactlyOne = datasets != null && datasets.size() == 1;
    if (!exactlyOne) {
      throw new IllegalArgumentException("Exactly one dataset name must be specified.");
    }

    String uriOrName = datasets.remove(0);
    Dataset<GenericRecord> existing = load(uriOrName).getDataset();

    // start from the current descriptor so unspecified settings are preserved
    DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder(existing.getDescriptor());

    if (avroSchemaFile != null) {
      builder.schemaUri(qualifiedURI(avroSchemaFile));
    }

    if (partitionStrategyFile != null) {
      builder.partitionStrategyUri(qualifiedURI(partitionStrategyFile));
    }

    if (properties != null) {
      for (String entry : properties) {
        // first piece is the property name, second its value; either is null when missing
        Iterator<String> pieces = PROP_VALUE_SEP.split(entry).iterator();
        String propertyName = Iterators.getNext(pieces, null);
        String propertyValue = Iterators.getNext(pieces, null);
        builder.property(propertyName, propertyValue);
      }
    }

    DatasetDescriptor updated = builder.build();

    if (!isDatasetOrViewUri(uriOrName)) {
      getDatasetRepository().update(namespace, uriOrName, updated);
    } else {
      Datasets.<GenericData.Record, Dataset<GenericData.Record>>update(
          uriOrName, updated, GenericData.Record.class);
    }

    console.debug("Updated {}", uriOrName);

    return 0;
  }
Beispiel #12
0
  /**
   * Runs a pipeline that reads from a source view URI and appends to a target view URI, then
   * checks that exactly one record arrived in the output dataset.
   */
  @Test
  public void testViewUris() throws IOException {
    PartitionStrategy byUsernameHash =
        new PartitionStrategy.Builder().hash("username", 2).build();

    DatasetDescriptor inputDescriptor =
        new DatasetDescriptor.Builder()
            .schema(USER_SCHEMA)
            .partitionStrategy(byUsernameHash)
            .build();
    Dataset<Record> source = repo.create("ns", "in", inputDescriptor);

    DatasetDescriptor outputDescriptor =
        new DatasetDescriptor.Builder()
            .schema(USER_SCHEMA)
            .partitionStrategy(byUsernameHash)
            .build();
    Dataset<Record> sink = repo.create("ns", "out", outputDescriptor);

    writeTestUsers(source, 10);

    // the source view selects a single user
    URI sourceViewUri =
        new URIBuilder(repo.getUri(), "ns", "in").with("username", "test-0").build();
    View<Record> sourceView = Datasets.<Record, Dataset<Record>>load(sourceViewUri, Record.class);
    Assert.assertEquals(1, datasetSize(sourceView));

    Pipeline mr = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> records =
        mr.read(CrunchDatasets.asSource(sourceViewUri, GenericData.Record.class));
    URI targetViewUri = new URIBuilder(repo.getUri(), "ns", "out").with("email", "email-0").build();
    mr.write(records, CrunchDatasets.asTarget(targetViewUri), Target.WriteMode.APPEND);
    mr.run();

    Assert.assertEquals(1, datasetSize(sink));
  }
Beispiel #13
0
 /**
  * Creates a new, empty {@link Dataset} for a dataset or view URI given as a string. Use a
  * {@code DatasetWriter} to populate it afterwards.
  *
  * <p>The string is parsed with {@link URI#create(String)} and the call is forwarded to the
  * {@code URI}-based overload. The URI must begin with {@code dataset:} or {@code view:}; the
  * remainder is implementation specific, depending on the dataset scheme. For a view URI the
  * underlying dataset is created and a view of it is returned.
  *
  * @param uri a {@code Dataset} or {@code View} URI string
  * @param descriptor the descriptor for the dataset to create
  * @param type a Java class that represents an entity in the dataset
  * @param <E> the entity type used by readers and writers of the result
  * @param <V> the type of {@code Dataset} or {@code View} expected
  * @return a newly created {@code Dataset} responsible for the given URI
  * @throws NullPointerException if {@code uri}, {@code descriptor}, or {@code type} is {@code
  *     null}
  * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
  * @throws DatasetExistsException if a {@code Dataset} for the given URI already exists
  * @throws IncompatibleSchemaException if the schema is not compatible with existing datasets
  *     with shared storage (for example, in the same HBase table)
  */
 public static <E, V extends View<E>> V create(
     String uri, DatasetDescriptor descriptor, Class<E> type) {
   final URI parsed = URI.create(uri);
   return Datasets.<E, V>create(parsed, descriptor, type);
 }
Beispiel #14
0
 /**
  * Loads the {@link Dataset} or {@link View} identified by a {@link URI}, using
  * {@code GenericRecord} as the entity type.
  *
  * <p>The URI must begin with {@code dataset:} or {@code view:}; the remainder is implementation
  * specific, depending on the dataset scheme. A dataset URI yields the unfiltered dataset, a
  * view URI yields a {@code View} configured to read a subset of the dataset.
  *
  * @param uri a {@code Dataset} or {@code View} URI
  * @param <V> the type of {@code View} expected
  * @return a {@code View} for the given URI
  * @throws DatasetNotFoundException if there is no dataset for the given URI
  * @throws NullPointerException if any arguments are {@code null}
  * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
  */
 @SuppressWarnings("unchecked")
 public static <V extends View<GenericRecord>> V load(URI uri) {
   final Class<GenericRecord> entityType = GenericRecord.class;
   return Datasets.<GenericRecord, V>load(uri, entityType);
 }
 /** Deletes the unpartitioned and partitioned test datasets after each test. */
 @After
 public void removeTestDatasets() {
   final String[] testDatasetUris = {
     "dataset:file:/tmp/datasets/unpartitioned", "dataset:file:/tmp/datasets/partitioned"
   };
   for (String datasetUri : testDatasetUris) {
     Datasets.delete(datasetUri);
   }
 }
Beispiel #16
0
 /**
  * Loads the {@link Dataset} or {@link View} identified by a URI given as a string.
  *
  * <p>The string is parsed with {@link URI#create(String)} and the call is forwarded to the
  * {@code URI}-based overload. The URI must begin with {@code dataset:} or {@code view:}; the
  * remainder is implementation specific, depending on the dataset scheme. A dataset URI yields
  * the unfiltered dataset, a view URI yields a {@code View} configured to read a subset of the
  * dataset.
  *
  * @param uriString a {@code Dataset} or {@code View} URI string
  * @param type a Java class that represents an entity in the dataset
  * @param <E> the entity type used by readers and writers of the result
  * @param <V> the type of {@code View} expected
  * @return a {@code View} for the given URI
  * @throws DatasetNotFoundException if there is no dataset for the given URI
  * @throws NullPointerException if any arguments are {@code null}
  * @throws IllegalArgumentException if {@code uriString} is not a dataset or view URI
  */
 public static <E, V extends View<E>> V load(String uriString, Class<E> type) {
   final URI parsed = URI.create(uriString);
   return Datasets.<E, V>load(parsed, type);
 }
Beispiel #17
0
 /**
  * Updates the descriptor of the {@link Dataset} identified by a dataset URI string, using
  * {@code GenericRecord} as the entity type.
  *
  * <p>Columns may be added, removed, or have their data type changed, as long as the change is
  * compatible with data that has already been written. Avro defines the rules for compatible
  * schema evolution; see <a
  * href="http://kitesdk.org/docs/current/Schema-Evolution.html">Schema Evolution</a>.
  *
  * <p>Because the whole descriptor is replaced, properties can be added or changed as well. The
  * recommended approach is to derive the new descriptor from the existing one with
  * {@link DatasetDescriptor.Builder#Builder(DatasetDescriptor)}.
  *
  * <p>A dataset's format and partition strategy cannot be changed.
  *
  * <p>The URI must begin with {@code dataset:}; the remainder is implementation specific,
  * depending on the dataset scheme.
  *
  * @param uri a {@code Dataset} URI string
  * @param descriptor the descriptor to apply to the dataset
  * @param <D> the type of {@code Dataset} expected
  * @return a {@code Dataset} for the given URI
  * @throws NullPointerException if {@code uri} or {@code descriptor} is {@code null}
  * @throws IllegalArgumentException if {@code uri} is not a dataset URI
  * @throws DatasetNotFoundException if there is no dataset for the given URI
  * @throws UnsupportedOperationException if descriptor updates are not supported by the
  *     implementation
  * @throws ConcurrentSchemaModificationException if the {@code Dataset} schema is updated
  *     concurrently
  * @throws IncompatibleSchemaException if the schema is not compatible with previous schemas, or
  *     with existing datasets with shared storage (for example, in the same HBase table)
  */
 public static <D extends Dataset<GenericRecord>> D update(
     String uri, DatasetDescriptor descriptor) {
   final Class<GenericRecord> entityType = GenericRecord.class;
   return Datasets.<GenericRecord, D>update(uri, descriptor, entityType);
 }
Beispiel #18
0
 /**
  * Updates the descriptor of the {@link Dataset} identified by a dataset URI string.
  *
  * <p>The string is parsed with {@link URI#create(String)} and the call is forwarded to the
  * {@code URI}-based overload.
  *
  * <p>Columns may be added, removed, or have their data type changed, as long as the change is
  * compatible with data that has already been written. Avro defines the rules for compatible
  * schema evolution; see <a
  * href="http://kitesdk.org/docs/current/Schema-Evolution.html">Schema Evolution</a>.
  *
  * <p>Because the whole descriptor is replaced, properties can be added or changed as well. The
  * recommended approach is to derive the new descriptor from the existing one with
  * {@link DatasetDescriptor.Builder#Builder(DatasetDescriptor)}.
  *
  * <p>A dataset's format and partition strategy cannot be changed.
  *
  * <p>The URI must begin with {@code dataset:}; the remainder is implementation specific,
  * depending on the dataset scheme.
  *
  * @param uri a {@code Dataset} URI string
  * @param descriptor the descriptor to apply to the dataset
  * @param type a Java class that represents an entity in the dataset
  * @param <E> the entity type used by readers and writers of the result
  * @param <D> the type of {@code Dataset} expected
  * @return a {@code Dataset} for the given URI
  * @throws NullPointerException if {@code uri}, {@code descriptor}, or {@code type} is {@code
  *     null}
  * @throws IllegalArgumentException if {@code uri} is not a dataset URI
  * @throws DatasetNotFoundException if there is no dataset for the given URI
  * @throws UnsupportedOperationException if descriptor updates are not supported by the
  *     implementation
  * @throws ConcurrentSchemaModificationException if the {@code Dataset} schema is updated
  *     concurrently
  * @throws IncompatibleSchemaException if the schema is not compatible with previous schemas, or
  *     with existing datasets with shared storage (for example, in the same HBase table)
  */
 public static <E, D extends Dataset<E>> D update(
     String uri, DatasetDescriptor descriptor, Class<E> type) {
   final URI parsed = URI.create(uri);
   return Datasets.<E, D>update(parsed, descriptor, type);
 }
Beispiel #19
0
 /**
  * Updates the descriptor of the {@link Dataset} identified by a dataset URI, using
  * {@code GenericRecord} as the entity type.
  *
  * <p>Columns may be added, removed, or have their data type changed, as long as the change is
  * compatible with data that has already been written. Avro defines the rules for compatible
  * schema evolution; see <a
  * href="http://kitesdk.org/docs/current/Schema-Evolution.html">Schema Evolution</a>.
  *
  * <p>Because the whole descriptor is replaced, properties can be added or changed as well. The
  * recommended approach is to derive the new descriptor from the existing one with
  * {@link DatasetDescriptor.Builder#Builder(DatasetDescriptor)}.
  *
  * <p>A dataset's format and partition strategy cannot be changed.
  *
  * <p>The URI must begin with {@code dataset:}; the remainder is implementation specific,
  * depending on the dataset scheme.
  *
  * @param uri a {@code Dataset} URI
  * @param descriptor the descriptor to apply to the dataset
  * @param <D> the type of {@code Dataset} expected
  * @return a {@code Dataset} for the given URI
  * @throws NullPointerException if {@code uri} or {@code descriptor} is {@code null}
  * @throws IllegalArgumentException if {@code uri} is not a dataset URI
  * @throws DatasetNotFoundException if there is no dataset for the given URI
  * @throws UnsupportedOperationException if descriptor updates are not supported by the
  *     implementation
  * @throws ConcurrentSchemaModificationException if the {@code Dataset} schema is updated
  *     concurrently
  * @throws IncompatibleSchemaException if the schema is not compatible with previous schemas, or
  *     with existing datasets with shared storage (for example, in the same HBase table)
  */
 @SuppressWarnings("unchecked")
 public static <D extends Dataset<GenericRecord>> D update(URI uri, DatasetDescriptor descriptor) {
   final Class<GenericRecord> entityType = GenericRecord.class;
   return Datasets.<GenericRecord, D>update(uri, descriptor, entityType);
 }
Beispiel #20
0
 /**
  * Creates a new, empty {@link Dataset} for a dataset or view URI given as a string, using
  * {@code GenericRecord} as the entity type. Use a {@code DatasetWriter} to populate it
  * afterwards.
  *
  * <p>The URI must begin with {@code dataset:} or {@code view:}; the remainder is implementation
  * specific, depending on the dataset scheme. For a view URI the underlying dataset is created
  * and a view of it is returned.
  *
  * @param uri a {@code Dataset} or {@code View} URI string
  * @param descriptor the descriptor for the dataset to create
  * @param <V> the type of {@code Dataset} or {@code View} expected
  * @return a newly created {@code Dataset} responsible for the given URI
  * @throws NullPointerException if {@code uri} or {@code descriptor} is {@code null}
  * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
  * @throws DatasetExistsException if a {@code Dataset} for the given URI already exists
  * @throws IncompatibleSchemaException if the schema is not compatible with existing datasets
  *     with shared storage (for example, in the same HBase table)
  */
 @SuppressWarnings("unchecked")
 public static <V extends View<GenericRecord>> V create(String uri, DatasetDescriptor descriptor) {
   final Class<GenericRecord> entityType = GenericRecord.class;
   return Datasets.<GenericRecord, V>create(uri, descriptor, entityType);
 }
Beispiel #21
0
 /**
  * Loads the {@link Dataset} or {@link View} identified by a URI given as a string, using
  * {@code GenericRecord} as the entity type.
  *
  * <p>The URI must begin with {@code dataset:} or {@code view:}; the remainder is implementation
  * specific, depending on the dataset scheme. A dataset URI yields the unfiltered dataset, a
  * view URI yields a {@code View} configured to read a subset of the dataset.
  *
  * @param uriString a {@code Dataset} or {@code View} URI string
  * @param <V> the type of {@code View} expected
  * @return a {@code View} for the given URI
  * @throws DatasetNotFoundException if there is no dataset for the given URI
  * @throws NullPointerException if any arguments are {@code null}
  * @throws IllegalArgumentException if {@code uriString} is not a dataset or view URI
  */
 public static <V extends View<GenericRecord>> V load(String uriString) {
   final Class<GenericRecord> entityType = GenericRecord.class;
   return Datasets.<GenericRecord, V>load(uriString, entityType);
 }