Example #1
0
  /**
   * Creates the events dataset at {@code dataset:hive:/tmp/data/default/events}, described by the
   * schema referenced as {@code resource:event.avsc}.
   *
   * @param args command-line arguments; not read by this method
   * @return {@code 0} after the create call returns
   * @throws Exception if building the descriptor or creating the dataset fails
   */
  @Override
  public int run(String[] args) throws Exception {
    final String schemaLocation = "resource:event.avsc";
    final String eventsUri = "dataset:hive:/tmp/data/default/events";

    // The descriptor carries the Avro schema that the new events dataset will use.
    DatasetDescriptor eventsDescriptor =
        new DatasetDescriptor.Builder().schemaUri(schemaLocation).build();

    Datasets.create(eventsUri, eventsDescriptor);
    return 0;
  }
  /**
   * Re-creates the {@code unpartitioned} and {@code partitioned} test datasets under
   * {@code /tmp/datasets} and writes the test records into both.
   */
  @Before
  public void createTestDatasets() {
    final String unpartitionedUri = "dataset:file:/tmp/datasets/unpartitioned";
    final String partitionedUri = "dataset:file:/tmp/datasets/partitioned";

    // Drop anything already stored under either URI so the creates below start clean.
    Datasets.delete(unpartitionedUri);
    Datasets.delete(partitionedUri);

    DatasetDescriptor plainDescriptor =
        new DatasetDescriptor.Builder().schema(TestRecord.class).build();
    unpartitioned = Datasets.create(unpartitionedUri, plainDescriptor, TestRecord.class);

    // Copy the plain descriptor and add a partition strategy built with hash("id", 4).
    PartitionStrategy hashOnId = new PartitionStrategy.Builder().hash("id", 4).build();
    DatasetDescriptor hashedDescriptor =
        new DatasetDescriptor.Builder(plainDescriptor).partitionStrategy(hashOnId).build();
    partitioned = Datasets.create(partitionedUri, hashedDescriptor, TestRecord.class);

    writeTestRecords(unpartitioned);
    writeTestRecords(partitioned);
  }
  /**
   * Checks that the {@code d_copy=test%2F-0} partition of a dataset partitioned on a string field
   * can be looked up by an escaped {@link URI} and by an escaped {@code Path}, and that the
   * returned view reports the escaped locations and the unescaped constraint value
   * {@code test/-0}.
   */
  @Test
  public void testEscapedURIs() {
    final String datasetUri = "dataset:file:/tmp/datasets/string_partitioned";
    final String partitionLocation = "file:/tmp/datasets/string_partitioned/d_copy=test%2F-0";

    // Remove anything left behind under this URI before creating the dataset.
    Datasets.delete(datasetUri);

    try {
      // build a new dataset with a string partition field
      DatasetDescriptor descriptor =
          new DatasetDescriptor.Builder()
              .partitionStrategy(new PartitionStrategy.Builder().identity("data", "d_copy").build())
              .schema(TestRecord.class)
              .build();

      FileSystemDataset<TestRecord> d = Datasets.create(datasetUri, descriptor, TestRecord.class);

      writeTestRecords(d);

      // The same escaped location must resolve identically through both lookup overloads.
      assertEscapedPartitionView(d, d.getPartitionView(URI.create(partitionLocation)));
      assertEscapedPartitionView(d, d.getPartitionView(new Path(partitionLocation)));
    } finally {
      // Clean up even when an assertion above fails, so no data is left under /tmp.
      Datasets.delete(datasetUri);
    }
  }

  /**
   * Asserts that {@code partition} reports the escaped full location, the escaped relative
   * location, and the unescaped {@code d_copy} constraint expected for the
   * {@code d_copy=test%2F-0} partition of {@code dataset}.
   *
   * @param dataset the dataset whose unbounded constraints form the baseline for the comparison
   * @param partition the partition view returned by a lookup on {@code dataset}
   */
  private static void assertEscapedPartitionView(
      FileSystemDataset<TestRecord> dataset, FileSystemPartitionView<TestRecord> partition) {
    Assert.assertEquals(
        "Should accept escaped full URI",
        URI.create("file:/tmp/datasets/string_partitioned/d_copy=test%2F-0"),
        partition.getLocation());
    Assert.assertEquals(
        "Should have correctly escaped relative URI",
        URI.create("d_copy=test%2F-0"),
        partition.getRelativeLocation());
    Assert.assertEquals(
        "Should have correctly escaped constraints",
        dataset.unbounded.getConstraints().with("d_copy", "test/-0"),
        partition.getConstraints());
  }
Example #4
0
 /**
  * Creates a new, empty {@link Dataset} of {@code GenericRecord} entities for a dataset or view
  * URI given as a string. Populate the result with a {@code DatasetWriter}.
  *
  * <p>The URI must start with {@code dataset:} or {@code view:}; everything after that prefix is
  * specific to the dataset scheme. For a view URI, the underlying dataset is created and a view of
  * it is returned.
  *
  * <p>This overload delegates to the three-argument {@code create}, passing
  * {@code GenericRecord.class} as the entity type.
  *
  * @param uri a {@code Dataset} or {@code View} URI string
  * @param descriptor the {@code DatasetDescriptor} to create the dataset with
  * @param <V> the type of {@code Dataset} or {@code View} expected
  * @return a newly created {@code Dataset} responsible for the given URI
  * @throws NullPointerException if {@code uri} or {@code descriptor} is {@code null}
  * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
  * @throws DatasetExistsException if a {@code Dataset} for the given URI already exists
  * @throws IncompatibleSchemaException if the schema is not compatible with existing datasets with
  *     shared storage (for example, in the same HBase table)
  */
 @SuppressWarnings("unchecked")
 public static <V extends View<GenericRecord>> V create(String uri, DatasetDescriptor descriptor) {
   // No entity class was supplied by the caller, so use GenericRecord.
   final Class<GenericRecord> entityType = GenericRecord.class;
   return Datasets.<GenericRecord, V>create(uri, descriptor, entityType);
 }
Example #5
0
 /**
  * Creates a new, empty {@link Dataset} for a dataset or view URI given as a string, using
  * {@code type} as the entity class. Populate the result with a {@code DatasetWriter}.
  *
  * <p>The URI must start with {@code dataset:} or {@code view:}; everything after that prefix is
  * specific to the dataset scheme. For a view URI, the underlying dataset is created and a view of
  * it is returned.
  *
  * <p>The string is parsed with {@link URI#create(String)} and the call is forwarded to the
  * {@code URI}-based {@code create} overload.
  *
  * @param uri a {@code Dataset} or {@code View} URI string
  * @param descriptor the {@code DatasetDescriptor} to create the dataset with
  * @param type a Java class that represents an entity in the dataset
  * @param <E> the type used for readers and writers created by this {@code Dataset}
  * @param <V> the type of {@code Dataset} or {@code View} expected
  * @return a newly created {@code Dataset} responsible for the given URI
  * @throws NullPointerException if {@code uri}, {@code descriptor}, or {@code type} is {@code
  *     null}
  * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
  * @throws DatasetExistsException if a {@code Dataset} for the given URI already exists
  * @throws IncompatibleSchemaException if the schema is not compatible with existing datasets with
  *     shared storage (for example, in the same HBase table)
  */
 public static <E, V extends View<E>> V create(
     String uri, DatasetDescriptor descriptor, Class<E> type) {
   // Parse the string first; the URI overload does the actual creation.
   final URI parsedUri = URI.create(uri);
   return Datasets.<E, V>create(parsedUri, descriptor, type);
 }