@Override public int run(String[] args) throws Exception { // Create a dataset of events with the Avro schema DatasetDescriptor descriptor = new DatasetDescriptor.Builder().schemaUri("resource:event.avsc").build(); Datasets.create("dataset:hive:/tmp/data/default/events", descriptor); return 0; }
/**
 * Rebuilds the two fixture datasets before each test: one without a partition strategy and one
 * hash-partitioned on {@code id} into 4 buckets. Any datasets left at the same URIs are deleted
 * first, and both new datasets are populated through {@code writeTestRecords}.
 */
@Before
public void createTestDatasets() {
  final String unpartitionedUri = "dataset:file:/tmp/datasets/unpartitioned";
  final String partitionedUri = "dataset:file:/tmp/datasets/partitioned";

  // Start from a clean slate in case an earlier run left data behind.
  Datasets.delete(unpartitionedUri);
  Datasets.delete(partitionedUri);

  DatasetDescriptor plainDescriptor =
      new DatasetDescriptor.Builder().schema(TestRecord.class).build();
  unpartitioned = Datasets.create(unpartitionedUri, plainDescriptor, TestRecord.class);

  // The partitioned descriptor is copied from the plain one and only adds the hash strategy.
  PartitionStrategy hashOnId = new PartitionStrategy.Builder().hash("id", 4).build();
  DatasetDescriptor hashedDescriptor =
      new DatasetDescriptor.Builder(plainDescriptor).partitionStrategy(hashOnId).build();
  partitioned = Datasets.create(partitionedUri, hashedDescriptor, TestRecord.class);

  writeTestRecords(unpartitioned);
  writeTestRecords(partitioned);
}
@Test public void testEscapedURIs() { Datasets.delete("dataset:file:/tmp/datasets/string_partitioned"); // build a new dataset with a string partition field DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .partitionStrategy(new PartitionStrategy.Builder().identity("data", "d_copy").build()) .schema(TestRecord.class) .build(); FileSystemDataset<TestRecord> d = Datasets.create( "dataset:file:/tmp/datasets/string_partitioned", descriptor, TestRecord.class); writeTestRecords(d); FileSystemPartitionView<TestRecord> partition = d.getPartitionView(URI.create("file:/tmp/datasets/string_partitioned/d_copy=test%2F-0")); Assert.assertEquals( "Should accept escaped full URI", URI.create("file:/tmp/datasets/string_partitioned/d_copy=test%2F-0"), partition.getLocation()); Assert.assertEquals( "Should should have correctly escaped relative URI", URI.create("d_copy=test%2F-0"), partition.getRelativeLocation()); Assert.assertEquals( "Should have correctly escaped constraints", d.unbounded.getConstraints().with("d_copy", "test/-0"), partition.getConstraints()); partition = d.getPartitionView(new Path("file:/tmp/datasets/string_partitioned/d_copy=test%2F-0")); Assert.assertEquals( "Should accept escaped full URI", URI.create("file:/tmp/datasets/string_partitioned/d_copy=test%2F-0"), partition.getLocation()); Assert.assertEquals( "Should should have correctly escaped relative URI", URI.create("d_copy=test%2F-0"), partition.getRelativeLocation()); Assert.assertEquals( "Should have correctly escaped constraints", d.unbounded.getConstraints().with("d_copy", "test/-0"), partition.getConstraints()); Datasets.delete("dataset:file:/tmp/datasets/string_partitioned"); }
/**
 * Creates a new, empty {@link Dataset} of {@code GenericRecord} entities for a dataset or view
 * URI string. Use a {@code DatasetWriter} to populate the result.
 *
 * <p>The URI must start with {@code dataset:} or {@code view:}; everything after that prefix is
 * interpreted by the implementation for the dataset scheme. For a view URI, the underlying
 * dataset is created and a view of it is returned.
 *
 * <p>This overload delegates to {@code create(String, DatasetDescriptor, Class)} with
 * {@code GenericRecord.class} as the entity type.
 *
 * @param uri a {@code Dataset} or {@code View} URI string
 * @param descriptor the {@code DatasetDescriptor} for the dataset being created
 * @param <V> the type of {@code Dataset} or {@code View} expected
 * @return a newly created {@code Dataset} responsible for the given URI
 * @throws NullPointerException if {@code uri} or {@code descriptor} is {@code null}
 * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
 * @throws DatasetExistsException if a {@code Dataset} for the given URI already exists
 * @throws IncompatibleSchemaException if the schema is not compatible with existing datasets with
 *     shared storage (for example, in the same HBase table)
 */
@SuppressWarnings("unchecked")
public static <V extends View<GenericRecord>> V create(String uri, DatasetDescriptor descriptor) {
  final Class<GenericRecord> entityType = GenericRecord.class;
  return Datasets.<GenericRecord, V>create(uri, descriptor, entityType);
}
/**
 * Creates a new, empty {@link Dataset} for a dataset or view URI string, using the given entity
 * class. Use a {@code DatasetWriter} to populate the result.
 *
 * <p>The URI must start with {@code dataset:} or {@code view:}; everything after that prefix is
 * interpreted by the implementation for the dataset scheme. For a view URI, the underlying
 * dataset is created and a view of it is returned.
 *
 * <p>The string is parsed with {@link URI#create(String)} and the call is forwarded to the
 * {@code URI}-based overload.
 *
 * @param uri a {@code Dataset} or {@code View} URI string
 * @param descriptor the {@code DatasetDescriptor} for the dataset being created
 * @param type a Java class that represents an entity in the dataset
 * @param <E> the type used for readers and writers created by this {@code Dataset}
 * @param <V> the type of {@code Dataset} or {@code View} expected
 * @return a newly created {@code Dataset} responsible for the given URI
 * @throws NullPointerException if {@code uri}, {@code descriptor}, or {@code type} is
 *     {@code null}
 * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
 * @throws DatasetExistsException if a {@code Dataset} for the given URI already exists
 * @throws IncompatibleSchemaException if the schema is not compatible with existing datasets with
 *     shared storage (for example, in the same HBase table)
 */
public static <E, V extends View<E>> V create(
    String uri, DatasetDescriptor descriptor, Class<E> type) {
  final URI parsedUri = URI.create(uri);
  return Datasets.<E, V>create(parsedUri, descriptor, type);
}