@Override
public int run() throws IOException {
  DatasetRepository repo = getDatasetRepository();

  if (targets == null || targets.isEmpty()) {
    throw new IllegalArgumentException("No views or datasets were specified.");
  }

  for (String uriOrName : targets) {
    if (isViewUri(uriOrName)) {
      View view = Datasets.load(uriOrName);
      Preconditions.checkArgument(viewMatches(view.getUri(), uriOrName),
          "Resolved view does not match requested view: " + view.getUri());
      view.deleteAll();
    } else if (isDatasetUri(uriOrName)) {
      Datasets.delete(uriOrName);
    } else {
      repo.delete(namespace, uriOrName);
    }
    console.debug("Deleted {}", uriOrName);
  }

  return 0;
}
@Test
public void testEscapedURIs() {
  Datasets.delete("dataset:file:/tmp/datasets/string_partitioned");

  // build a new dataset with a string partition field
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .partitionStrategy(
          new PartitionStrategy.Builder().identity("data", "d_copy").build())
      .schema(TestRecord.class)
      .build();
  FileSystemDataset<TestRecord> d = Datasets.create(
      "dataset:file:/tmp/datasets/string_partitioned", descriptor, TestRecord.class);

  writeTestRecords(d);

  FileSystemPartitionView<TestRecord> partition = d.getPartitionView(
      URI.create("file:/tmp/datasets/string_partitioned/d_copy=test%2F-0"));
  Assert.assertEquals("Should accept escaped full URI",
      URI.create("file:/tmp/datasets/string_partitioned/d_copy=test%2F-0"),
      partition.getLocation());
  Assert.assertEquals("Should have correctly escaped relative URI",
      URI.create("d_copy=test%2F-0"),
      partition.getRelativeLocation());
  Assert.assertEquals("Should have correctly escaped constraints",
      d.unbounded.getConstraints().with("d_copy", "test/-0"),
      partition.getConstraints());

  partition = d.getPartitionView(
      new Path("file:/tmp/datasets/string_partitioned/d_copy=test%2F-0"));
  Assert.assertEquals("Should accept escaped full URI",
      URI.create("file:/tmp/datasets/string_partitioned/d_copy=test%2F-0"),
      partition.getLocation());
  Assert.assertEquals("Should have correctly escaped relative URI",
      URI.create("d_copy=test%2F-0"),
      partition.getRelativeLocation());
  Assert.assertEquals("Should have correctly escaped constraints",
      d.unbounded.getConstraints().with("d_copy", "test/-0"),
      partition.getConstraints());

  Datasets.delete("dataset:file:/tmp/datasets/string_partitioned");
}
/**
 * Load a {@link Dataset} or {@link View} for the given {@link URI}.
 *
 * <p>URIs must begin with {@code dataset:} or {@code view:}. The remainder
 * of the URI is implementation specific, depending on the dataset scheme.
 *
 * <p>If you use a dataset URI, {@code load} returns the unfiltered dataset.
 * If you use a view URI, {@code load} returns a {@code View} configured to
 * read a subset of the dataset.
 *
 * @param uri a {@code Dataset} or {@code View} URI
 * @param type a Java class that represents an entity in the dataset
 * @param <E> the type used for readers and writers created by this
 *          {@code Dataset}
 * @param <V> the type of {@code View} expected
 * @return a {@code View} for the given URI
 * @throws DatasetNotFoundException if there is no dataset for the given URI
 * @throws NullPointerException if any arguments are {@code null}
 * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
 */
@SuppressWarnings("unchecked")
public static <E, V extends View<E>> V load(URI uri, Class<E> type) {
  boolean isView = URIBuilder.VIEW_SCHEME.equals(uri.getScheme());
  Preconditions.checkArgument(isView ||
      URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
      "Not a dataset or view URI: " + uri);
  Preconditions.checkNotNull(type,
      "The entity type can't be null, use Object.class to have the type"
      + " determined by the schema.");

  Pair<DatasetRepository, Map<String, String>> pair =
      Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
  DatasetRepository repo = pair.first();
  Map<String, String> uriOptions = pair.second();

  Dataset<E> dataset = repo.load(
      uriOptions.get(URIBuilder.NAMESPACE_OPTION),
      uriOptions.get(URIBuilder.DATASET_NAME_OPTION),
      type);

  if (isView) {
    return Datasets.<E, V>view(dataset, uriOptions);
  } else {
    // if the URI isn't a view URI, only load the dataset
    return (V) dataset;
  }
}
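// Usage sketch for load(URI, Class), not taken from the sources above: the
// file paths reuse the partitioned test dataset created in
// createTestDatasets(), and the "id" constraint value is illustrative. A
// dataset URI loads the whole dataset; a view URI with a query-option
// constraint loads a filtered View of the same data.
Dataset<TestRecord> all = Datasets.<TestRecord, Dataset<TestRecord>>load(
    URI.create("dataset:file:/tmp/datasets/partitioned"), TestRecord.class);
View<TestRecord> subset = Datasets.<TestRecord, View<TestRecord>>load(
    URI.create("view:file:/tmp/datasets/partitioned?id=3"), TestRecord.class);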
@Override
public int run(String[] args) throws Exception {
  // Delete the users dataset
  boolean success = Datasets.delete("dataset:hive?dataset=users");
  return success ? 0 : 1;
}
/**
 * Create a {@link Dataset} for the given dataset or view URI.
 * {@code create} returns an empty dataset. You can use {@code DatasetWriter}
 * to populate your dataset.
 *
 * <p>URIs must begin with {@code dataset:} or {@code view:}. The remainder
 * of the URI is implementation specific, depending on the dataset scheme.
 * If the URI is a view URI, this method creates the underlying dataset and
 * returns a view of it.
 *
 * @param uri a {@code Dataset} or {@code View} URI
 * @param type a Java class that represents an entity in the dataset
 * @param <E> the type used for readers and writers created by this
 *          {@code Dataset}
 * @param <V> the type of {@code Dataset} or {@code View} expected
 * @return a newly created {@code Dataset} responsible for the given URI
 * @throws NullPointerException if {@code uri}, {@code descriptor}, or
 *           {@code type} is {@code null}
 * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
 * @throws DatasetExistsException if a {@code Dataset} for the given URI
 *           already exists
 * @throws IncompatibleSchemaException if the schema is not compatible with
 *           existing datasets with shared storage (for example, in the same
 *           HBase table)
 */
@SuppressWarnings("unchecked")
public static <E, V extends View<E>> V create(
    URI uri, DatasetDescriptor descriptor, Class<E> type) {
  boolean isView = URIBuilder.VIEW_SCHEME.equals(uri.getScheme());
  Preconditions.checkArgument(isView ||
      URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
      "Not a dataset or view URI: " + uri);
  Preconditions.checkNotNull(type,
      "The entity type can't be null, use Object.class to have the type"
      + " determined by the schema.");

  Pair<DatasetRepository, Map<String, String>> pair =
      Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
  DatasetRepository repo = pair.first();
  Map<String, String> uriOptions = pair.second();

  if (descriptor.getLocation() == null && uriOptions.containsKey("location")) {
    descriptor = new DatasetDescriptor.Builder(descriptor)
        .location(uriOptions.get("location"))
        .build();
  }

  Dataset<E> dataset = repo.create(
      uriOptions.get(URIBuilder.NAMESPACE_OPTION),
      uriOptions.get(URIBuilder.DATASET_NAME_OPTION),
      descriptor,
      type);

  if (isView) {
    return Datasets.<E, V>view(dataset, uriOptions);
  } else {
    return (V) dataset;
  }
}
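// Hedged usage sketch for create(URI, DatasetDescriptor, Class): the HDFS
// path and the event.avsc schema resource mirror the run() example below,
// but are illustrative. Creates an empty dataset, then writes one record.
DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schemaUri("resource:event.avsc")
    .build();
Dataset<GenericRecord> events = Datasets.<GenericRecord, Dataset<GenericRecord>>create(
    URI.create("dataset:hdfs:/tmp/data/events"), descriptor, GenericRecord.class);
DatasetWriter<GenericRecord> writer = events.newWriter();
try {
  GenericRecord event = new GenericData.Record(descriptor.getSchema());
  // ... populate the fields required by event.avsc ...
  writer.write(event);
} finally {
  writer.close();
}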
@Override
@SuppressWarnings({"unchecked", "deprecation"})
protected void populateAvroHeaders(Map<String, String> hdrs, Schema schema, Object message) {
  if (!initialized) {
    // initialize here rather than in activateOptions to avoid initialization
    // cycle in Configuration and log4j
    try {
      URI datasetUri = new URIBuilder(datasetRepositoryUri, datasetName).build();
      Dataset dataset = Datasets.load(datasetUri);
      if (dataset.getDescriptor().isPartitioned()) {
        partitionStrategy = dataset.getDescriptor().getPartitionStrategy();
      }
      URL schemaUrl = dataset.getDescriptor().getSchemaUrl();
      if (schemaUrl != null) {
        setAvroSchemaUrl(schemaUrl.toExternalForm());
      }
    } catch (Exception e) {
      throw new FlumeException(e);
    } finally {
      initialized = true;
    }
  }
  super.populateAvroHeaders(hdrs, schema, message);
  if (partitionStrategy != null) {
    key = PartitionKey.partitionKeyForEntity(partitionStrategy, message, key);
    int i = 0;
    for (FieldPartitioner fp : partitionStrategy.getFieldPartitioners()) {
      hdrs.put(PARTITION_PREFIX + fp.getName(), fp.valueToString(key.get(i++)));
    }
  }
}
@Override
public int run(String[] args) throws Exception {
  // Drop the events dataset
  boolean success = Datasets.delete("dataset:hdfs:/tmp/data/events");
  return success ? 0 : 1;
}
@Test
public void testDestroyForFailedJob() {
  // setup
  DestroyerContext context = new DestroyerContext(null, false, null, user);
  when(KiteDatasetExecutor.listTemporaryDatasetUris(toJobConfig.toJobConfig.uri))
      .thenReturn(expectedUris);
  for (String uri : expectedUris) {
    when(Datasets.delete(uri)).thenReturn(true);
  }

  // exercise
  destroyer.destroy(context, linkConfig, toJobConfig);

  // verify
  for (String uri : expectedUris) {
    verifyStatic(times(1));
    Datasets.delete(uri);
  }
}
@Override
public int run(String[] args) throws Exception {
  // Create a dataset of events with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:event.avsc")
      .build();
  Datasets.create("dataset:hive:/tmp/data/default/events", descriptor);
  return 0;
}
@Before
public void createTestDatasets() {
  Datasets.delete("dataset:file:/tmp/datasets/unpartitioned");
  Datasets.delete("dataset:file:/tmp/datasets/partitioned");

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(TestRecord.class)
      .build();
  unpartitioned = Datasets.create(
      "dataset:file:/tmp/datasets/unpartitioned", descriptor, TestRecord.class);

  descriptor = new DatasetDescriptor.Builder(descriptor)
      .partitionStrategy(new PartitionStrategy.Builder().hash("id", 4).build())
      .build();
  partitioned = Datasets.create(
      "dataset:file:/tmp/datasets/partitioned", descriptor, TestRecord.class);

  writeTestRecords(unpartitioned);
  writeTestRecords(partitioned);
}
@Override
public int run() throws IOException {
  if (datasets == null || datasets.size() != 1) {
    throw new IllegalArgumentException("Exactly one dataset name must be specified.");
  }

  String dataset = datasets.remove(0);
  Dataset<GenericRecord> currentDataset = load(dataset).getDataset();

  DatasetDescriptor.Builder descriptorBuilder =
      new DatasetDescriptor.Builder(currentDataset.getDescriptor());

  if (avroSchemaFile != null) {
    descriptorBuilder.schemaUri(qualifiedURI(avroSchemaFile));
  }

  if (partitionStrategyFile != null) {
    descriptorBuilder.partitionStrategyUri(qualifiedURI(partitionStrategyFile));
  }

  if (properties != null) {
    for (String propValue : properties) {
      Iterator<String> parts = PROP_VALUE_SEP.split(propValue).iterator();
      descriptorBuilder.property(
          Iterators.getNext(parts, null), Iterators.getNext(parts, null));
    }
  }

  DatasetDescriptor descriptor = descriptorBuilder.build();

  if (isDatasetOrViewUri(dataset)) {
    Datasets.<GenericData.Record, Dataset<GenericData.Record>>update(
        dataset, descriptor, GenericData.Record.class);
  } else {
    getDatasetRepository().update(namespace, dataset, descriptor);
  }

  console.debug("Updated {}", dataset);
  return 0;
}
@Test
public void testViewUris() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2)
      .build();
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .partitionStrategy(partitionStrategy)
          .build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .partitionStrategy(partitionStrategy)
          .build());

  writeTestUsers(inputDataset, 10);

  URI sourceViewUri = new URIBuilder(repo.getUri(), "ns", "in")
      .with("username", "test-0")
      .build();
  View<Record> inputView =
      Datasets.<Record, Dataset<Record>>load(sourceViewUri, Record.class);
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(sourceViewUri, GenericData.Record.class));
  URI targetViewUri = new URIBuilder(repo.getUri(), "ns", "out")
      .with("email", "email-0")
      .build();
  pipeline.write(data, CrunchDatasets.asTarget(targetViewUri), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
/**
 * Create a {@link Dataset} for the given dataset or view URI string.
 * {@code create} returns an empty dataset. You can use {@code DatasetWriter}
 * to populate your dataset.
 *
 * <p>URIs must begin with {@code dataset:} or {@code view:}. The remainder
 * of the URI is implementation specific, depending on the dataset scheme.
 * If the URI is a view URI, this method creates the underlying dataset and
 * returns a view of it.
 *
 * @param uri a {@code Dataset} or {@code View} URI string
 * @param type a Java class that represents an entity in the dataset
 * @param <E> the type used for readers and writers created by this
 *          {@code Dataset}
 * @param <V> the type of {@code Dataset} or {@code View} expected
 * @return a newly created {@code Dataset} responsible for the given URI
 * @throws NullPointerException if {@code uri}, {@code descriptor}, or
 *           {@code type} is {@code null}
 * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
 * @throws DatasetExistsException if a {@code Dataset} for the given URI
 *           already exists
 * @throws IncompatibleSchemaException if the schema is not compatible with
 *           existing datasets with shared storage (for example, in the same
 *           HBase table)
 */
public static <E, V extends View<E>> V create(
    String uri, DatasetDescriptor descriptor, Class<E> type) {
  return Datasets.<E, V>create(URI.create(uri), descriptor, type);
}
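// Sketch of the String-URI overload (assumed path): creating a
// hash-partitioned dataset with the same TestRecord descriptor pattern used
// in createTestDatasets() above.
DatasetDescriptor partitioned = new DatasetDescriptor.Builder()
    .schema(TestRecord.class)
    .partitionStrategy(new PartitionStrategy.Builder().hash("id", 4).build())
    .build();
Dataset<TestRecord> ds = Datasets.<TestRecord, Dataset<TestRecord>>create(
    "dataset:file:/tmp/datasets/example", partitioned, TestRecord.class);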
/**
 * Load a {@link Dataset} or {@link View} for the given {@link URI}.
 *
 * <p>URIs must begin with {@code dataset:} or {@code view:}. The remainder
 * of the URI is implementation specific, depending on the dataset scheme.
 *
 * <p>If you use a dataset URI, {@code load} returns the unfiltered dataset.
 * If you use a view URI, {@code load} returns a {@code View} configured to
 * read a subset of the dataset.
 *
 * @param uri a {@code Dataset} or {@code View} URI
 * @param <V> the type of {@code View} expected
 * @return a {@code View} for the given URI
 * @throws DatasetNotFoundException if there is no dataset for the given URI
 * @throws NullPointerException if any arguments are {@code null}
 * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
 */
@SuppressWarnings("unchecked")
public static <V extends View<GenericRecord>> V load(URI uri) {
  return Datasets.<GenericRecord, V>load(uri, GenericRecord.class);
}
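// Sketch (illustrative path): without an entity class, this overload
// defaults the records to GenericRecord.
Dataset<GenericRecord> events = Datasets.<Dataset<GenericRecord>>load(
    URI.create("dataset:hdfs:/tmp/data/events"));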
@After
public void removeTestDatasets() {
  Datasets.delete("dataset:file:/tmp/datasets/unpartitioned");
  Datasets.delete("dataset:file:/tmp/datasets/partitioned");
}
/**
 * Load a {@link Dataset} or {@link View} for the given URI string.
 *
 * <p>URIs must begin with {@code dataset:} or {@code view:}. The remainder
 * of the URI is implementation specific, depending on the dataset scheme.
 *
 * <p>If you use a dataset URI, {@code load} returns the unfiltered dataset.
 * If you use a view URI, {@code load} returns a {@code View} configured to
 * read a subset of the dataset.
 *
 * @param uriString a {@code Dataset} or {@code View} URI string
 * @param type a Java class that represents an entity in the dataset
 * @param <E> the type used for readers and writers created by this
 *          {@code Dataset}
 * @param <V> the type of {@code View} expected
 * @return a {@code View} for the given URI
 * @throws DatasetNotFoundException if there is no dataset for the given URI
 * @throws NullPointerException if any arguments are {@code null}
 * @throws IllegalArgumentException if {@code uriString} is not a dataset or
 *           view URI
 */
public static <E, V extends View<E>> V load(String uriString, Class<E> type) {
  return Datasets.<E, V>load(URI.create(uriString), type);
}
/**
 * Update a {@link Dataset} for the given dataset URI string.
 *
 * <p>You can add columns, remove columns, or change the data type of
 * columns in your dataset, provided you don't attempt a change that is
 * incompatible with written data. Avro defines rules for compatible schema
 * evolution. See
 * <a href="http://kitesdk.org/docs/current/Schema-Evolution.html">Schema
 * Evolution</a>.
 *
 * <p>This method updates the dataset descriptor, so you can also add or
 * change properties.
 *
 * <p>The recommended way to update a dataset descriptor is to build it
 * based on an existing descriptor. Use
 * {@link DatasetDescriptor.Builder#Builder(DatasetDescriptor)} to build a
 * DatasetDescriptor based on an existing instance.
 *
 * <p>You cannot change a dataset format or partition strategy.
 *
 * <p>URIs must begin with {@code dataset:}. The remainder of the URI is
 * implementation specific, depending on the dataset scheme.
 *
 * @param uri a {@code Dataset} URI string
 * @param <D> the type of {@code Dataset} expected
 * @return a {@code Dataset} for the given URI
 * @throws NullPointerException if {@code uri} or {@code descriptor} is
 *           {@code null}
 * @throws IllegalArgumentException if {@code uri} is not a dataset URI
 * @throws DatasetNotFoundException if there is no dataset for the given URI
 * @throws UnsupportedOperationException if descriptor updates are not
 *           supported by the implementation
 * @throws ConcurrentSchemaModificationException if the {@code Dataset}
 *           schema is updated concurrently
 * @throws IncompatibleSchemaException if the schema is not compatible with
 *           previous schemas, or with existing datasets with shared storage
 *           (for example, in the same HBase table)
 */
public static <D extends Dataset<GenericRecord>> D update(
    String uri, DatasetDescriptor descriptor) {
  return Datasets.<GenericRecord, D>update(uri, descriptor, GenericRecord.class);
}
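// Hedged sketch of a descriptor update built from the existing descriptor,
// as the javadoc recommends. The dataset name and the evolved schema
// resource are hypothetical; the new schema must follow Avro's compatible
// evolution rules (for example, adding only a nullable field).
Dataset<GenericRecord> events =
    Datasets.<Dataset<GenericRecord>>load("dataset:hive?dataset=events");
DatasetDescriptor updated = new DatasetDescriptor.Builder(events.getDescriptor())
    .schemaUri("resource:event_v2.avsc") // hypothetical evolved schema
    .build();
Datasets.update("dataset:hive?dataset=events", updated);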
/**
 * Update a {@link Dataset} for the given dataset URI string.
 *
 * <p>You can add columns, remove columns, or change the data type of
 * columns in your dataset, provided you don't attempt a change that is
 * incompatible with written data. Avro defines rules for compatible schema
 * evolution. See
 * <a href="http://kitesdk.org/docs/current/Schema-Evolution.html">Schema
 * Evolution</a>.
 *
 * <p>This method updates the dataset descriptor, so you can also add or
 * change properties.
 *
 * <p>The recommended way to update a dataset descriptor is to build it
 * based on an existing descriptor. Use
 * {@link DatasetDescriptor.Builder#Builder(DatasetDescriptor)} to build a
 * DatasetDescriptor based on an existing instance.
 *
 * <p>You cannot change a dataset format or partition strategy.
 *
 * <p>URIs must begin with {@code dataset:}. The remainder of the URI is
 * implementation specific, depending on the dataset scheme.
 *
 * @param uri a {@code Dataset} URI string
 * @param type a Java class that represents an entity in the dataset
 * @param <E> the type used for readers and writers created by this
 *          {@code Dataset}
 * @param <D> the type of {@code Dataset} expected
 * @return a {@code Dataset} for the given URI
 * @throws NullPointerException if {@code uri}, {@code descriptor}, or
 *           {@code type} is {@code null}
 * @throws IllegalArgumentException if {@code uri} is not a dataset URI
 * @throws DatasetNotFoundException if there is no dataset for the given URI
 * @throws UnsupportedOperationException if descriptor updates are not
 *           supported by the implementation
 * @throws ConcurrentSchemaModificationException if the {@code Dataset}
 *           schema is updated concurrently
 * @throws IncompatibleSchemaException if the schema is not compatible with
 *           previous schemas, or with existing datasets with shared storage
 *           (for example, in the same HBase table)
 */
public static <E, D extends Dataset<E>> D update(
    String uri, DatasetDescriptor descriptor, Class<E> type) {
  return Datasets.<E, D>update(URI.create(uri), descriptor, type);
}
/**
 * Update a {@link Dataset} for the given dataset URI.
 *
 * <p>You can add columns, remove columns, or change the data type of
 * columns in your dataset, provided you don't attempt a change that is
 * incompatible with written data. Avro defines rules for compatible schema
 * evolution. See
 * <a href="http://kitesdk.org/docs/current/Schema-Evolution.html">Schema
 * Evolution</a>.
 *
 * <p>This method updates the dataset descriptor, so you can also add or
 * change properties.
 *
 * <p>The recommended way to update a dataset descriptor is to build it
 * based on an existing descriptor. Use
 * {@link DatasetDescriptor.Builder#Builder(DatasetDescriptor)} to build a
 * DatasetDescriptor based on an existing instance.
 *
 * <p>You cannot change a dataset format or partition strategy.
 *
 * <p>URIs must begin with {@code dataset:}. The remainder of the URI is
 * implementation specific, depending on the dataset scheme.
 *
 * @param uri a {@code Dataset} URI
 * @param <D> the type of {@code Dataset} expected
 * @return a {@code Dataset} for the given URI
 * @throws NullPointerException if {@code uri} or {@code descriptor} is
 *           {@code null}
 * @throws IllegalArgumentException if {@code uri} is not a dataset URI
 * @throws DatasetNotFoundException if there is no dataset for the given URI
 * @throws UnsupportedOperationException if descriptor updates are not
 *           supported by the implementation
 * @throws ConcurrentSchemaModificationException if the {@code Dataset}
 *           schema is updated concurrently
 * @throws IncompatibleSchemaException if the schema is not compatible with
 *           previous schemas, or with existing datasets with shared storage
 *           (for example, in the same HBase table)
 */
@SuppressWarnings("unchecked")
public static <D extends Dataset<GenericRecord>> D update(URI uri, DatasetDescriptor descriptor) {
  return Datasets.<GenericRecord, D>update(uri, descriptor, GenericRecord.class);
}
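// Sketch of the URI overload (assumed dataset name and property key):
// updating only descriptor properties, leaving the schema and partition
// strategy unchanged.
Dataset<GenericRecord> users = Datasets.<Dataset<GenericRecord>>load(
    URI.create("dataset:hive?dataset=users"));
DatasetDescriptor withOwner = new DatasetDescriptor.Builder(users.getDescriptor())
    .property("owner", "analytics") // illustrative custom property
    .build();
Datasets.update(URI.create("dataset:hive?dataset=users"), withOwner);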
/**
 * Create a {@link Dataset} for the given dataset or view URI string.
 * {@code create} returns an empty dataset. You can use {@code DatasetWriter}
 * to populate your dataset.
 *
 * <p>URIs must begin with {@code dataset:} or {@code view:}. The remainder
 * of the URI is implementation specific, depending on the dataset scheme.
 * If the URI is a view URI, this method creates the underlying dataset and
 * returns a view of it.
 *
 * @param uri a {@code Dataset} or {@code View} URI string
 * @param <V> the type of {@code Dataset} or {@code View} expected
 * @return a newly created {@code Dataset} responsible for the given URI
 * @throws NullPointerException if {@code uri} or {@code descriptor} is
 *           {@code null}
 * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
 * @throws DatasetExistsException if a {@code Dataset} for the given URI
 *           already exists
 * @throws IncompatibleSchemaException if the schema is not compatible with
 *           existing datasets with shared storage (for example, in the same
 *           HBase table)
 */
@SuppressWarnings("unchecked")
public static <V extends View<GenericRecord>> V create(String uri, DatasetDescriptor descriptor) {
  return Datasets.<GenericRecord, V>create(uri, descriptor, GenericRecord.class);
}
/**
 * Load a {@link Dataset} or {@link View} for the given URI string.
 *
 * <p>URIs must begin with {@code dataset:} or {@code view:}. The remainder
 * of the URI is implementation specific, depending on the dataset scheme.
 *
 * <p>If you use a dataset URI, {@code load} returns the unfiltered dataset.
 * If you use a view URI, {@code load} returns a {@code View} configured to
 * read a subset of the dataset.
 *
 * @param uriString a {@code Dataset} or {@code View} URI string
 * @param <V> the type of {@code View} expected
 * @return a {@code View} for the given URI
 * @throws DatasetNotFoundException if there is no dataset for the given URI
 * @throws NullPointerException if any arguments are {@code null}
 * @throws IllegalArgumentException if {@code uriString} is not a dataset or
 *           view URI
 */
public static <V extends View<GenericRecord>> V load(String uriString) {
  return Datasets.<GenericRecord, V>load(uriString, GenericRecord.class);
}