@Test
public void testSignalReadyOutputView() {
  Assume.assumeTrue(!Hadoop.isHadoop1());
  Dataset<Record> inputDataset =
      repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset =
      repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
  View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
  Assert.assertEquals(2, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(2, datasetSize(outputView));

  Assert.assertFalse(
      "Output dataset should not be signaled ready", ((Signalable) outputDataset).isReady());
  Assert.assertTrue("Output view should be signaled ready", ((Signalable) outputView).isReady());
}
@Test
public void testTargetView() throws IOException {
  PartitionStrategy partitionStrategy =
      new PartitionStrategy.Builder().hash("username", 2).build();
  Dataset<Record> inputDataset =
      repo.create(
          "ns",
          "in",
          new DatasetDescriptor.Builder()
              .schema(USER_SCHEMA)
              .partitionStrategy(partitionStrategy)
              .build());
  Dataset<Record> outputDataset =
      repo.create(
          "ns",
          "out",
          new DatasetDescriptor.Builder()
              .schema(USER_SCHEMA)
              .partitionStrategy(partitionStrategy)
              .build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));
  View<Record> outputView = outputDataset.with("username", "test-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
/**
 * Load a {@link Dataset} or {@link View} for the given {@link URI}.
 *
 * <p>URIs must begin with {@code dataset:} or {@code view:}. The remainder of the URI is
 * implementation specific, depending on the dataset scheme.
 *
 * <p>If you use a dataset URI, {@code load} returns the unfiltered dataset. If you use a view
 * URI, {@code load} returns a {@code View} configured to read a subset of the dataset.
 *
 * @param uri a {@code Dataset} or {@code View} URI
 * @param type a Java class that represents an entity in the dataset
 * @param <E> the type used for readers and writers created by this {@code Dataset}
 * @param <V> the type of {@code View} expected
 * @return a {@code View} for the given URI
 * @throws DatasetNotFoundException if there is no dataset for the given URI
 * @throws NullPointerException if any arguments are {@code null}
 * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
 */
@SuppressWarnings("unchecked")
public static <E, V extends View<E>> V load(URI uri, Class<E> type) {
  boolean isView = URIBuilder.VIEW_SCHEME.equals(uri.getScheme());
  Preconditions.checkArgument(
      isView || URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
      "Not a dataset or view URI: " + uri);
  Preconditions.checkNotNull(
      type,
      "The entity type can't be null, use Object.class to have the type"
          + " determined by the schema.");

  Pair<DatasetRepository, Map<String, String>> pair =
      Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
  DatasetRepository repo = pair.first();
  Map<String, String> uriOptions = pair.second();

  Dataset<E> dataset =
      repo.load(
          uriOptions.get(URIBuilder.NAMESPACE_OPTION),
          uriOptions.get(URIBuilder.DATASET_NAME_OPTION),
          type);

  if (isView) {
    return Datasets.<E, V>view(dataset, uriOptions);
  } else {
    // if the URI isn't a view URI, only load the dataset
    return (V) dataset;
  }
}
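// Usage sketch for load(URI, Class), not part of the class above. The "dataset:hive:ns/users"
// URI is an illustrative assumption; any registered dataset: or view: URI works the same way.
// Assumes java.net.URI and org.apache.avro.generic.GenericRecord are imported.
private static void printAllUsers() {
  // a dataset URI returns the unfiltered dataset; a view: URI would return a filtered View
  Dataset<GenericRecord> users =
      Datasets.load(URI.create("dataset:hive:ns/users"), GenericRecord.class);
  DatasetReader<GenericRecord> reader = users.newReader();
  try {
    while (reader.hasNext()) {
      System.out.println(reader.next());
    }
  } finally {
    reader.close();
  }
}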
@Test
public void testPartitionedSource() throws IOException {
  PartitionStrategy partitionStrategy =
      new PartitionStrategy.Builder().hash("username", 2).build();
  Dataset<Record> inputDataset =
      repo.create(
          "ns",
          "in",
          new DatasetDescriptor.Builder()
              .schema(USER_SCHEMA)
              .partitionStrategy(partitionStrategy)
              .build());
  Dataset<Record> outputDataset =
      repo.create(
          "ns",
          "out",
          new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));
}
/**
 * Create a {@link Dataset} for the given dataset or view URI. {@code create} returns an empty
 * dataset. You can use {@code DatasetWriter} to populate your dataset.
 *
 * <p>URIs must begin with {@code dataset:} or {@code view:}. The remainder of the URI is
 * implementation specific, depending on the dataset scheme. If the URI is a view URI, this
 * method creates the underlying dataset and returns a view of it.
 *
 * @param uri a {@code Dataset} or {@code View} URI
 * @param descriptor a {@code DatasetDescriptor} that describes the schema and other properties
 *     of the dataset to create
 * @param type a Java class that represents an entity in the dataset
 * @param <E> the type used for readers and writers created by this {@code Dataset}
 * @param <V> the type of {@code Dataset} or {@code View} expected
 * @return a newly created {@code Dataset} responsible for the given URI
 * @throws NullPointerException if {@code uri}, {@code descriptor}, or {@code type} is
 *     {@code null}
 * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
 * @throws DatasetExistsException if a {@code Dataset} for the given URI already exists
 * @throws IncompatibleSchemaException if the schema is not compatible with existing datasets
 *     with shared storage (for example, in the same HBase table)
 */
@SuppressWarnings("unchecked")
public static <E, V extends View<E>> V create(
    URI uri, DatasetDescriptor descriptor, Class<E> type) {
  boolean isView = URIBuilder.VIEW_SCHEME.equals(uri.getScheme());
  Preconditions.checkArgument(
      isView || URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
      "Not a dataset or view URI: " + uri);
  Preconditions.checkNotNull(
      type,
      "The entity type can't be null, use Object.class to have the type"
          + " determined by the schema.");

  Pair<DatasetRepository, Map<String, String>> pair =
      Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
  DatasetRepository repo = pair.first();
  Map<String, String> uriOptions = pair.second();

  if (descriptor.getLocation() == null && uriOptions.containsKey("location")) {
    descriptor =
        new DatasetDescriptor.Builder(descriptor).location(uriOptions.get("location")).build();
  }

  Dataset<E> dataset =
      repo.create(
          uriOptions.get(URIBuilder.NAMESPACE_OPTION),
          uriOptions.get(URIBuilder.DATASET_NAME_OPTION),
          descriptor,
          type);

  if (isView) {
    return Datasets.<E, V>view(dataset, uriOptions);
  } else {
    return (V) dataset;
  }
}
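// Usage sketch for create(URI, DatasetDescriptor, Class): a minimal example, not taken from
// the surrounding code. The schema name, HDFS URI, and record values are illustrative
// assumptions; the writer pattern mirrors the tests elsewhere in this section.
private static void createAndPopulateUsers() {
  Schema schema =
      SchemaBuilder.record("org.kitesdk.data.user.User")
          .fields()
          .requiredString("username")
          .endRecord();
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder().schema(schema).build();
  Dataset<GenericRecord> users =
      Datasets.create(
          URI.create("dataset:hdfs:/tmp/data/ns/users"), descriptor, GenericRecord.class);

  // create returns an empty dataset; populate it with a DatasetWriter as the Javadoc suggests
  DatasetWriter<GenericRecord> writer = users.newWriter();
  try {
    GenericRecord user = new GenericData.Record(schema);
    user.put("username", "test-0");
    writer.write(user);
  } finally {
    writer.close();
  }
}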
@Override
public int run() throws IOException {
  DatasetRepository repo = getDatasetRepository();

  if (targets == null || targets.isEmpty()) {
    throw new IllegalArgumentException("No views or datasets were specified.");
  }

  for (String uriOrName : targets) {
    if (isViewUri(uriOrName)) {
      View view = Datasets.load(uriOrName);
      Preconditions.checkArgument(
          viewMatches(view.getUri(), uriOrName),
          "Resolved view does not match requested view: " + view.getUri());
      view.deleteAll();
    } else if (isDatasetUri(uriOrName)) {
      Datasets.delete(uriOrName);
    } else {
      repo.delete(namespace, uriOrName);
    }
    console.debug("Deleted {}", uriOrName);
  }

  return 0;
}
@Test
public void testUseReaderSchema() throws IOException {
  // Create a schema with only a username, so we can test reading it
  // with an enhanced record structure.
  Schema oldRecordSchema =
      SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
          .fields()
          .requiredString("username")
          .endRecord();

  // create the dataset
  Dataset<Record> in =
      repo.create("ns", "in", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
  Dataset<Record> out =
      repo.create("ns", "out", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
  Record oldUser = new Record(oldRecordSchema);
  oldUser.put("username", "user");

  DatasetWriter<Record> writer = in.newWriter();
  try {
    writer.write(oldUser);
  } finally {
    writer.close();
  }

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

  // read data from updated dataset that has the new schema.
  // At this point, User class has the old schema
  PCollection<NewUserRecord> data =
      pipeline.read(CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));

  PCollection<NewUserRecord> processed =
      data.parallelDo(new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));

  pipeline.write(processed, CrunchDatasets.asTarget(out));

  DatasetReader reader = out.newReader();

  Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

  try {
    // there should be one record that is equal to our old user generic record.
    Assert.assertEquals(oldUser, reader.next());
    Assert.assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
@Test
public void testWriteModeCheckpointToNotReadyOutput() throws Exception {
  // identity partition so we can overwrite the output
  PartitionStrategy partitionStrategy =
      new PartitionStrategy.Builder().identity("username").build();
  Dataset<Record> inputDataset =
      repo.create(
          "ns",
          "in",
          new DatasetDescriptor.Builder()
              .schema(USER_SCHEMA)
              .partitionStrategy(partitionStrategy)
              .build());
  Dataset<Record> outputDataset =
      repo.create(
          "ns",
          "out",
          new DatasetDescriptor.Builder()
              .schema(USER_SCHEMA)
              .partitionStrategy(partitionStrategy)
              .build());

  writeTestUsers(inputDataset, 1, 0);

  // ensure output is newer than input on local filesystems with 1s granularity
  Thread.sleep(1000);

  runCheckpointPipeline(inputDataset, outputDataset);
  checkTestUsers(outputDataset, 1);

  // under hadoop1 the issues with LocalJobRunner (MAPREDUCE-2350) require that we
  // manually ready the output dataset
  if (Hadoop.isHadoop1()) {
    ((Signalable) outputDataset).signalReady();
  } else {
    // under hadoop2 the output will have been marked ready
    Assert.assertTrue(
        "output dataset should be ready after mapreduce",
        ((Signalable) outputDataset).isReady());
  }

  long lastModified = ((LastModifiedAccessor) outputDataset).getLastModified();

  // ensure output is newer than input on local filesystems with 1s granularity
  Thread.sleep(1000);

  // now output to a view, this ensures that the view isn't ready
  View<Record> outputView = outputDataset.with("username", "test-0");

  // re-run without changing input and output should change since the view is not ready
  runCheckpointPipeline(inputDataset, outputView);
  checkTestUsers(outputDataset, 1);
  Assert.assertTrue(((LastModifiedAccessor) outputView).getLastModified() > lastModified);
}
/**
 * Check whether a {@link Dataset} identified by the given URI exists.
 *
 * <p>URIs must begin with {@code dataset:}. The remainder of the URI is implementation
 * specific, depending on the dataset scheme.
 *
 * @param uri a {@code Dataset} URI
 * @return {@code true} if the dataset exists, {@code false} otherwise
 * @throws NullPointerException if {@code uri} is null
 * @throws IllegalArgumentException if {@code uri} is not a dataset URI
 */
public static boolean exists(URI uri) {
  Preconditions.checkArgument(
      URIBuilder.DATASET_SCHEME.equals(uri.getScheme()), "Not a dataset URI: " + uri);

  Pair<DatasetRepository, Map<String, String>> pair =
      Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
  DatasetRepository repo = pair.first();
  Map<String, String> uriOptions = pair.second();

  return repo.exists(
      uriOptions.get(URIBuilder.NAMESPACE_OPTION),
      uriOptions.get(URIBuilder.DATASET_NAME_OPTION));
}
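// Usage sketch: combine exists(URI) with create(URI, DatasetDescriptor, Class) to make dataset
// creation idempotent. This helper is hypothetical and not part of the class above; it only
// uses the signatures shown in this section.
private static <E> void createIfAbsent(URI uri, DatasetDescriptor descriptor, Class<E> type) {
  if (!Datasets.exists(uri)) {
    Datasets.create(uri, descriptor, type);
  }
}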
@Test(expected = CrunchRuntimeException.class)
public void testWriteModeDefaultFailsWithExisting() throws IOException {
  Dataset<Record> inputDataset =
      repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset =
      repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 0);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset));
}
/**
 * List the {@link Dataset} URIs in the repository identified by the URI.
 *
 * <p>URI formats are defined by {@code Dataset} implementations. The repository URIs you pass
 * to this method must begin with {@code repo:}. For example, to list the {@code Dataset} URIs
 * for the Hive repository, provide the URI {@code repo:hive}.
 *
 * @param uri a {@code DatasetRepository} URI
 * @return the URIs present in the {@code DatasetRepository}
 * @throws NullPointerException if {@code uri} is null
 * @throws IllegalArgumentException if {@code uri} is not a repository URI
 */
public static Collection<URI> list(URI uri) {
  boolean isRepo = URIBuilder.REPO_SCHEME.equals(uri.getScheme());
  Preconditions.checkArgument(isRepo, "Not a repository URI: " + uri);

  DatasetRepository repo = Registration.open(URI.create(uri.getRawSchemeSpecificPart()));

  // build a URI for each dataset name
  URI repoUri = repo.getUri();
  List<URI> datasets = Lists.newArrayList();
  for (String namespace : repo.namespaces()) {
    for (String dataset : repo.datasets(namespace)) {
      datasets.add(new URIBuilder(repoUri, namespace, dataset).build());
    }
  }

  return datasets;
}
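// Usage sketch: enumerate every dataset URI in the Hive repository mentioned in the Javadoc
// above. The repo:hive URI follows that example; printing is only for illustration.
private static void printHiveDatasetUris() {
  for (URI datasetUri : Datasets.list(URI.create("repo:hive"))) {
    System.out.println(datasetUri);
  }
}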
@Test
public void testGeneric() throws IOException {
  Dataset<Record> inputDataset =
      repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset =
      repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
@Test
public void testWriteModeOverwrite() throws IOException {
  Dataset<Record> inputDataset =
      repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset =
      repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 1);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
  pipeline.write(
      data, CrunchDatasets.asTarget((View<Record>) outputDataset), Target.WriteMode.OVERWRITE);
  pipeline.run();

  checkTestUsers(outputDataset, 1);
}
@Test
public void testWriteModeCheckpoint() throws Exception {
  Dataset<Record> inputDataset =
      repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset =
      repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);

  // ensure output is newer than input on local filesystems with 1s granularity
  Thread.sleep(1000);

  runCheckpointPipeline(inputDataset, outputDataset);

  // under hadoop1 the issues with LocalJobRunner (MAPREDUCE-2350) require that we
  // manually ready the output dataset
  if (Hadoop.isHadoop1()) {
    ((Signalable) outputDataset).signalReady();
  }

  checkTestUsers(outputDataset, 1);

  long lastModified = ((LastModifiedAccessor) outputDataset).getLastModified();

  // re-run without changing input and output should not change
  runCheckpointPipeline(inputDataset, outputDataset);
  checkTestUsers(outputDataset, 1);
  Assert.assertEquals(lastModified, ((LastModifiedAccessor) outputDataset).getLastModified());

  // re-write input then re-run and output should be re-written
  Thread.sleep(1000); // ensure new input is newer than output
  repo.delete("ns", "in");
  inputDataset =
      repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  writeTestUsers(inputDataset, 1, 0);

  runCheckpointPipeline(inputDataset, outputDataset);

  checkTestUsers(outputDataset, 1);
  Assert.assertTrue(((LastModifiedAccessor) outputDataset).getLastModified() > lastModified);
}
/**
 * Update a {@link Dataset} for the given dataset URI.
 *
 * <p>You can add columns, remove columns, or change the data type of columns in your dataset,
 * provided you don't attempt a change that is incompatible with written data. Avro defines
 * rules for compatible schema evolution. See
 * <a href="http://kitesdk.org/docs/current/Schema-Evolution.html">Schema Evolution</a>.
 *
 * <p>This method updates the dataset descriptor, so you can also add or change properties.
 *
 * <p>The recommended way to update a dataset descriptor is to build it based on an existing
 * descriptor. Use {@link DatasetDescriptor.Builder#Builder(DatasetDescriptor)} to build a
 * {@code DatasetDescriptor} based on an existing instance.
 *
 * <p>You cannot change a dataset format or partition strategy.
 *
 * <p>URIs must begin with {@code dataset:}. The remainder of the URI is implementation
 * specific, depending on the dataset scheme.
 *
 * @param uri a {@code Dataset} URI
 * @param descriptor a {@code DatasetDescriptor} with the updated schema and properties
 * @param type a Java class that represents an entity in the dataset
 * @param <E> the type used for readers and writers created by this {@code Dataset}
 * @param <D> the type of {@code Dataset} expected
 * @return a {@code Dataset} for the given URI
 * @throws NullPointerException if {@code uri}, {@code descriptor}, or {@code type} is
 *     {@code null}
 * @throws IllegalArgumentException if {@code uri} is not a dataset URI
 * @throws DatasetNotFoundException if there is no dataset for the given URI
 * @throws UnsupportedOperationException if descriptor updates are not supported by the
 *     implementation
 * @throws ConcurrentSchemaModificationException if the {@code Dataset} schema is updated
 *     concurrently
 * @throws IncompatibleSchemaException if the schema is not compatible with previous schemas,
 *     or with existing datasets with shared storage (for example, in the same HBase table)
 */
@SuppressWarnings("unchecked")
public static <E, D extends Dataset<E>> D update(
    URI uri, DatasetDescriptor descriptor, Class<E> type) {
  Preconditions.checkArgument(
      URIBuilder.DATASET_SCHEME.equals(uri.getScheme()), "Not a dataset URI: " + uri);
  Preconditions.checkNotNull(
      type,
      "The entity type can't be null, use Object.class to have the type"
          + " determined by the schema.");

  Pair<DatasetRepository, Map<String, String>> pair =
      Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
  DatasetRepository repo = pair.first();
  Map<String, String> uriOptions = pair.second();

  return (D)
      repo.update(
          uriOptions.get(URIBuilder.NAMESPACE_OPTION),
          uriOptions.get(URIBuilder.DATASET_NAME_OPTION),
          descriptor,
          type);
}
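// Usage sketch for update(URI, DatasetDescriptor, Class): rebuild the descriptor from the
// existing one, as the Javadoc recommends, and change only a property. The URI and the
// property name are illustrative assumptions, not values defined by this class.
private static void addWriterCacheSizeProperty() {
  URI uri = URI.create("dataset:hive:ns/users");
  Dataset<GenericRecord> users = Datasets.load(uri, GenericRecord.class);

  // copy the current descriptor and set an additional property (assumed name and value)
  DatasetDescriptor updated =
      new DatasetDescriptor.Builder(users.getDescriptor())
          .property("kite.writer.cache-size", "20")
          .build();

  Datasets.update(uri, updated, GenericRecord.class);
}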
@Before
public void setUp() throws Exception {
  this.repo = newRepo();
  repo.delete("ns", "in");
  repo.delete("ns", "out");
}