Example #1
  @Test
  public void testTargetView() throws IOException {
    PartitionStrategy partitionStrategy =
        new PartitionStrategy.Builder().hash("username", 2).build();

    Dataset<Record> inputDataset =
        repo.create(
            "ns",
            "in",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());
    Dataset<Record> outputDataset =
        repo.create(
            "ns",
            "out",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());

    writeTestUsers(inputDataset, 10);

    View<Record> inputView = inputDataset.with("username", "test-0");
    Assert.assertEquals(1, datasetSize(inputView));
    View<Record> outputView = outputDataset.with("username", "test-0");

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
    pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
    pipeline.run();

    Assert.assertEquals(1, datasetSize(outputDataset));
  }
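The datasetSize helper used throughout these tests is not shown in these excerpts. A plausible sketch, assuming it simply counts records by iterating a reader over the view, is:

  // Hypothetical reconstruction of the datasetSize helper (an assumption, not
  // the verified original): count records by iterating a reader over the view.
  private long datasetSize(View<Record> view) throws IOException {
    long size = 0;
    DatasetReader<Record> reader = view.newReader();
    try {
      for (Record ignored : reader) {
        size++;
      }
    } finally {
      reader.close();
    }
    return size;
  }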
Example #2
  @Test
  public void testSignalReadyOutputView() {
    Assume.assumeTrue(!Hadoop.isHadoop1());
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 10);

    View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
    View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
    Assert.assertEquals(2, datasetSize(inputView));

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
    pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
    pipeline.run();

    Assert.assertEquals(2, datasetSize(outputView));

    Assert.assertFalse(
        "Output dataset should not be signaled ready", ((Signalable) outputDataset).isReady());
    Assert.assertTrue("Output view should be signaled ready", ((Signalable) outputView).isReady());
  }
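Writing through the output view signals only that view ready, not the whole dataset. A consumer-side sketch (the helper below is an assumption for illustration; only Signalable.isReady() comes from the code above) would poll the view before reading it:

  // Hypothetical consumer helper: block until the producing pipeline signals
  // the view ready. The polling loop is illustrative, not part of the Kite API.
  private void awaitReady(View<Record> view) throws InterruptedException {
    while (!((Signalable) view).isReady()) {
      Thread.sleep(1000); // back off, then re-check the readiness signal
    }
  }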
Example #3
  @Test
  public void testPartitionedSource() throws IOException {
    PartitionStrategy partitionStrategy =
        new PartitionStrategy.Builder().hash("username", 2).build();

    Dataset<Record> inputDataset =
        repo.create(
            "ns",
            "in",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());
    Dataset<Record> outputDataset =
        repo.create(
            "ns",
            "out",
            new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(Formats.PARQUET).build());

    writeTestUsers(inputDataset, 10);

    // select hash partition 0 of the input; the 10 test users are expected to
    // split evenly across the two hash buckets, 5 per partition
    PartitionKey key = new PartitionKey(0);
    Dataset<Record> inputPart0 =
        ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputPart0));
    pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
    pipeline.run();

    Assert.assertEquals(5, datasetSize(outputDataset));
  }
Example #4
  @Test
  public void testUseReaderSchema() throws IOException {

    // Create a schema with only a username, so we can test reading it
    // with an enhanced record structure.
    Schema oldRecordSchema =
        SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
            .fields()
            .requiredString("username")
            .endRecord();

    // create the input and output datasets, both with the old schema
    Dataset<Record> in =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
    Dataset<Record> out =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
    Record oldUser = new Record(oldRecordSchema);
    oldUser.put("username", "user");

    DatasetWriter<Record> writer = in.newWriter();

    try {

      writer.write(oldUser);

    } finally {
      writer.close();
    }

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

    // read the data back using the enhanced NewUserRecord class as the reader
    // schema; the stored records were written with the old schema
    PCollection<NewUserRecord> data =
        pipeline.read(CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));

    PCollection<NewUserRecord> processed =
        data.parallelDo(new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));

    pipeline.write(processed, CrunchDatasets.asTarget(out));

    DatasetReader<Record> reader = out.newReader();

    Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

    try {

      // there should be one record that is equal to our old user generic record.
      Assert.assertEquals(oldUser, reader.next());
      Assert.assertFalse(reader.hasNext());

    } finally {
      reader.close();
    }
  }
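Example #4 references NewUserRecord and UserRecordIdentityFn, which are defined elsewhere in the test class. A plausible sketch of the identity function, assuming a plain pass-through Crunch MapFn over NewUserRecord, is:

  // Hypothetical reconstruction of the identity function used above: a MapFn
  // that passes each NewUserRecord through unchanged.
  private static class UserRecordIdentityFn extends MapFn<NewUserRecord, NewUserRecord> {
    @Override
    public NewUserRecord map(NewUserRecord input) {
      return input; // no transformation; the reader schema does the upgrade
    }
  }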
Example #5
  @Test
  public void testWriteModeCheckpointToNotReadyOutput() throws Exception {
    // identity partition so we can overwrite the output
    PartitionStrategy partitionStrategy =
        new PartitionStrategy.Builder().identity("username").build();

    Dataset<Record> inputDataset =
        repo.create(
            "ns",
            "in",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());
    Dataset<Record> outputDataset =
        repo.create(
            "ns",
            "out",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());

    writeTestUsers(inputDataset, 1, 0);

    // ensure output is newer than input on local filesystems with 1s granularity
    Thread.sleep(1000);

    runCheckpointPipeline(inputDataset, outputDataset);

    checkTestUsers(outputDataset, 1);

    // under hadoop1 the issues with LocalJobRunner (MAPREDUCE-2350) require that we
    // manually signal the output dataset ready
    if (Hadoop.isHadoop1()) {
      ((Signalable) outputDataset).signalReady();
    } else {
      // under hadoop2 the output will have been marked ready
      Assert.assertTrue(
          "output dataset should be ready after mapreduce", ((Signalable) outputDataset).isReady());
    }

    long lastModified = ((LastModifiedAccessor) outputDataset).getLastModified();

    // ensure output is newer than input on local filesystems with 1s granularity
    Thread.sleep(1000);

    // now output to a view; this ensures that the view isn't ready
    View<Record> outputView = outputDataset.with("username", "test-0");

    // re-run without changing the input; the output should still change, since the view is not ready
    runCheckpointPipeline(inputDataset, outputView);
    checkTestUsers(outputDataset, 1);
    Assert.assertTrue(((LastModifiedAccessor) outputView).getLastModified() > lastModified);
  }
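Examples #5 and #10 both rely on a runCheckpointPipeline helper that is not shown in these excerpts. A minimal sketch, assuming it mirrors the other pipelines here but writes with Target.WriteMode.CHECKPOINT, is:

  // Hypothetical reconstruction of the checkpoint helper: identical to the
  // other pipelines, but CHECKPOINT mode leaves an up-to-date output untouched.
  private void runCheckpointPipeline(Dataset<Record> input, View<Record> output) {
    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(input));
    pipeline.write(data, CrunchDatasets.asTarget(output), Target.WriteMode.CHECKPOINT);
    pipeline.done();
  }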
Example #6
  @Test(expected = CrunchRuntimeException.class)
  public void testWriteModeDefaultFailsWithExisting() throws IOException {
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 1, 0);
    writeTestUsers(outputDataset, 1, 0);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
    // the default write mode refuses a non-empty target, so this write call is
    // expected to throw CrunchRuntimeException before the pipeline even runs
    pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset));
  }
Example #7
  /**
   * Create a {@link Dataset} for the given dataset or view URI. {@code create} returns an empty
   * dataset. You can use {@code DatasetWriter} to populate your dataset.
   *
   * <p>URIs must begin with {@code dataset:} or {@code view:}. The remainder of the URI is
   * implementation specific, depending on the dataset scheme. If the URI is a view URI, this method
   * creates the underlying dataset and returns a view of it.
   *
   * @param uri a {@code Dataset} or {@code View} URI
   * @param descriptor a {@code DatasetDescriptor} for the dataset to create
   * @param type a Java class that represents an entity in the dataset
   * @param <E> the type used for readers and writers created by this {@code Dataset}
   * @param <V> the type of {@code Dataset} or {@code View} expected
   * @return a newly created {@code Dataset} responsible for the given URI
   * @throws NullPointerException if {@code uri}, {@code descriptor}, or {@code type} is {@code
   *     null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
   * @throws DatasetExistsException if a {@code Dataset} for the given URI already exists
   * @throws IncompatibleSchemaException if the schema is not compatible with existing datasets with
   *     shared storage (for example, in the same HBase table)
   */
  @SuppressWarnings("unchecked")
  public static <E, V extends View<E>> V create(
      URI uri, DatasetDescriptor descriptor, Class<E> type) {
    boolean isView = URIBuilder.VIEW_SCHEME.equals(uri.getScheme());
    Preconditions.checkArgument(
        isView || URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
        "Not a dataset or view URI: " + uri);
    Preconditions.checkNotNull(
        type,
        "The entity type can't be null, use Object.class to have the type"
            + " determined by the schema.");

    Pair<DatasetRepository, Map<String, String>> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map<String, String> uriOptions = pair.second();

    if (descriptor.getLocation() == null && uriOptions.containsKey("location")) {
      descriptor =
          new DatasetDescriptor.Builder(descriptor).location(uriOptions.get("location")).build();
    }

    Dataset<E> dataset =
        repo.create(
            uriOptions.get(URIBuilder.NAMESPACE_OPTION),
            uriOptions.get(URIBuilder.DATASET_NAME_OPTION),
            descriptor,
            type);

    if (isView) {
      return Datasets.<E, V>view(dataset, uriOptions);
    } else {
      return (V) dataset;
    }
  }
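A minimal usage sketch for create; the repository URI, namespace, dataset name, and schema path below are assumptions for illustration:

  // Hypothetical caller: create a dataset under an HDFS-backed repository and
  // get a View back for its URI. All names and paths here are illustrative.
  public static View<GenericRecord> createUsers() throws IOException {
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaUri("resource:user.avsc") // assumed Avro schema on the classpath
        .build();
    return Datasets.create(
        URI.create("dataset:hdfs:/data/ns/users"), descriptor, GenericRecord.class);
  }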
Example #8
  @Test
  public void testGeneric() throws IOException {
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    // write two files, each containing 5 records
    writeTestUsers(inputDataset, 5, 0);
    writeTestUsers(inputDataset, 5, 5);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
    pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
    pipeline.run();

    checkTestUsers(outputDataset, 10);
  }
Example #9
  @Test
  public void testWriteModeOverwrite() throws IOException {
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 1, 0);
    writeTestUsers(outputDataset, 1, 1);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
    pipeline.write(
        data, CrunchDatasets.asTarget((View<Record>) outputDataset), Target.WriteMode.OVERWRITE);

    pipeline.run();

    checkTestUsers(outputDataset, 1);
  }
Example #10
  @Test
  public void testWriteModeCheckpoint() throws Exception {
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 1, 0);

    // ensure output is newer than input on local filesystems with 1s granularity
    Thread.sleep(1000);
    runCheckpointPipeline(inputDataset, outputDataset);

    // under hadoop1 the issues with LocalJobRunner (MAPREDUCE-2350) require that we
    // manually signal the output dataset ready
    if (Hadoop.isHadoop1()) {
      ((Signalable) outputDataset).signalReady();
    }

    checkTestUsers(outputDataset, 1);

    long lastModified = ((LastModifiedAccessor) outputDataset).getLastModified();

    // re-run without changing the input; the output should not change
    runCheckpointPipeline(inputDataset, outputDataset);
    checkTestUsers(outputDataset, 1);
    Assert.assertEquals(lastModified, ((LastModifiedAccessor) outputDataset).getLastModified());

    // re-write the input, then re-run; the output should be re-written
    Thread.sleep(1000); // ensure new input is newer than output
    repo.delete("ns", "in");
    inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    writeTestUsers(inputDataset, 1, 0);
    runCheckpointPipeline(inputDataset, outputDataset);

    checkTestUsers(outputDataset, 1);
    Assert.assertTrue(((LastModifiedAccessor) outputDataset).getLastModified() > lastModified);
  }