Example #1
0
  @Test
  public void testSignalReadyOutputView() {
    Assume.assumeTrue(!Hadoop.isHadoop1());
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 10);

    View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
    View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
    Assert.assertEquals(2, datasetSize(inputView));

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
    pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
    pipeline.run();

    Assert.assertEquals(2, datasetSize(outputView));

    Assert.assertFalse(
        "Output dataset should not be signaled ready", ((Signalable) outputDataset).isReady());
    Assert.assertTrue("Output view should be signaled ready", ((Signalable) outputView).isReady());
  }
Example #2
0
  @Test
  public void testWriteModeCheckpointToNotReadyOutput() throws Exception {
    // identity partition so we can overwrite the output
    PartitionStrategy partitionStrategy =
        new PartitionStrategy.Builder().identity("username").build();

    Dataset<Record> inputDataset =
        repo.create(
            "ns",
            "in",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());
    Dataset<Record> outputDataset =
        repo.create(
            "ns",
            "out",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());

    writeTestUsers(inputDataset, 1, 0);

    // ensure output is newer than input on local filesystems with 1s granularity
    Thread.sleep(1000);

    runCheckpointPipeline(inputDataset, outputDataset);

    checkTestUsers(outputDataset, 1);

    // under hadoop1 the issues with LocalJobRunner (MAPREDUCE-2350) require that we
    // manually ready the output dataset
    if (Hadoop.isHadoop1()) {
      ((Signalable) outputDataset).signalReady();
    } else {
      // under hadoop2 the output will have been marked ready
      Assert.assertTrue(
          "output dataset should be ready after mapreduce", ((Signalable) outputDataset).isReady());
    }

    long lastModified = ((LastModifiedAccessor) outputDataset).getLastModified();

    // ensure output is newer than input on local filesystems with 1s granularity
    Thread.sleep(1000);

    // now output to a view, this ensures that the view isn't ready
    View<Record> outputView = outputDataset.with("username", "test-0");

    // re-run without changing input and output should change since the view is not ready
    runCheckpointPipeline(inputDataset, outputView);
    checkTestUsers(outputDataset, 1);
    Assert.assertTrue(((LastModifiedAccessor) outputView).getLastModified() > lastModified);
  }
Example #3
0
  @Test
  public void testWriteModeCheckpoint() throws Exception {
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 1, 0);

    Thread.sleep(
        1000); // ensure output is newer than input on local filesystems with 1s granularity
    runCheckpointPipeline(inputDataset, outputDataset);

    // under hadoop1 the issues with LocalJobRunner (MAPREDUCE-2350) require that we
    // manually ready the output dataset
    if (Hadoop.isHadoop1()) {
      ((Signalable) outputDataset).signalReady();
    }

    checkTestUsers(outputDataset, 1);

    long lastModified = ((LastModifiedAccessor) outputDataset).getLastModified();

    // re-run without changing input and output should not change
    runCheckpointPipeline(inputDataset, outputDataset);
    checkTestUsers(outputDataset, 1);
    Assert.assertEquals(lastModified, ((LastModifiedAccessor) outputDataset).getLastModified());

    // re-write input then re-run and output should be re-written
    Thread.sleep(1000); // ensure new input is newer than output
    repo.delete("ns", "in");
    inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    writeTestUsers(inputDataset, 1, 0);
    runCheckpointPipeline(inputDataset, outputDataset);

    checkTestUsers(outputDataset, 1);
    Assert.assertTrue(((LastModifiedAccessor) outputDataset).getLastModified() > lastModified);
  }