@Test
  public void testSignalReadyOutputView() {
    Assume.assumeTrue(!Hadoop.isHadoop1());
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 10);

    View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
    View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
    Assert.assertEquals(2, datasetSize(inputView));

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
    pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
    pipeline.run();

    Assert.assertEquals(2, datasetSize(outputView));

    Assert.assertFalse(
        "Output dataset should not be signaled ready", ((Signalable) outputDataset).isReady());
    Assert.assertTrue("Output view should be signaled ready", ((Signalable) outputView).isReady());
  }

  @Test
  public void testTargetView() throws IOException {
    PartitionStrategy partitionStrategy =
        new PartitionStrategy.Builder().hash("username", 2).build();

    Dataset<Record> inputDataset =
        repo.create(
            "ns",
            "in",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());
    Dataset<Record> outputDataset =
        repo.create(
            "ns",
            "out",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());

    writeTestUsers(inputDataset, 10);

    View<Record> inputView = inputDataset.with("username", "test-0");
    Assert.assertEquals(1, datasetSize(inputView));
    View<Record> outputView = outputDataset.with("username", "test-0");

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
    pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
    pipeline.run();

    Assert.assertEquals(1, datasetSize(outputDataset));
  }
Example #3
  /**
   * Load a {@link Dataset} or {@link View} for the given {@link URI}.
   *
   * <p>URIs must begin with {@code dataset:} or {@code view:}. The remainder of the URI is
   * implementation specific, depending on the dataset scheme.
   *
   * <p>If you use a dataset URI, {@code load} returns the unfiltered dataset. If you use a view
   * URI, {@code load} returns a {@code View} configured to read a subset of the dataset.
   *
   * @param uri a {@code Dataset} or {@code View} URI
   * @param type a Java class that represents an entity in the dataset
   * @param <E> the type used for readers and writers created by this {@code Dataset}
   * @param <V> the type of {@code View} expected
   * @return a {@code View} for the given URI
   * @throws DatasetNotFoundException if there is no dataset for the given URI
   * @throws NullPointerException if any arguments are {@code null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
   */
  @SuppressWarnings("unchecked")
  public static <E, V extends View<E>> V load(URI uri, Class<E> type) {
    boolean isView = URIBuilder.VIEW_SCHEME.equals(uri.getScheme());
    Preconditions.checkArgument(
        isView || URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
        "Not a dataset or view URI: " + uri);
    Preconditions.checkNotNull(
        type,
        "The entity type can't be null, use Object.class to have the type"
            + " determined by the schema.");

    Pair<DatasetRepository, Map<String, String>> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map<String, String> uriOptions = pair.second();

    Dataset<E> dataset =
        repo.load(
            uriOptions.get(URIBuilder.NAMESPACE_OPTION),
            uriOptions.get(URIBuilder.DATASET_NAME_OPTION),
            type);

    if (isView) {
      return Datasets.<E, V>view(dataset, uriOptions);
    } else {
      // if the URI isn't a view URI, only load the dataset
      return (V) dataset;
    }
  }
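
A minimal usage sketch for load; the HDFS repository location, namespace, dataset name, and the constraint field in the view URI are illustrative assumptions, not taken from the code above.

  // dataset: URI, so load returns the unfiltered dataset
  Dataset<GenericRecord> users =
      Datasets.load(URI.create("dataset:hdfs:/data/ns/users"), GenericRecord.class);

  // view: URI, so load returns a View restricted to the constraints in the query string
  View<GenericRecord> activeUsers =
      Datasets.load(URI.create("view:hdfs:/data/ns/users?active=true"), GenericRecord.class);
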
  @Test
  public void testPartitionedSource() throws IOException {
    PartitionStrategy partitionStrategy =
        new PartitionStrategy.Builder().hash("username", 2).build();

    Dataset<Record> inputDataset =
        repo.create(
            "ns",
            "in",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());
    Dataset<Record> outputDataset =
        repo.create(
            "ns",
            "out",
            new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(Formats.PARQUET).build());

    writeTestUsers(inputDataset, 10);

    PartitionKey key = new PartitionKey(0);
    Dataset<Record> inputPart0 =
        ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputPart0));
    pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
    pipeline.run();

    Assert.assertEquals(5, datasetSize(outputDataset));
  }
Example #5
  /**
   * Create a {@link Dataset} for the given dataset or view URI. {@code create} returns an empty
   * dataset. You can use {@code DatasetWriter} to populate your dataset.
   *
   * <p>URIs must begin with {@code dataset:} or {@code view:}. The remainder of the URI is
   * implementation specific, depending on the dataset scheme. If the URI is a view URI, this method
   * creates the underlying dataset and returns a view of it.
   *
   * @param uri a {@code Dataset} or {@code View} URI
   * @param type a Java class that represents an entity in the dataset
   * @param <E> the type used for readers and writers created by this {@code Dataset}
   * @param <V> the type of {@code Dataset} or {@code View} expected
   * @return a newly created {@code Dataset} responsible for the given URI
   * @throws NullPointerException if {@code uri}, {@code descriptor}, or {@code type} is {@code
   *     null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
   * @throws DatasetExistsException if a {@code Dataset} for the given URI already exists
   * @throws IncompatibleSchemaException if the schema is not compatible with existing datasets with
   *     shared storage (for example, in the same HBase table)
   */
  @SuppressWarnings("unchecked")
  public static <E, V extends View<E>> V create(
      URI uri, DatasetDescriptor descriptor, Class<E> type) {
    boolean isView = URIBuilder.VIEW_SCHEME.equals(uri.getScheme());
    Preconditions.checkArgument(
        isView || URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
        "Not a dataset or view URI: " + uri);
    Preconditions.checkNotNull(
        type,
        "The entity type can't be null, use Object.class to have the type"
            + " determined by the schema.");

    Pair<DatasetRepository, Map<String, String>> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map<String, String> uriOptions = pair.second();

    if (descriptor.getLocation() == null && uriOptions.containsKey("location")) {
      descriptor =
          new DatasetDescriptor.Builder(descriptor).location(uriOptions.get("location")).build();
    }

    Dataset<E> dataset =
        repo.create(
            uriOptions.get(URIBuilder.NAMESPACE_OPTION),
            uriOptions.get(URIBuilder.DATASET_NAME_OPTION),
            descriptor,
            type);

    if (isView) {
      return Datasets.<E, V>view(dataset, uriOptions);
    } else {
      return (V) dataset;
    }
  }
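
A minimal usage sketch for create, following the Javadoc's suggestion to populate the new dataset with a DatasetWriter; the schema, URI, and record values are illustrative assumptions.

  Schema userSchema =
      SchemaBuilder.record("example.User").fields().requiredString("username").endRecord();

  // a dataset: URI returns the new, empty Dataset; a view: URI would return a view of it
  Dataset<GenericRecord> users =
      Datasets.create(
          URI.create("dataset:hdfs:/data/ns/users"),
          new DatasetDescriptor.Builder().schema(userSchema).build(),
          GenericRecord.class);

  DatasetWriter<GenericRecord> writer = users.newWriter();
  try {
    GenericRecord user = new GenericData.Record(userSchema);
    user.put("username", "user-0");
    writer.write(user);
  } finally {
    writer.close();
  }
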
Example #6
  @Override
  public int run() throws IOException {
    DatasetRepository repo = getDatasetRepository();

    if (targets == null || targets.isEmpty()) {
      throw new IllegalArgumentException("No views or datasets were specified.");
    }

    for (String uriOrName : targets) {
      if (isViewUri(uriOrName)) {
        View view = Datasets.load(uriOrName);
        Preconditions.checkArgument(
            viewMatches(view.getUri(), uriOrName),
            "Resolved view does not match requested view: " + view.getUri());
        view.deleteAll();
      } else if (isDatasetUri(uriOrName)) {
        Datasets.delete(uriOrName);
      } else {
        repo.delete(namespace, uriOrName);
      }
      console.debug("Deleted {}", uriOrName);
    }

    return 0;
  }

  @Test
  public void testUseReaderSchema() throws IOException {

    // Create a schema with only a username, so we can test reading it
    // with an enhanced record structure.
    Schema oldRecordSchema =
        SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
            .fields()
            .requiredString("username")
            .endRecord();

    // create the dataset
    Dataset<Record> in =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
    Dataset<Record> out =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
    Record oldUser = new Record(oldRecordSchema);
    oldUser.put("username", "user");

    DatasetWriter<Record> writer = in.newWriter();

    try {

      writer.write(oldUser);

    } finally {
      writer.close();
    }

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

    // read the data using NewUserRecord, whose schema adds fields beyond the
    // old schema that the records on disk were written with
    PCollection<NewUserRecord> data =
        pipeline.read(CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));

    PCollection<NewUserRecord> processed =
        data.parallelDo(new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));

    pipeline.write(processed, CrunchDatasets.asTarget(out));

    DatasetReader<Record> reader = out.newReader();

    Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

    try {

      // there should be one record that is equal to our old user generic record.
      Assert.assertEquals(oldUser, reader.next());
      Assert.assertFalse(reader.hasNext());

    } finally {
      reader.close();
    }
  }

  @Test
  public void testWriteModeCheckpointToNotReadyOutput() throws Exception {
    // identity partition so we can overwrite the output
    PartitionStrategy partitionStrategy =
        new PartitionStrategy.Builder().identity("username").build();

    Dataset<Record> inputDataset =
        repo.create(
            "ns",
            "in",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());
    Dataset<Record> outputDataset =
        repo.create(
            "ns",
            "out",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());

    writeTestUsers(inputDataset, 1, 0);

    // ensure output is newer than input on local filesystems with 1s granularity
    Thread.sleep(1000);

    runCheckpointPipeline(inputDataset, outputDataset);

    checkTestUsers(outputDataset, 1);

    // under hadoop1 the issues with LocalJobRunner (MAPREDUCE-2350) require that we
    // manually ready the output dataset
    if (Hadoop.isHadoop1()) {
      ((Signalable) outputDataset).signalReady();
    } else {
      // under hadoop2 the output will have been marked ready
      Assert.assertTrue(
          "output dataset should be ready after mapreduce", ((Signalable) outputDataset).isReady());
    }

    long lastModified = ((LastModifiedAccessor) outputDataset).getLastModified();

    // ensure output is newer than input on local filesystems with 1s granularity
    Thread.sleep(1000);

    // now output to a view, this ensures that the view isn't ready
    View<Record> outputView = outputDataset.with("username", "test-0");

    // re-run without changing input and output should change since the view is not ready
    runCheckpointPipeline(inputDataset, outputView);
    checkTestUsers(outputDataset, 1);
    Assert.assertTrue(((LastModifiedAccessor) outputView).getLastModified() > lastModified);
  }
Example #9
  /**
   * Check whether a {@link Dataset} identified by the given URI exists.
   *
   * <p>URIs must begin with {@code dataset:}. The remainder of the URI is implementation specific,
   * depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI
   * @return {@code true} if the dataset exists, {@code false} otherwise
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   */
  public static boolean exists(URI uri) {
    Preconditions.checkArgument(
        URIBuilder.DATASET_SCHEME.equals(uri.getScheme()), "Not a dataset URI: " + uri);

    Pair<DatasetRepository, Map<String, String>> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map<String, String> uriOptions = pair.second();

    return repo.exists(
        uriOptions.get(URIBuilder.NAMESPACE_OPTION),
        uriOptions.get(URIBuilder.DATASET_NAME_OPTION));
  }
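
A minimal usage sketch for exists, guarding creation behind an existence check; the URI and the descriptor variable are illustrative assumptions.

  URI usersUri = URI.create("dataset:hdfs:/data/ns/users");
  if (!Datasets.exists(usersUri)) {
    // descriptor is assumed to have been built elsewhere from the dataset's schema
    Datasets.create(usersUri, descriptor, GenericRecord.class);
  }
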
Example #10
  @Test(expected = CrunchRuntimeException.class)
  public void testWriteModeDefaultFailsWithExisting() throws IOException {
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 1, 0);
    writeTestUsers(outputDataset, 1, 0);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
    pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset));
  }
Example #11
  /**
   * List the {@link Dataset} URIs in the repository identified by the URI.
   *
   * <p>URI formats are defined by {@code Dataset} implementations. The repository URIs you pass to
   * this method must begin with {@code repo:}. For example, to list the {@code Dataset} URIs for
   * the Hive repository, provide the URI {@code repo:hive}.
   *
   * @param uri a {@code DatasetRepository} URI
   * @return the URIs present in the {@code DatasetRepository}
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException if {@code uri} is not a repository URI
   */
  public static Collection<URI> list(URI uri) {
    boolean isRepo = URIBuilder.REPO_SCHEME.equals(uri.getScheme());
    Preconditions.checkArgument(isRepo, "Not a repository URI: " + uri);
    DatasetRepository repo = Registration.open(URI.create(uri.getRawSchemeSpecificPart()));

    // build a URI for each dataset name
    URI repoUri = repo.getUri();
    List<URI> datasets = Lists.newArrayList();
    for (String namespace : repo.namespaces()) {
      for (String dataset : repo.datasets(namespace)) {
        datasets.add(new URIBuilder(repoUri, namespace, dataset).build());
      }
    }

    return datasets;
  }
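
A minimal usage sketch for list; the repository URI is an illustrative assumption.

  // prints one dataset: URI per dataset in the repository, across all namespaces
  for (URI datasetUri : Datasets.list(URI.create("repo:hdfs:/data"))) {
    System.out.println(datasetUri);
  }
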
Example #12
  @Test
  public void testGeneric() throws IOException {
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    // write two files, each of 5 records
    writeTestUsers(inputDataset, 5, 0);
    writeTestUsers(inputDataset, 5, 5);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
    pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
    pipeline.run();

    checkTestUsers(outputDataset, 10);
  }
Example #13
  @Test
  public void testWriteModeOverwrite() throws IOException {
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 1, 0);
    writeTestUsers(outputDataset, 1, 1);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
    pipeline.write(
        data, CrunchDatasets.asTarget((View<Record>) outputDataset), Target.WriteMode.OVERWRITE);

    pipeline.run();

    checkTestUsers(outputDataset, 1);
  }
Example #14
  @Test
  public void testWriteModeCheckpoint() throws Exception {
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 1, 0);

    // ensure output is newer than input on local filesystems with 1s granularity
    Thread.sleep(1000);
    runCheckpointPipeline(inputDataset, outputDataset);

    // under hadoop1 the issues with LocalJobRunner (MAPREDUCE-2350) require that we
    // manually ready the output dataset
    if (Hadoop.isHadoop1()) {
      ((Signalable) outputDataset).signalReady();
    }

    checkTestUsers(outputDataset, 1);

    long lastModified = ((LastModifiedAccessor) outputDataset).getLastModified();

    // re-run without changing input and output should not change
    runCheckpointPipeline(inputDataset, outputDataset);
    checkTestUsers(outputDataset, 1);
    Assert.assertEquals(lastModified, ((LastModifiedAccessor) outputDataset).getLastModified());

    // re-write input then re-run and output should be re-written
    Thread.sleep(1000); // ensure new input is newer than output
    repo.delete("ns", "in");
    inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    writeTestUsers(inputDataset, 1, 0);
    runCheckpointPipeline(inputDataset, outputDataset);

    checkTestUsers(outputDataset, 1);
    Assert.assertTrue(((LastModifiedAccessor) outputDataset).getLastModified() > lastModified);
  }
Example #15
  /**
   * Update a {@link Dataset} for the given dataset or view URI.
   *
   * <p>You can add columns, remove columns, or change the data type of columns in your dataset,
   * provided you don't attempt a change that is incompatible with written data. Avro defines rules
   * for compatible schema evolution. See <a
   * href="http://kitesdk.org/docs/current/Schema-Evolution.html">Schema Evolution</a>.
   *
   * <p>This method updates the dataset descriptor, so you can also add or change properties.
   *
   * <p>The recommended way to update a dataset descriptor is to build it based on an existing
   * descriptor. Use {@link DatasetDescriptor.Builder#Builder(DatasetDescriptor)} to build a
   * DatasetDescriptor based on an existing instance.
   *
   * <p>You cannot change a dataset format or partition strategy.
   *
   * <p>URIs must begin with {@code dataset:}. The remainder of the URI is implementation specific,
   * depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI
   * @param type a Java class that represents an entity in the dataset
   * @param <E> the type used for readers and writers created by this {@code Dataset}
   * @param <D> the type of {@code Dataset} expected
   * @return a {@code Dataset} for the given URI
   * @throws NullPointerException if {@code uri}, {@code descriptor}, or {@code type} is {@code
   *     null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   * @throws DatasetNotFoundException if there is no dataset for the given URI
   * @throws UnsupportedOperationException if descriptor updates are not supported by the
   *     implementation
   * @throws ConcurrentSchemaModificationException if the {@code Dataset} schema is updated
   *     concurrently
   * @throws IncompatibleSchemaException if the schema is not compatible with previous schemas, or
   *     with existing datasets with shared storage (for example, in the same HBase table)
   */
  @SuppressWarnings("unchecked")
  public static <E, D extends Dataset<E>> D update(
      URI uri, DatasetDescriptor descriptor, Class<E> type) {
    Preconditions.checkArgument(
        URIBuilder.DATASET_SCHEME.equals(uri.getScheme()), "Not a dataset URI: " + uri);
    Preconditions.checkNotNull(
        type,
        "The entity type can't be null, use Object.class to have the type"
            + " determined by the schema.");

    Pair<DatasetRepository, Map<String, String>> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map<String, String> uriOptions = pair.second();

    return (D)
        repo.update(
            uriOptions.get(URIBuilder.NAMESPACE_OPTION),
            uriOptions.get(URIBuilder.DATASET_NAME_OPTION),
            descriptor,
            type);
  }
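
A minimal usage sketch for update, building the new descriptor from the existing one as recommended above; the URI, record name, and added field are illustrative assumptions.

  URI usersUri = URI.create("dataset:hdfs:/data/ns/users");
  Dataset<GenericRecord> users = Datasets.load(usersUri, GenericRecord.class);

  Schema evolvedSchema =
      SchemaBuilder.record("example.User")
          .fields()
          .requiredString("username")
          .optionalString("favoriteColor") // added with a null default, so it stays compatible
          .endRecord();

  Datasets.update(
      usersUri,
      new DatasetDescriptor.Builder(users.getDescriptor()).schema(evolvedSchema).build(),
      GenericRecord.class);
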
Example #16
  @Before
  public void setUp() throws Exception {
    this.repo = newRepo();
    repo.delete("ns", "in");
    repo.delete("ns", "out");
  }