Пример #1
0
  /**
   * Resolve a {@code dataset:} or {@code view:} {@link URI} to a loaded {@link Dataset} or {@link
   * View}.
   *
   * <p>The URI scheme selects the result: a {@code dataset:} URI yields the dataset itself, with no
   * filtering applied, while a {@code view:} URI yields a {@code View} built from the loaded
   * dataset and the options parsed from the URI. Everything after the scheme is interpreted by the
   * dataset implementation registered for it.
   *
   * @param uri a {@code Dataset} or {@code View} URI
   * @param type a Java class that represents an entity in the dataset
   * @param <E> the type used for readers and writers created by this {@code Dataset}
   * @param <V> the type of {@code View} expected
   * @return the loaded dataset for a dataset URI, or a view of it for a view URI
   * @throws DatasetNotFoundException if there is no dataset for the given URI
   * @throws NullPointerException if any arguments are {@code null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
   */
  @SuppressWarnings("unchecked")
  public static <E, V extends View<E>> V load(URI uri, Class<E> type) {
    final String scheme = uri.getScheme();
    final boolean viewRequested = URIBuilder.VIEW_SCHEME.equals(scheme);

    Preconditions.checkArgument(
        viewRequested || URIBuilder.DATASET_SCHEME.equals(scheme),
        "Not a dataset or view URI: " + uri);
    Preconditions.checkNotNull(
        type,
        "The entity type can't be null, use Object.class to have the type"
            + " determined by the schema.");

    // strip the dataset:/view: prefix; the remainder is matched against registered patterns
    final URI storageUri = URI.create(uri.getRawSchemeSpecificPart());
    final Pair<DatasetRepository, Map<String, String>> resolved =
        Registration.lookupDatasetUri(storageUri);
    final DatasetRepository repository = resolved.first();
    final Map<String, String> options = resolved.second();

    final String namespace = options.get(URIBuilder.NAMESPACE_OPTION);
    final String name = options.get(URIBuilder.DATASET_NAME_OPTION);
    final Dataset<E> loaded = repository.load(namespace, name, type);

    if (!viewRequested) {
      // a plain dataset URI: hand back the dataset without building a view
      return (V) loaded;
    }
    return Datasets.<E, V>view(loaded, options);
  }
Пример #2
0
  /**
   * Create a new, empty {@link Dataset} at the location named by a dataset or view URI. Use a
   * {@code DatasetWriter} to populate it afterwards.
   *
   * <p>URIs must begin with {@code dataset:} or {@code view:}; the rest of the URI is
   * implementation specific, depending on the dataset scheme. For a view URI the underlying
   * dataset is created and a view of it is returned.
   *
   * <p>When the supplied descriptor has no location and the URI carries a {@code location} option,
   * the dataset is created from a copy of the descriptor with that location set.
   *
   * @param uri a {@code Dataset} or {@code View} URI
   * @param descriptor the descriptor used to create the dataset
   * @param type a Java class that represents an entity in the dataset
   * @param <E> the type used for readers and writers created by this {@code Dataset}
   * @param <V> the type of {@code Dataset} or {@code View} expected
   * @return a newly created {@code Dataset} responsible for the given URI
   * @throws NullPointerException if {@code uri}, {@code descriptor}, or {@code type} is {@code
   *     null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset or view URI
   * @throws DatasetExistsException if a {@code Dataset} for the given URI already exists
   * @throws IncompatibleSchemaException if the schema is not compatible with existing datasets with
   *     shared storage (for example, in the same HBase table)
   */
  @SuppressWarnings("unchecked")
  public static <E, V extends View<E>> V create(
      URI uri, DatasetDescriptor descriptor, Class<E> type) {
    final String scheme = uri.getScheme();
    final boolean viewRequested = URIBuilder.VIEW_SCHEME.equals(scheme);

    Preconditions.checkArgument(
        viewRequested || URIBuilder.DATASET_SCHEME.equals(scheme),
        "Not a dataset or view URI: " + uri);
    Preconditions.checkNotNull(
        type,
        "The entity type can't be null, use Object.class to have the type"
            + " determined by the schema.");

    final Pair<DatasetRepository, Map<String, String>> resolved =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    final DatasetRepository repository = resolved.first();
    final Map<String, String> options = resolved.second();

    // an explicit location on the descriptor wins; otherwise fall back to the URI's option
    DatasetDescriptor effectiveDescriptor = descriptor;
    if (descriptor.getLocation() == null && options.containsKey("location")) {
      effectiveDescriptor =
          new DatasetDescriptor.Builder(descriptor).location(options.get("location")).build();
    }

    final Dataset<E> created =
        repository.create(
            options.get(URIBuilder.NAMESPACE_OPTION),
            options.get(URIBuilder.DATASET_NAME_OPTION),
            effectiveDescriptor,
            type);

    if (!viewRequested) {
      return (V) created;
    }
    return Datasets.<E, V>view(created, options);
  }
Пример #3
0
  /**
   * Report whether the {@link Dataset} named by a dataset URI is present in its repository.
   *
   * <p>Only {@code dataset:} URIs are accepted. The part after the scheme is implementation
   * specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI
   * @return {@code true} if the dataset exists, {@code false} otherwise
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   */
  public static boolean exists(URI uri) {
    final boolean datasetUri = URIBuilder.DATASET_SCHEME.equals(uri.getScheme());
    Preconditions.checkArgument(datasetUri, "Not a dataset URI: " + uri);

    // drop the dataset: prefix and resolve the remainder to a repository plus parsed options
    final URI storageUri = URI.create(uri.getRawSchemeSpecificPart());
    final Pair<DatasetRepository, Map<String, String>> resolved =
        Registration.lookupDatasetUri(storageUri);
    final DatasetRepository repository = resolved.first();
    final Map<String, String> options = resolved.second();

    final String namespace = options.get(URIBuilder.NAMESPACE_OPTION);
    final String name = options.get(URIBuilder.DATASET_NAME_OPTION);
    return repository.exists(namespace, name);
  }
Пример #4
0
  /**
   * Register the {@code file:}, {@code hdfs:} and {@code webhdfs:} URI patterns with {@link
   * Registration}, all sharing a single {@code URIBuilder} instance.
   *
   * <p>{@code file:} and {@code hdfs:} are registered in both an absolute form (leading {@code /},
   * {@code absolute=true}) and a relative form; {@code webhdfs:} is registered in the absolute form
   * only.
   */
  @Override
  public void load() {
    final OptionBuilder<DatasetRepository> sharedBuilder = new URIBuilder();

    // Each row is { first pattern, second pattern }; the second pattern extends the first with
    // /:namespace/:dataset segments. Rows are registered top to bottom.
    final String[][] patternPairs = {
      {"file:/*path?absolute=true", "file:/*path/:namespace/:dataset?absolute=true"},
      {"file:*path", "file:*path/:namespace/:dataset"},
      {"hdfs:/*path?absolute=true", "hdfs:/*path/:namespace/:dataset?absolute=true"},
      {"hdfs:*path", "hdfs:*path/:namespace/:dataset"},
      {"webhdfs:/*path?absolute=true", "webhdfs:/*path/:namespace/:dataset?absolute=true"},
    };

    for (String[] patternPair : patternPairs) {
      Registration.register(
          new URIPattern(patternPair[0]), new URIPattern(patternPair[1]), sharedBuilder);
    }
  }
Пример #5
0
  /**
   * Enumerate the {@link Dataset} URIs held by the repository that a {@code repo:} URI points to.
   *
   * <p>URI formats are defined by {@code Dataset} implementations. The URI passed here must begin
   * with {@code repo:}; for example, {@code repo:hive} lists the {@code Dataset} URIs of the Hive
   * repository.
   *
   * <p>One URI is produced for every dataset in every namespace the repository reports.
   *
   * @param uri a {@code DatasetRepository} URI
   * @return the URIs present in the {@code DatasetRepository}
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException if {@code uri} is not a repository URI
   */
  public static Collection<URI> list(URI uri) {
    Preconditions.checkArgument(
        URIBuilder.REPO_SCHEME.equals(uri.getScheme()), "Not a repository URI: " + uri);

    // the part after repo: identifies the repository to open
    final URI repositoryLocation = URI.create(uri.getRawSchemeSpecificPart());
    final DatasetRepository repository = Registration.open(repositoryLocation);

    // every dataset URI is derived from the repository's own URI plus namespace and name
    final URI baseUri = repository.getUri();
    final List<URI> found = Lists.newArrayList();
    for (String namespace : repository.namespaces()) {
      for (String datasetName : repository.datasets(namespace)) {
        final URI datasetUri = new URIBuilder(baseUri, namespace, datasetName).build();
        found.add(datasetUri);
      }
    }

    return found;
  }
Пример #6
0
  /**
   * Update a {@link Dataset} for the given dataset URI.
   *
   * <p>You can add columns, remove columns, or change the data type of columns in your dataset,
   * provided you don't attempt a change that is incompatible with written data. Avro defines rules
   * for compatible schema evolution. See <a
   * href="http://kitesdk.org/docs/current/Schema-Evolution.html">Schema Evolution</a>.
   *
   * <p>This method updates the dataset descriptor, so you can also add or change properties.
   *
   * <p>The recommended way to update a dataset descriptor is to build it based on an existing
   * descriptor. Use {@link DatasetDescriptor.Builder#Builder(DatasetDescriptor)} to build a
   * DatasetDescriptor based on an existing instance.
   *
   * <p>You cannot change a dataset format or partition strategy.
   *
   * <p>URIs must begin with {@code dataset:}; view URIs are rejected. The remainder of the URI is
   * implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI
   * @param descriptor the updated descriptor to apply to the dataset
   * @param type a Java class that represents an entity in the dataset
   * @param <E> the type used for readers and writers created by this {@code Dataset}
   * @param <D> the type of {@code Dataset} expected
   * @return a {@code Dataset} for the given URI
   * @throws NullPointerException if {@code uri}, {@code descriptor}, or {@code type} is {@code
   *     null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   * @throws DatasetNotFoundException if there is no dataset for the given URI
   * @throws UnsupportedOperationException if descriptor updates are not supported by the
   *     implementation
   * @throws ConcurrentSchemaModificationException if the {@code Dataset} schema is updated
   *     concurrently
   * @throws IncompatibleSchemaException if the schema is not compatible with previous schemas, or
   *     with existing datasets with shared storage (for example, in the same HBase table)
   */
  @SuppressWarnings("unchecked")
  public static <E, D extends Dataset<E>> D update(
      URI uri, DatasetDescriptor descriptor, Class<E> type) {
    // Only dataset: URIs are accepted here, so the message must not suggest that a view: URI
    // would have been valid.
    Preconditions.checkArgument(
        URIBuilder.DATASET_SCHEME.equals(uri.getScheme()), "Not a dataset URI: " + uri);
    Preconditions.checkNotNull(
        type,
        "The entity type can't be null, use Object.class to have the type"
            + " determined by the schema.");

    // strip the dataset: prefix and resolve the remainder to a repository plus parsed options
    Pair<DatasetRepository, Map<String, String>> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map<String, String> uriOptions = pair.second();

    return (D)
        repo.update(
            uriOptions.get(URIBuilder.NAMESPACE_OPTION),
            uriOptions.get(URIBuilder.DATASET_NAME_OPTION),
            descriptor,
            type);
  }