@Override
  public <E> Dataset<E> create(String name, DatasetDescriptor descriptor) {

    Preconditions.checkArgument(name != null, "Name can not be null");
    Preconditions.checkArgument(descriptor != null, "Descriptor can not be null");
    Preconditions.checkArgument(
        descriptor.getLocation() == null,
        "Descriptor location cannot be set; " + "it is assigned by the MetadataProvider");

    DatasetDescriptor newDescriptor = metadataProvider.create(name, descriptor);
    newDescriptor = addRepositoryUri(newDescriptor);

    final URI location = newDescriptor.getLocation();
    if (location == null) {
      throw new DatasetRepositoryException(
          "[BUG] MetadataProvider did not assign a location to dataset:" + name);
    }

    ensureExists(newDescriptor, conf);

    logger.debug(
        "Created dataset:{} schema:{} datasetPath:{}",
        new Object[] {name, newDescriptor.getSchema(), location.toString()});

    return new FileSystemDataset.Builder()
        .name(name)
        .configuration(conf)
        .descriptor(newDescriptor)
        .partitionKey(
            newDescriptor.isPartitioned()
                ? org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey()
                : null)
        .partitionListener(getPartitionListener())
        .build();
  }
  @Override
  public <E> Dataset<E> load(String name) {
    Preconditions.checkArgument(name != null, "Name cannot be null");

    logger.debug("Loading dataset:{}", name);

    DatasetDescriptor descriptor = metadataProvider.load(name);
    descriptor = addRepositoryUri(descriptor);

    FileSystemDataset<E> ds =
        new FileSystemDataset.Builder()
            .name(name)
            .configuration(conf)
            .descriptor(descriptor)
            .partitionKey(
                descriptor.isPartitioned()
                    ? org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey()
                    : null)
            .partitionListener(getPartitionListener())
            .build();

    logger.debug("Loaded dataset:{}", ds);

    return ds;
  }
  /**
   * Get a {@link org.kitesdk.data.PartitionKey} corresponding to a partition's filesystem path
   * represented as a {@link URI}. If the path is not a valid partition, then {@link
   * IllegalArgumentException} is thrown. Note that the partition does not have to exist.
   *
   * @param dataset the filesystem dataset
   * @param partitionPath a directory path where the partition data is stored
   * @return a partition key representing the partition at the given path
   * @since 0.4.0
   */
  @SuppressWarnings("deprecation")
  public static PartitionKey partitionKeyForPath(Dataset dataset, URI partitionPath) {
    Preconditions.checkState(
        dataset.getDescriptor().isPartitioned(),
        "Attempt to get a partition on a non-partitioned dataset (name:%s)",
        dataset.getName());

    Preconditions.checkArgument(
        dataset instanceof FileSystemDataset, "Dataset is not a FileSystemDataset");
    FileSystemDataset fsDataset = (FileSystemDataset) dataset;

    FileSystem fs = fsDataset.getFileSystem();
    URI partitionUri = fs.makeQualified(new Path(partitionPath)).toUri();
    URI directoryUri = fsDataset.getDirectory().toUri();
    URI relativizedUri = directoryUri.relativize(partitionUri);

    if (relativizedUri.equals(partitionUri)) {
      throw new IllegalArgumentException(
          String.format(
              "Partition URI %s has different " + "root directory to dataset (directory: %s).",
              partitionUri, directoryUri));
    }

    Iterable<String> parts = Splitter.on('/').split(relativizedUri.getPath());

    PartitionStrategy partitionStrategy = dataset.getDescriptor().getPartitionStrategy();
    List<FieldPartitioner> fieldPartitioners = partitionStrategy.getFieldPartitioners();
    if (Iterables.size(parts) > fieldPartitioners.size()) {
      throw new IllegalArgumentException(
          String.format(
              "Too many partition directories " + "for %s (%s), expecting %s.",
              partitionUri, Iterables.size(parts), fieldPartitioners.size()));
    }

    List<Object> values = Lists.newArrayList();
    int i = 0;
    for (String part : parts) {
      Iterator<String> split = Splitter.on('=').split(part).iterator();
      String fieldName = split.next();
      FieldPartitioner fp = fieldPartitioners.get(i++);
      if (!fieldName.equals(fp.getName())) {
        throw new IllegalArgumentException(
            String.format(
                "Unrecognized partition name " + "'%s' in partition %s, expecting '%s'.",
                fieldName, partitionUri, fp.getName()));
      }
      if (!split.hasNext()) {
        throw new IllegalArgumentException(
            String.format(
                "Missing partition value for " + "'%s' in partition %s.", fieldName, partitionUri));
      }
      String stringValue = split.next();
      Object value = fp.valueFromString(stringValue);
      values.add(value);
    }
    return org.kitesdk.data.impl.Accessor.getDefault()
        .newPartitionKey(values.toArray(new Object[values.size()]));
  }
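
  /*
   * Usage sketch (not part of the original listing): how a partition directory
   * maps back to a PartitionKey via partitionKeyForPath above. The repository
   * instance, the dataset name "users", the example URI, and the assumption
   * that the dataset is partitioned by identity("username") are all
   * illustrative, not taken from the original code.
   */
  static void partitionKeyForPathExample(DatasetRepository repo) {
    // Load a partitioned dataset managed by the repository.
    Dataset<Object> users = repo.load("users");

    // A partition directory of the form <datasetDir>/<fieldName>=<value>.
    // The directory does not have to exist for the lookup to succeed.
    URI partitionUri = URI.create("hdfs:///data/users/username=tom");

    // Throws IllegalArgumentException if the path lies outside the dataset
    // directory or does not match the dataset's partition strategy.
    PartitionKey key = partitionKeyForPath(users, partitionUri);
    System.out.println(key.getValues());
  }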
Example #4
  @SuppressWarnings("unchecked")
  PartitionKey fromDirectoryName(Path dir) {
    // Decode the child directory name with the first field partitioner of this
    // dataset's strategy and append it to this dataset's partition key, if any.
    final FieldPartitioner fp = partitionStrategy.getFieldPartitioners().get(0);
    final List<Object> values = Lists.newArrayList();

    if (partitionKey != null) {
      values.addAll(partitionKey.getValues());
    }

    values.add(convert.valueForDirname(fp, dir.getName()));

    return Accessor.getDefault().newPartitionKey(values.toArray());
  }
Example #5
  @Override
  @Nullable
  @Deprecated
  public Dataset<E> getPartition(PartitionKey key, boolean allowCreate) {
    Preconditions.checkState(
        descriptor.isPartitioned(),
        "Attempt to get a partition on a non-partitioned dataset (name:%s)",
        name);

    logger.debug("Loading partition for key {}, allowCreate:{}", new Object[] {key, allowCreate});

    Path partitionDirectory = fileSystem.makeQualified(toDirectoryName(directory, key));

    try {
      if (!fileSystem.exists(partitionDirectory)) {
        if (allowCreate) {
          fileSystem.mkdirs(partitionDirectory);
          if (partitionListener != null) {
            partitionListener.partitionAdded(name, toRelativeDirectory(key).toString());
          }
        } else {
          return null;
        }
      }
    } catch (IOException e) {
      throw new DatasetException(
          "Unable to locate or create dataset partition directory " + partitionDirectory, e);
    }

    int partitionDepth = key.getLength();
    PartitionStrategy subpartitionStrategy =
        Accessor.getDefault().getSubpartitionStrategy(partitionStrategy, partitionDepth);

    return new FileSystemDataset.Builder()
        .name(name)
        .fileSystem(fileSystem)
        .descriptor(
            new DatasetDescriptor.Builder(descriptor)
                .location(partitionDirectory)
                .partitionStrategy(subpartitionStrategy)
                .build())
        .partitionKey(key)
        .partitionListener(partitionListener)
        .build();
  }
Example #6
  @Override
  @Deprecated
  public Iterable<Dataset<E>> getPartitions() {
    Preconditions.checkState(
        descriptor.isPartitioned(),
        "Attempt to get partitions on a non-partitioned dataset (name:%s)",
        name);

    List<Dataset<E>> partitions = Lists.newArrayList();

    FileStatus[] fileStatuses;

    try {
      fileStatuses = fileSystem.listStatus(directory, PathFilters.notHidden());
    } catch (IOException e) {
      throw new DatasetException(
          "Unable to list partition directory for directory " + directory, e);
    }

    for (FileStatus stat : fileStatuses) {
      Path p = fileSystem.makeQualified(stat.getPath());
      PartitionKey key = fromDirectoryName(p);
      PartitionStrategy subPartitionStrategy =
          Accessor.getDefault().getSubpartitionStrategy(partitionStrategy, 1);
      Builder builder =
          new FileSystemDataset.Builder()
              .name(name)
              .fileSystem(fileSystem)
              .descriptor(
                  new DatasetDescriptor.Builder(descriptor)
                      .location(p)
                      .partitionStrategy(subPartitionStrategy)
                      .build())
              .partitionKey(key)
              .partitionListener(partitionListener);

      partitions.add(builder.<E>build());
    }

    return partitions;
  }
  @Override
  public <E> Dataset<E> update(String name, DatasetDescriptor descriptor) {
    Preconditions.checkArgument(name != null, "Dataset name cannot be null");
    Preconditions.checkArgument(descriptor != null, "DatasetDescriptro cannot be null");

    DatasetDescriptor oldDescriptor = metadataProvider.load(name);

    // oldDescriptor is valid if load didn't throw NoSuchDatasetException

    if (!oldDescriptor.getFormat().equals(descriptor.getFormat())) {
      throw new DatasetRepositoryException(
          "Cannot change dataset format from "
              + oldDescriptor.getFormat()
              + " to "
              + descriptor.getFormat());
    }

    final URI oldLocation = oldDescriptor.getLocation();
    if ((oldLocation != null) && !(oldLocation.equals(descriptor.getLocation()))) {
      throw new DatasetRepositoryException("Cannot change the dataset's location");
    }

    if (oldDescriptor.isPartitioned() != descriptor.isPartitioned()) {
      throw new DatasetRepositoryException(
          "Cannot change an unpartitioned dataset to " + " partitioned or vice versa.");
    } else if (oldDescriptor.isPartitioned()
        && descriptor.isPartitioned()
        && !oldDescriptor.getPartitionStrategy().equals(descriptor.getPartitionStrategy())) {
      throw new DatasetRepositoryException(
          "Cannot change partition strategy from "
              + oldDescriptor.getPartitionStrategy()
              + " to "
              + descriptor.getPartitionStrategy());
    }

    // check can read records written with old schema using new schema
    final Schema oldSchema = oldDescriptor.getSchema();
    final Schema newSchema = descriptor.getSchema();
    if (!SchemaValidationUtil.canRead(oldSchema, newSchema)) {
      throw new IncompatibleSchemaException(
          "New schema cannot read data "
              + "written using "
              + "old schema. New schema: "
              + newSchema.toString(true)
              + "\nOld schema: "
              + oldSchema.toString(true));
    }

    DatasetDescriptor updatedDescriptor = metadataProvider.update(name, descriptor);
    updatedDescriptor = addRepositoryUri(updatedDescriptor);

    logger.debug(
        "Updated dataset:{} schema:{} datasetPath:{}",
        new Object[] {
          name, updatedDescriptor.getSchema(), updatedDescriptor.getLocation().toString()
        });

    return new FileSystemDataset.Builder()
        .name(name)
        .configuration(conf)
        .descriptor(updatedDescriptor)
        .partitionKey(
            updatedDescriptor.isPartitioned()
                ? org.kitesdk.data.impl.Accessor.getDefault().newPartitionKey()
                : null)
        .partitionListener(getPartitionListener())
        .build();
  }
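
  /*
   * Illustrative sketch (not part of the original listing) of the schema check
   * enforced by update() above: the new (reader) schema must be able to read
   * records written with the old (writer) schema. The schemas below are
   * made-up examples; SchemaValidationUtil.canRead is the same helper used in
   * update().
   */
  static void schemaCompatibilityExample() {
    Schema oldSchema =
        new Schema.Parser()
            .parse(
                "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
                    + "{\"name\":\"id\",\"type\":\"long\"}]}");

    // Adding an optional field with a default keeps old records readable.
    Schema newSchema =
        new Schema.Parser()
            .parse(
                "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
                    + "{\"name\":\"id\",\"type\":\"long\"},"
                    + "{\"name\":\"email\",\"type\":[\"null\",\"string\"],\"default\":null}]}");

    // Prints true: data written with oldSchema can be read using newSchema.
    System.out.println(SchemaValidationUtil.canRead(oldSchema, newSchema));
  }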
Example #8
  // Registers the library's Accessor implementation so that callers can obtain
  // it via Accessor.getDefault() (as in the newPartitionKey(...) and
  // getSubpartitionStrategy(...) calls above).
  static {
    Accessor.setDefault(new AccessorImpl());
  }