  @Override
  public void write(E entity) {
    Preconditions.checkState(
        state.equals(ReaderWriterState.OPEN), "Attempt to write to a writer in state:%s", state);

    reusedKey.reuseFor(entity);

    DatasetWriter<E> writer = cachedWriters.getIfPresent(reusedKey);
    if (writer == null) {
      // avoid checking on every write whether the entity belongs in the view
      // by only checking when a new writer is created
      Preconditions.checkArgument(
          view.includes(entity), "View %s does not include entity %s", view, entity);
      // copy the reused key because the cache holds on to it
      StorageKey key = StorageKey.copy(reusedKey);
      try {
        writer = cachedWriters.getUnchecked(key);
      } catch (UncheckedExecutionException ex) {
        throw new IllegalArgumentException(
            "Problem creating view for entity: " + entity, ex.getCause());
      }
    }

    writer.write(entity);
  }
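A minimal usage sketch from the caller's side (the dataset URI and the field names are assumptions, not taken from the snippet above): each write() call is routed to a per-partition writer that the cache keeps open.

import org.apache.avro.generic.GenericData.Record;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.Datasets;

public class PartitionedWriteSketch {
  public static void main(String[] args) {
    // hypothetical URI for an existing partitioned dataset
    Dataset<Record> events = Datasets.load("dataset:hdfs:/data/events", Record.class);

    DatasetWriter<Record> writer = null;
    try {
      writer = events.newWriter();
      Record event = new Record(events.getDescriptor().getSchema());
      event.put("timestamp", System.currentTimeMillis()); // assumed fields
      event.put("level", "INFO");
      writer.write(event); // delegates to the cached writer for the event's partition
    } finally {
      if (writer != null) {
        writer.close(); // closes every cached partition writer
      }
    }
  }
}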
    @Override
    public void onRemoval(RemovalNotification<StorageKey, DatasetWriter<E>> notification) {

      DatasetWriter<E> writer = notification.getValue();

      logger.debug("Closing writer:{} for partition:{}", writer, notification.getKey());

      writer.close();
    }
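A sketch of how the writer cache itself could be wired up with Guava (the cache size and class names here are assumptions): the removal listener above closes a partition writer whenever the cache evicts it, and the supplied CacheLoader opens one on first access.

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.cache.RemovalListener;
import com.google.common.cache.RemovalNotification;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.spi.StorageKey;

class WriterCacheSketch {
  static <E> LoadingCache<StorageKey, DatasetWriter<E>> buildCache(
      CacheLoader<StorageKey, DatasetWriter<E>> loader, int maxOpenWriters) {
    return CacheBuilder.newBuilder()
        .maximumSize(maxOpenWriters) // bound the number of open files
        .removalListener(
            new RemovalListener<StorageKey, DatasetWriter<E>>() {
              @Override
              public void onRemoval(
                  RemovalNotification<StorageKey, DatasetWriter<E>> notification) {
                notification.getValue().close(); // evicted writers must be closed
              }
            })
        .build(loader); // the loader opens one writer per partition, as in load() below
  }
}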
Example #3
  @Test
  public void testUseReaderSchema() throws IOException {

    // Create a schema with only a username, so we can test reading it
    // with an enhanced record structure.
    Schema oldRecordSchema =
        SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
            .fields()
            .requiredString("username")
            .endRecord();

    // create the dataset
    Dataset<Record> in =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
    Dataset<Record> out =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
    Record oldUser = new Record(oldRecordSchema);
    oldUser.put("username", "user");

    DatasetWriter<Record> writer = in.newWriter();

    try {

      writer.write(oldUser);

    } finally {
      writer.close();
    }

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

    // read the data back using the NewUserRecord class, whose reader schema is
    // the enhanced one; the records themselves were written with the old schema
    PCollection<NewUserRecord> data =
        pipeline.read(CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));

    PCollection<NewUserRecord> processed =
        data.parallelDo(new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));

    pipeline.write(processed, CrunchDatasets.asTarget(out));

    DatasetReader<Record> reader = out.newReader();

    Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

    try {

      // there should be one record that is equal to our old user generic record.
      Assert.assertEquals(oldUser, reader.next());
      Assert.assertFalse(reader.hasNext());

    } finally {
      reader.close();
    }
  }
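For reference, a sketch of the kind of enhanced reader schema NewUserRecord would carry (the added field and its default are assumptions; the class's real schema is not shown here). The added field needs a default value so records written with the old, username-only schema can still be resolved against it; this uses the same org.apache.avro.SchemaBuilder as the test above.

    Schema newRecordSchema =
        SchemaBuilder.record("org.kitesdk.data.user.NewUserRecord")
            .fields()
            .requiredString("username")
            // hypothetical added field; the default keeps old records readable
            .name("favoriteColor").type().stringType().stringDefault("unknown")
            .endRecord();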
  @Override
  public void close() {
    if (state.equals(ReaderWriterState.OPEN)) {

      logger.debug("Closing all cached writers for view:{}", view);

      for (DatasetWriter<E> writer : cachedWriters.asMap().values()) {
        logger.debug("Closing partition writer:{}", writer);
        writer.close();
      }

      state = ReaderWriterState.CLOSED;
    }
  }
  private static void writeTestRecords(View<TestRecord> view) {
    DatasetWriter<TestRecord> writer = null;
    try {
      writer = view.newWriter();
      for (int i = 0; i < 10; i += 1) {
        TestRecord record = new TestRecord();
        record.id = i;
        record.data = "test/-" + i;
        writer.write(record);
      }

    } finally {
      if (writer != null) {
        writer.close();
      }
    }
  }
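An equivalent helper written with try-with-resources, sketched under the assumption that the DatasetWriter in use implements java.io.Closeable (true in recent Kite releases); it shows an alternative style, not the test's own code.

  private static void writeTestRecordsWithResources(View<TestRecord> view) {
    try (DatasetWriter<TestRecord> writer = view.newWriter()) {
      for (int i = 0; i < 10; i += 1) {
        TestRecord record = new TestRecord();
        record.id = i;
        record.data = "test/-" + i;
        writer.write(record);
      }
    }
  }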
  @Override
  public void flush() {
    Preconditions.checkState(
        state.equals(ReaderWriterState.OPEN), "Attempt to write to a writer in state:%s", state);

    logger.debug("Flushing all cached writers for view:{}", view);

    /*
     * There's a potential for flushing entries that are created by other
     * threads while looping through the writers. While normally just wasteful,
     * on HDFS, this is particularly bad. We should probably do something about
     * this, but it will be difficult as Cache (ideally) uses multiple
     * partitions to prevent cached writer contention.
     */
    for (DatasetWriter<E> writer : cachedWriters.asMap().values()) {
      logger.debug("Flushing partition writer:{}", writer);
      writer.flush();
    }
  }
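One possible mitigation for the concern in the comment above, sketched as an assumption rather than anything the class actually does: snapshot the currently cached writers (com.google.common.collect.ImmutableList) before flushing, so writers created by other threads mid-loop are left untouched.

  private void flushSnapshot() {
    for (DatasetWriter<E> writer :
        ImmutableList.copyOf(cachedWriters.asMap().values())) {
      logger.debug("Flushing partition writer:{}", writer);
      writer.flush();
    }
  }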
    @Override
    public DatasetWriter<E> load(StorageKey key) throws Exception {
      Preconditions.checkState(
          view.getDataset() instanceof FileSystemDataset,
          "FileSystemWriters cannot create writer for " + view.getDataset());

      FileSystemDataset dataset = (FileSystemDataset) view.getDataset();
      Path partition = convert.fromKey(key);
      DatasetWriter<E> writer =
          new FileSystemWriter<E>(
              dataset.getFileSystem(),
              new Path(dataset.getDirectory(), partition),
              dataset.getDescriptor());

      PartitionListener listener = dataset.getPartitionListener();
      if (listener != null) {
        listener.partitionAdded(dataset.getName(), partition.toString());
      }

      writer.open();

      return writer;
    }