@Override
public void write(E entity) {
  Preconditions.checkState(
      state.equals(ReaderWriterState.OPEN),
      "Attempt to write to a writer in state:%s", state);

  reusedKey.reuseFor(entity);

  DatasetWriter<E> writer = cachedWriters.getIfPresent(reusedKey);
  if (writer == null) {
    // avoid checking on every write whether the entity belongs in the view by
    // only checking when a new writer is created
    Preconditions.checkArgument(view.includes(entity),
        "View %s does not include entity %s", view, entity);
    // get a new key because it is stored in the cache
    StorageKey key = StorageKey.copy(reusedKey);
    try {
      writer = cachedWriters.getUnchecked(key);
    } catch (UncheckedExecutionException ex) {
      throw new IllegalArgumentException(
          "Problem creating view for entity: " + entity, ex.getCause());
    }
  }

  writer.write(entity);
}
@Override
public void onRemoval(
    RemovalNotification<StorageKey, DatasetWriter<E>> notification) {
  DatasetWriter<E> writer = notification.getValue();
  logger.debug("Closing writer:{} for partition:{}",
      writer, notification.getKey());
  writer.close();
}
@Test
public void testUseReaderSchema() throws IOException {
  // Create a schema with only a username, so we can test reading it
  // with an enhanced record structure.
  Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
      .fields()
      .requiredString("username")
      .endRecord();

  // create the datasets
  Dataset<Record> in = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
  Dataset<Record> out = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(oldRecordSchema).build());

  Record oldUser = new Record(oldRecordSchema);
  oldUser.put("username", "user");

  DatasetWriter<Record> writer = in.newWriter();
  try {
    writer.write(oldUser);
  } finally {
    writer.close();
  }

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

  // read the dataset written with the old schema, using NewUserRecord
  // (the enhanced record structure) as the reader schema
  PCollection<NewUserRecord> data = pipeline.read(
      CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));
  PCollection<NewUserRecord> processed = data.parallelDo(
      new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));
  pipeline.write(processed, CrunchDatasets.asTarget(out));

  DatasetReader<Record> reader = out.newReader();

  Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

  try {
    // there should be one record that is equal to our old user generic record
    Assert.assertEquals(oldUser, reader.next());
    Assert.assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
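/*
 * UserRecordIdentityFn is referenced in the test above but not shown in this
 * excerpt. A minimal identity function, sketched here as an assumed
 * implementation using Crunch's org.apache.crunch.DoFn and Emitter (the class
 * name matches the test; the body is an assumption), could look like this:
 */
private static class UserRecordIdentityFn
    extends DoFn<NewUserRecord, NewUserRecord> {
  @Override
  public void process(NewUserRecord input, Emitter<NewUserRecord> emitter) {
    // pass each record through unchanged; the test exercises the
    // reader-schema conversion, not the transformation itself
    emitter.emit(input);
  }
}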
@Override
public void close() {
  if (state.equals(ReaderWriterState.OPEN)) {
    logger.debug("Closing all cached writers for view:{}", view);
    for (DatasetWriter<E> writer : cachedWriters.asMap().values()) {
      logger.debug("Closing partition writer:{}", writer);
      writer.close();
    }
    state = ReaderWriterState.CLOSED;
  }
}
private static void writeTestRecords(View<TestRecord> view) {
  DatasetWriter<TestRecord> writer = null;
  try {
    writer = view.newWriter();
    for (int i = 0; i < 10; i += 1) {
      TestRecord record = new TestRecord();
      record.id = i;
      record.data = "test/-" + i;
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }
}
@Override
public void flush() {
  Preconditions.checkState(
      state.equals(ReaderWriterState.OPEN),
      "Attempt to flush a writer in state:%s", state);

  logger.debug("Flushing all cached writers for view:{}", view);

  /*
   * There is a potential for flushing entries that are created by other
   * threads while looping through the writers. While normally just wasteful,
   * on HDFS this is particularly bad. We should probably do something about
   * this, but it will be difficult as Cache (ideally) uses multiple
   * partitions to prevent cached writer contention.
   */
  for (DatasetWriter<E> writer : cachedWriters.asMap().values()) {
    logger.debug("Flushing partition writer:{}", writer);
    writer.flush();
  }
}
@Override
public DatasetWriter<E> load(StorageKey key) throws Exception {
  Preconditions.checkState(
      view.getDataset() instanceof FileSystemDataset,
      "FileSystemWriters cannot create writer for " + view.getDataset());

  FileSystemDataset dataset = (FileSystemDataset) view.getDataset();
  Path partition = convert.fromKey(key);
  DatasetWriter<E> writer = new FileSystemWriter<E>(
      dataset.getFileSystem(),
      new Path(dataset.getDirectory(), partition),
      dataset.getDescriptor());

  PartitionListener listener = dataset.getPartitionListener();
  if (listener != null) {
    listener.partitionAdded(dataset.getName(), partition.toString());
  }

  writer.open();
  return writer;
}
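/*
 * The cachedWriters field used by write(), flush(), and close() above is not
 * shown in this excerpt. A minimal sketch of how the CacheLoader here and the
 * RemovalListener earlier could be wired together with Guava's CacheBuilder
 * (the method and parameter names below are assumptions, not the project's
 * actual field initialization):
 */
private LoadingCache<StorageKey, DatasetWriter<E>> buildWriterCache(
    CacheLoader<StorageKey, DatasetWriter<E>> loader,
    RemovalListener<StorageKey, DatasetWriter<E>> listener,
    int maxWriters) {
  return CacheBuilder.newBuilder()
      .maximumSize(maxWriters)   // bound the number of open partition writers
      .removalListener(listener) // close writers as they are evicted
      .build(loader);            // load() opens a new writer per StorageKey
}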