@Test
public void testGetProgressFromUnstartedReader() throws Exception {
  List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_DEFAULT,
          1000,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);
  File file = new File(filename);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  try (BoundedSource.BoundedReader<FixedRecord> reader = source.createReader(null)) {
    assertEquals(new Double(0.0), reader.getFractionConsumed());
  }

  List<? extends BoundedSource<FixedRecord>> splits =
      source.splitIntoBundles(file.length() / 3, null);
  for (BoundedSource<FixedRecord> subSource : splits) {
    try (BoundedSource.BoundedReader<FixedRecord> reader = subSource.createReader(null)) {
      assertEquals(new Double(0.0), reader.getFractionConsumed());
    }
  }
}
@Test
public void testCreationWithSchema() throws Exception {
  List<Bird> expected = createRandomRecords(100);
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);

  // Create a source with a schema object.
  Schema schema = ReflectData.get().getSchema(Bird.class);
  AvroSource<GenericRecord> source = AvroSource.from(filename).withSchema(schema);
  List<GenericRecord> records = SourceTestUtils.readFromSource(source, null);
  assertEqualsWithGeneric(expected, records);

  // Create a source with a JSON schema.
  String schemaString = ReflectData.get().getSchema(Bird.class).toString();
  source = AvroSource.from(filename).withSchema(schemaString);
  records = SourceTestUtils.readFromSource(source, null);
  assertEqualsWithGeneric(expected, records);

  // Create a source with no schema.
  source = AvroSource.from(filename);
  records = SourceTestUtils.readFromSource(source, null);
  assertEqualsWithGeneric(expected, records);
}
@Test
public void testSplitAtFraction() throws Exception {
  List<FixedRecord> expected = createFixedRecords(DEFAULT_RECORD_COUNT);
  // Create an AvroSource where each block is 16k.
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_REGULAR,
          1000,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);
  File file = new File(filename);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  List<? extends BoundedSource<FixedRecord>> splits =
      source.splitIntoBundles(file.length() / 3, null);
  for (BoundedSource<FixedRecord> subSource : splits) {
    int items = SourceTestUtils.readFromSource(subSource, null).size();
    // Shouldn't split while unstarted.
    SourceTestUtils.assertSplitAtFractionFails(subSource, 0, 0.0, null);
    SourceTestUtils.assertSplitAtFractionFails(subSource, 0, 0.7, null);
    SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, 1, 0.7, null);
    SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, 100, 0.7, null);
    SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, 1000, 0.1, null);
    SourceTestUtils.assertSplitAtFractionFails(subSource, 1001, 0.1, null);
    SourceTestUtils.assertSplitAtFractionFails(subSource, DEFAULT_RECORD_COUNT / 3, 0.3, null);
    SourceTestUtils.assertSplitAtFractionFails(subSource, items, 0.9, null);
    SourceTestUtils.assertSplitAtFractionFails(subSource, items, 1.0, null);
    SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, items, 0.999, null);
  }
}
/**
 * Generates an input Avro file containing the given records in the temporary directory and
 * returns the full path of the file.
 */
private <T> String generateTestFile(
    String filename,
    List<T> elems,
    SyncBehavior syncBehavior,
    int syncInterval,
    AvroCoder<T> coder,
    String codec)
    throws IOException {
  Random random = new Random(0);
  File tmpFile = tmpFolder.newFile(filename);
  String path = tmpFile.toString();

  FileOutputStream os = new FileOutputStream(tmpFile);
  DatumWriter<T> datumWriter = coder.createDatumWriter();
  try (DataFileWriter<T> writer = new DataFileWriter<>(datumWriter)) {
    writer.setCodec(CodecFactory.fromString(codec));
    writer.create(coder.getSchema(), os);
    int recordIndex = 0;
    int syncIndex = syncBehavior == SyncBehavior.SYNC_RANDOM ? random.nextInt(syncInterval) : 0;

    for (T elem : elems) {
      writer.append(elem);
      recordIndex++;

      switch (syncBehavior) {
        case SYNC_REGULAR:
          if (recordIndex == syncInterval) {
            recordIndex = 0;
            writer.sync();
          }
          break;
        case SYNC_RANDOM:
          if (recordIndex == syncIndex) {
            recordIndex = 0;
            writer.sync();
            syncIndex = random.nextInt(syncInterval);
          }
          break;
        case SYNC_DEFAULT:
        default:
      }
    }
  }
  return path;
}
@Test
public void testSplitsWithSmallBlocks() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  // Test reading from an object file with many small random-sized blocks.
  List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_RANDOM,
          100 /* max records/block */,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);
  File file = new File(filename);

  // Small minimum bundle size.
  AvroSource<Bird> source =
      AvroSource.from(filename).withSchema(Bird.class).withMinBundleSize(100L);

  // Assert that the source produces the expected records.
  assertEquals(expected, SourceTestUtils.readFromSource(source, options));

  List<? extends BoundedSource<Bird>> splits;
  int nonEmptySplits;

  // Split with the minimum bundle size.
  splits = source.splitIntoBundles(100L, options);
  assertTrue(splits.size() > 2);
  SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
  nonEmptySplits = 0;
  for (BoundedSource<Bird> subSource : splits) {
    if (SourceTestUtils.readFromSource(subSource, options).size() > 0) {
      nonEmptySplits += 1;
    }
  }
  assertTrue(nonEmptySplits > 2);

  // Split with a larger bundle size.
  splits = source.splitIntoBundles(file.length() / 4, options);
  assertTrue(splits.size() > 2);
  SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
  nonEmptySplits = 0;
  for (BoundedSource<Bird> subSource : splits) {
    if (SourceTestUtils.readFromSource(subSource, options).size() > 0) {
      nonEmptySplits += 1;
    }
  }
  assertTrue(nonEmptySplits > 2);

  // Split with the file length.
  splits = source.splitIntoBundles(file.length(), options);
  assertTrue(splits.size() == 1);
  SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
}
@Test
public void testSplitAtFractionExhaustive() throws Exception {
  List<FixedRecord> expected = createFixedRecords(50);
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_REGULAR,
          5,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  SourceTestUtils.assertSplitAtFractionExhaustive(source, null);
}
@Override
public PDone apply(PCollection<T> input) {
  if (filenamePrefix == null) {
    throw new IllegalStateException(
        "need to set the filename prefix of an AvroIO.Write transform");
  }
  if (schema == null) {
    throw new IllegalStateException("need to set the schema of an AvroIO.Write transform");
  }

  // Note that custom sinks currently do not expose sharding controls.
  // Thus pipeline runner writers need to individually add support internally to
  // apply user requested sharding limits.
  return input.apply(
      "Write",
      com.google.cloud.dataflow.sdk.io.Write.to(
          new AvroSink<>(
              filenamePrefix, filenameSuffix, shardTemplate, AvroCoder.of(type, schema))));
}
@Test
public void testReadWithDifferentCodecs() throws Exception {
  // Test reading files generated using all codecs.
  String[] codecs = {
    DataFileConstants.NULL_CODEC,
    DataFileConstants.BZIP2_CODEC,
    DataFileConstants.DEFLATE_CODEC,
    DataFileConstants.SNAPPY_CODEC,
    DataFileConstants.XZ_CODEC,
  };
  List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);

  for (String codec : codecs) {
    String filename =
        generateTestFile(
            codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);

    AvroSource<Bird> source = AvroSource.from(filename).withSchema(Bird.class);
    List<Bird> actual = SourceTestUtils.readFromSource(source, null);
    assertThat(expected, containsInAnyOrder(actual.toArray()));
  }
}
@Test
public void testMultipleFiles() throws Exception {
  String baseName = "tmp-";
  List<Bird> expected = new ArrayList<>();
  for (int i = 0; i < 10; i++) {
    List<Bird> contents = createRandomRecords(DEFAULT_RECORD_COUNT / 10);
    expected.addAll(contents);
    generateTestFile(
        baseName + i,
        contents,
        SyncBehavior.SYNC_DEFAULT,
        0,
        AvroCoder.of(Bird.class),
        DataFileConstants.NULL_CODEC);
  }

  AvroSource<Bird> source =
      AvroSource.from(new File(tmpFolder.getRoot().toString(), baseName + "*").toString())
          .withSchema(Bird.class);
  List<Bird> actual = SourceTestUtils.readFromSource(source, null);
  assertThat(actual, containsInAnyOrder(expected.toArray()));
}
@Test
public void testGetCurrentFromUnstartedReader() throws Exception {
  List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_DEFAULT,
          1000,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  try (BlockBasedSource.BlockBasedReader<FixedRecord> reader =
      (BlockBasedSource.BlockBasedReader<FixedRecord>) source.createReader(null)) {
    assertEquals(null, reader.getCurrentBlock());

    expectedException.expect(NoSuchElementException.class);
    expectedException.expectMessage("No block has been successfully read from");
    reader.getCurrent();
  }
}
@SuppressWarnings("deprecation") // uses internal test functionality. @Override protected void prepareWrite(WritableByteChannel channel) throws Exception { dataFileWriter = new DataFileWriter<>(coder.createDatumWriter()); dataFileWriter.create(coder.getSchema(), Channels.newOutputStream(channel)); }
@Override
protected Coder<T> getDefaultOutputCoder() {
  return AvroCoder.of(type, schema);
}