/**
 * Verifies that {@code getFractionConsumed()} reports 0.0 on a reader that has not been
 * started, both for the whole-file source and for each of its splits.
 */
@Test
public void testGetProgressFromUnstartedReader() throws Exception {
  List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_DEFAULT,
          1000,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);
  File file = new File(filename);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  try (BoundedSource.BoundedReader<FixedRecord> reader = source.createReader(null)) {
    // Double.valueOf replaces the deprecated `new Double(...)` boxing constructor;
    // equality via assertEquals is unchanged.
    assertEquals(Double.valueOf(0.0), reader.getFractionConsumed());
  }

  // The same must hold for every unstarted sub-reader produced by splitting.
  List<? extends BoundedSource<FixedRecord>> splits =
      source.splitIntoBundles(file.length() / 3, null);
  for (BoundedSource<FixedRecord> subSource : splits) {
    try (BoundedSource.BoundedReader<FixedRecord> reader = subSource.createReader(null)) {
      assertEquals(Double.valueOf(0.0), reader.getFractionConsumed());
    }
  }
}
@Test public void testCreationWithSchema() throws Exception { List<Bird> expected = createRandomRecords(100); String filename = generateTestFile( "tmp.avro", expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), DataFileConstants.NULL_CODEC); // Create a source with a schema object Schema schema = ReflectData.get().getSchema(Bird.class); AvroSource<GenericRecord> source = AvroSource.from(filename).withSchema(schema); List<GenericRecord> records = SourceTestUtils.readFromSource(source, null); assertEqualsWithGeneric(expected, records); // Create a source with a JSON schema String schemaString = ReflectData.get().getSchema(Bird.class).toString(); source = AvroSource.from(filename).withSchema(schemaString); records = SourceTestUtils.readFromSource(source, null); assertEqualsWithGeneric(expected, records); // Create a source with no schema source = AvroSource.from(filename); records = SourceTestUtils.readFromSource(source, null); assertEqualsWithGeneric(expected, records); }
@Test public void testSplitAtFraction() throws Exception { List<FixedRecord> expected = createFixedRecords(DEFAULT_RECORD_COUNT); // Create an AvroSource where each block is 16k String filename = generateTestFile( "tmp.avro", expected, SyncBehavior.SYNC_REGULAR, 1000, AvroCoder.of(FixedRecord.class), DataFileConstants.NULL_CODEC); File file = new File(filename); AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class); List<? extends BoundedSource<FixedRecord>> splits = source.splitIntoBundles(file.length() / 3, null); for (BoundedSource<FixedRecord> subSource : splits) { int items = SourceTestUtils.readFromSource(subSource, null).size(); // Shouldn't split while unstarted. SourceTestUtils.assertSplitAtFractionFails(subSource, 0, 0.0, null); SourceTestUtils.assertSplitAtFractionFails(subSource, 0, 0.7, null); SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, 1, 0.7, null); SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, 100, 0.7, null); SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, 1000, 0.1, null); SourceTestUtils.assertSplitAtFractionFails(subSource, 1001, 0.1, null); SourceTestUtils.assertSplitAtFractionFails(subSource, DEFAULT_RECORD_COUNT / 3, 0.3, null); SourceTestUtils.assertSplitAtFractionFails(subSource, items, 0.9, null); SourceTestUtils.assertSplitAtFractionFails(subSource, items, 1.0, null); SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, items, 0.999, null); } }
@Override public PCollection<T> apply(PInput input) { if (filepattern == null) { throw new IllegalStateException( "need to set the filepattern of an AvroIO.Read transform"); } if (schema == null) { throw new IllegalStateException("need to set the schema of an AvroIO.Read transform"); } if (validate) { try { checkState( !IOChannelUtils.getFactory(filepattern).match(filepattern).isEmpty(), "Unable to find any files matching %s", filepattern); } catch (IOException e) { throw new IllegalStateException(String.format("Failed to validate %s", filepattern), e); } } @SuppressWarnings("unchecked") Bounded<T> read = type == GenericRecord.class ? (Bounded<T>) com.google.cloud.dataflow.sdk.io.Read.from( AvroSource.from(filepattern).withSchema(schema)) : com.google.cloud.dataflow.sdk.io.Read.from( AvroSource.from(filepattern).withSchema(type)); PCollection<T> pcol = input.getPipeline().apply("Read", read); // Honor the default output coder that would have been used by this PTransform. pcol.setCoder(getDefaultOutputCoder()); return pcol; }
@Test public void testSplitsWithSmallBlocks() throws Exception { PipelineOptions options = PipelineOptionsFactory.create(); // Test reading from an object file with many small random-sized blocks. List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT); String filename = generateTestFile( "tmp.avro", expected, SyncBehavior.SYNC_RANDOM, 100 /* max records/block */, AvroCoder.of(Bird.class), DataFileConstants.NULL_CODEC); File file = new File(filename); // Small minimum bundle size AvroSource<Bird> source = AvroSource.from(filename).withSchema(Bird.class).withMinBundleSize(100L); // Assert that the source produces the expected records assertEquals(expected, SourceTestUtils.readFromSource(source, options)); List<? extends BoundedSource<Bird>> splits; int nonEmptySplits; // Split with the minimum bundle size splits = source.splitIntoBundles(100L, options); assertTrue(splits.size() > 2); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); nonEmptySplits = 0; for (BoundedSource<Bird> subSource : splits) { if (SourceTestUtils.readFromSource(subSource, options).size() > 0) { nonEmptySplits += 1; } } assertTrue(nonEmptySplits > 2); // Split with larger bundle size splits = source.splitIntoBundles(file.length() / 4, options); assertTrue(splits.size() > 2); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); nonEmptySplits = 0; for (BoundedSource<Bird> subSource : splits) { if (SourceTestUtils.readFromSource(subSource, options).size() > 0) { nonEmptySplits += 1; } } assertTrue(nonEmptySplits > 2); // Split with the file length splits = source.splitIntoBundles(file.length(), options); assertTrue(splits.size() == 1); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); }
/**
 * Verifies that an unstarted block-based reader has no current block and that
 * {@code getCurrent()} throws {@link NoSuchElementException} before any read.
 */
@Test
public void testGetCurrentFromUnstartedReader() throws Exception {
  List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_DEFAULT,
          1000,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  try (BlockBasedSource.BlockBasedReader<FixedRecord> reader =
      (BlockBasedSource.BlockBasedReader<FixedRecord>) source.createReader(null)) {
    // No block is available until the reader is started.
    assertEquals(null, reader.getCurrentBlock());

    expectedException.expect(NoSuchElementException.class);
    expectedException.expectMessage("No block has been successfully read from");
    reader.getCurrent();
  }
}
/**
 * Runs the exhaustive split-at-fraction check (every record position crossed with every
 * fraction) against a small file with frequent sync markers.
 */
@Test
public void testSplitAtFractionExhaustive() throws Exception {
  // Keep the file small: the exhaustive check is quadratic in record count.
  List<FixedRecord> records = createFixedRecords(50);
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_REGULAR,
          5,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);
  SourceTestUtils.assertSplitAtFractionExhaustive(
      AvroSource.from(filename).withSchema(FixedRecord.class), null);
}
@Test public void testReadWithDifferentCodecs() throws Exception { // Test reading files generated using all codecs. String codecs[] = { DataFileConstants.NULL_CODEC, DataFileConstants.BZIP2_CODEC, DataFileConstants.DEFLATE_CODEC, DataFileConstants.SNAPPY_CODEC, DataFileConstants.XZ_CODEC }; List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT); for (String codec : codecs) { String filename = generateTestFile( codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec); AvroSource<Bird> source = AvroSource.from(filename).withSchema(Bird.class); List<Bird> actual = SourceTestUtils.readFromSource(source, null); assertThat(expected, containsInAnyOrder(actual.toArray())); } }
/**
 * Verifies that a glob filepattern matching several generated files reads back the union
 * of all their records.
 */
@Test
public void testMultipleFiles() throws Exception {
  String baseName = "tmp-";
  List<Bird> allRecords = new ArrayList<>();
  // Write ten files, each holding a tenth of the records.
  for (int fileIndex = 0; fileIndex < 10; fileIndex++) {
    List<Bird> fileContents = createRandomRecords(DEFAULT_RECORD_COUNT / 10);
    allRecords.addAll(fileContents);
    generateTestFile(
        baseName + fileIndex,
        fileContents,
        SyncBehavior.SYNC_DEFAULT,
        0,
        AvroCoder.of(Bird.class),
        DataFileConstants.NULL_CODEC);
  }

  // A single source over the glob pattern should see every file's records.
  String pattern = new File(tmpFolder.getRoot().toString(), baseName + "*").toString();
  AvroSource<Bird> source = AvroSource.from(pattern).withSchema(Bird.class);
  List<Bird> actual = SourceTestUtils.readFromSource(source, null);
  assertThat(actual, containsInAnyOrder(allRecords.toArray()));
}