@Test
  public void testGetProgressFromUnstartedReader() throws Exception {
    List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
    String filename =
        generateTestFile(
            "tmp.avro",
            records,
            SyncBehavior.SYNC_DEFAULT,
            1000,
            AvroCoder.of(FixedRecord.class),
            DataFileConstants.NULL_CODEC);
    File file = new File(filename);

    AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
    try (BoundedSource.BoundedReader<FixedRecord> reader = source.createReader(null)) {
      assertEquals(Double.valueOf(0.0), reader.getFractionConsumed());
    }

    List<? extends BoundedSource<FixedRecord>> splits =
        source.splitIntoBundles(file.length() / 3, null);
    for (BoundedSource<FixedRecord> subSource : splits) {
      try (BoundedSource.BoundedReader<FixedRecord> reader = subSource.createReader(null)) {
        assertEquals(Double.valueOf(0.0), reader.getFractionConsumed());
      }
    }
  }

  @Test
  public void testCreationWithSchema() throws Exception {
    List<Bird> expected = createRandomRecords(100);
    String filename =
        generateTestFile(
            "tmp.avro",
            expected,
            SyncBehavior.SYNC_DEFAULT,
            0,
            AvroCoder.of(Bird.class),
            DataFileConstants.NULL_CODEC);

    // Create a source with a schema object
    Schema schema = ReflectData.get().getSchema(Bird.class);
    AvroSource<GenericRecord> source = AvroSource.from(filename).withSchema(schema);
    List<GenericRecord> records = SourceTestUtils.readFromSource(source, null);
    assertEqualsWithGeneric(expected, records);

    // Create a source with a JSON schema
    String schemaString = ReflectData.get().getSchema(Bird.class).toString();
    source = AvroSource.from(filename).withSchema(schemaString);
    records = SourceTestUtils.readFromSource(source, null);
    assertEqualsWithGeneric(expected, records);

    // Create a source with no schema
    source = AvroSource.from(filename);
    records = SourceTestUtils.readFromSource(source, null);
    assertEqualsWithGeneric(expected, records);
  }

  @Test
  public void testSplitAtFraction() throws Exception {
    List<FixedRecord> expected = createFixedRecords(DEFAULT_RECORD_COUNT);
    // Create a file with 1000 records per block, so each block is about 16k.
    String filename =
        generateTestFile(
            "tmp.avro",
            expected,
            SyncBehavior.SYNC_REGULAR,
            1000,
            AvroCoder.of(FixedRecord.class),
            DataFileConstants.NULL_CODEC);
    File file = new File(filename);

    AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
    List<? extends BoundedSource<FixedRecord>> splits =
        source.splitIntoBundles(file.length() / 3, null);
    for (BoundedSource<FixedRecord> subSource : splits) {
      int items = SourceTestUtils.readFromSource(subSource, null).size();
      // Shouldn't split while unstarted.
      SourceTestUtils.assertSplitAtFractionFails(subSource, 0, 0.0, null);
      SourceTestUtils.assertSplitAtFractionFails(subSource, 0, 0.7, null);
      SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, 1, 0.7, null);
      SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, 100, 0.7, null);
      SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, 1000, 0.1, null);
      SourceTestUtils.assertSplitAtFractionFails(subSource, 1001, 0.1, null);
      SourceTestUtils.assertSplitAtFractionFails(subSource, DEFAULT_RECORD_COUNT / 3, 0.3, null);
      SourceTestUtils.assertSplitAtFractionFails(subSource, items, 0.9, null);
      SourceTestUtils.assertSplitAtFractionFails(subSource, items, 1.0, null);
      SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, items, 0.999, null);
    }
  }

  /**
   * Generates an input Avro file containing the given records in the temporary directory and
   * returns the full path of the file.
   */
  private <T> String generateTestFile(
      String filename,
      List<T> elems,
      SyncBehavior syncBehavior,
      int syncInterval,
      AvroCoder<T> coder,
      String codec)
      throws IOException {
    Random random = new Random(0);
    File tmpFile = tmpFolder.newFile(filename);
    String path = tmpFile.toString();

    FileOutputStream os = new FileOutputStream(tmpFile);
    DatumWriter<T> datumWriter = coder.createDatumWriter();
    try (DataFileWriter<T> writer = new DataFileWriter<>(datumWriter)) {
      writer.setCodec(CodecFactory.fromString(codec));
      writer.create(coder.getSchema(), os);

      int recordIndex = 0;
      int syncIndex = syncBehavior == SyncBehavior.SYNC_RANDOM ? random.nextInt(syncInterval) : 0;

      for (T elem : elems) {
        writer.append(elem);
        recordIndex++;

        switch (syncBehavior) {
          case SYNC_REGULAR:
            if (recordIndex == syncInterval) {
              recordIndex = 0;
              writer.sync();
            }
            break;
          case SYNC_RANDOM:
            if (recordIndex == syncIndex) {
              recordIndex = 0;
              writer.sync();
              syncIndex = random.nextInt(syncInterval);
            }
            break;
          case SYNC_DEFAULT:
          default:
        }
      }
    }
    return path;
  }
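
The generateTestFile helper references a SyncBehavior enum and a tmpFolder rule that are not shown in this excerpt. A minimal sketch of the assumed declarations, matching how the helper uses them:

  /** Assumed declaration: controls how often generateTestFile forces an Avro sync marker. */
  private enum SyncBehavior {
    SYNC_DEFAULT, // let DataFileWriter decide when to start a new block
    SYNC_REGULAR, // sync after every syncInterval records
    SYNC_RANDOM // sync at random intervals drawn from [0, syncInterval)
  }

  /** Assumed declaration: JUnit rule providing the temporary directory for test files. */
  @Rule public TemporaryFolder tmpFolder = new TemporaryFolder();
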
  @Test
  public void testSplitsWithSmallBlocks() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    // Test reading from an object file with many small random-sized blocks.
    List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);
    String filename =
        generateTestFile(
            "tmp.avro",
            expected,
            SyncBehavior.SYNC_RANDOM,
            100 /* max records/block */,
            AvroCoder.of(Bird.class),
            DataFileConstants.NULL_CODEC);
    File file = new File(filename);

    // Small minimum bundle size
    AvroSource<Bird> source =
        AvroSource.from(filename).withSchema(Bird.class).withMinBundleSize(100L);

    // Assert that the source produces the expected records
    assertEquals(expected, SourceTestUtils.readFromSource(source, options));

    List<? extends BoundedSource<Bird>> splits;
    int nonEmptySplits;

    // Split with the minimum bundle size
    splits = source.splitIntoBundles(100L, options);
    assertTrue(splits.size() > 2);
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
    nonEmptySplits = 0;
    for (BoundedSource<Bird> subSource : splits) {
      if (SourceTestUtils.readFromSource(subSource, options).size() > 0) {
        nonEmptySplits += 1;
      }
    }
    assertTrue(nonEmptySplits > 2);

    // Split with larger bundle size
    splits = source.splitIntoBundles(file.length() / 4, options);
    assertTrue(splits.size() > 2);
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
    nonEmptySplits = 0;
    for (BoundedSource<Bird> subSource : splits) {
      if (SourceTestUtils.readFromSource(subSource, options).size() > 0) {
        nonEmptySplits += 1;
      }
    }
    assertTrue(nonEmptySplits > 2);

    // Split with the file length
    splits = source.splitIntoBundles(file.length(), options);
    assertEquals(1, splits.size());
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
  }

  @Test
  public void testSplitAtFractionExhaustive() throws Exception {
    List<FixedRecord> expected = createFixedRecords(50);
    String filename =
        generateTestFile(
            "tmp.avro",
            expected,
            SyncBehavior.SYNC_REGULAR,
            5,
            AvroCoder.of(FixedRecord.class),
            DataFileConstants.NULL_CODEC);

    AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
    SourceTestUtils.assertSplitAtFractionExhaustive(source, null);
  }

Example 7

      @Override
      public PDone apply(PCollection<T> input) {
        if (filenamePrefix == null) {
          throw new IllegalStateException(
              "need to set the filename prefix of an AvroIO.Write transform");
        }
        if (schema == null) {
          throw new IllegalStateException("need to set the schema of an AvroIO.Write transform");
        }

        // Note that custom sinks currently do not expose sharding controls.
        // Pipeline runners therefore need to add support internally to honor
        // user-requested sharding limits.
        return input.apply(
            "Write",
            com.google.cloud.dataflow.sdk.io.Write.to(
                new AvroSink<>(
                    filenamePrefix, filenameSuffix, shardTemplate, AvroCoder.of(type, schema))));
      }
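
This apply() validates the transform's configuration and then delegates to the generic Write transform wrapping an AvroSink. A hedged usage sketch of applying the enclosing AvroIO.Write transform in a pipeline (the output prefix is an illustrative placeholder, not from the source):

  // Hypothetical usage: write Bird records to sharded Avro files.
  Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
  p.apply(Create.of(createRandomRecords(100)).withCoder(AvroCoder.of(Bird.class)))
      .apply(AvroIO.Write.to("/tmp/birds").withSchema(Bird.class).withSuffix(".avro"));
  p.run();
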
  @Test
  public void testReadWithDifferentCodecs() throws Exception {
    // Test reading files generated using all codecs.
    String[] codecs = {
      DataFileConstants.NULL_CODEC,
      DataFileConstants.BZIP2_CODEC,
      DataFileConstants.DEFLATE_CODEC,
      DataFileConstants.SNAPPY_CODEC,
      DataFileConstants.XZ_CODEC
    };
    List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);

    for (String codec : codecs) {
      String filename =
          generateTestFile(
              codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);
      AvroSource<Bird> source = AvroSource.from(filename).withSchema(Bird.class);
      List<Bird> actual = SourceTestUtils.readFromSource(source, null);
      assertThat(actual, containsInAnyOrder(expected.toArray()));
    }
  }

  @Test
  public void testMultipleFiles() throws Exception {
    String baseName = "tmp-";
    List<Bird> expected = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
      List<Bird> contents = createRandomRecords(DEFAULT_RECORD_COUNT / 10);
      expected.addAll(contents);
      generateTestFile(
          baseName + i,
          contents,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);
    }

    AvroSource<Bird> source =
        AvroSource.from(new File(tmpFolder.getRoot().toString(), baseName + "*").toString())
            .withSchema(Bird.class);
    List<Bird> actual = SourceTestUtils.readFromSource(source, null);
    assertThat(actual, containsInAnyOrder(expected.toArray()));
  }

  @Test
  public void testGetCurrentFromUnstartedReader() throws Exception {
    List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
    String filename =
        generateTestFile(
            "tmp.avro",
            records,
            SyncBehavior.SYNC_DEFAULT,
            1000,
            AvroCoder.of(FixedRecord.class),
            DataFileConstants.NULL_CODEC);

    AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
    try (BlockBasedSource.BlockBasedReader<FixedRecord> reader =
        (BlockBasedSource.BlockBasedReader<FixedRecord>) source.createReader(null)) {
      assertEquals(null, reader.getCurrentBlock());

      expectedException.expect(NoSuchElementException.class);
      expectedException.expectMessage("No block has been successfully read from");
      reader.getCurrent();
    }
  }

Example 11

  /** Opens a DataFileWriter on the output channel before any elements are written. */
  @SuppressWarnings("deprecation") // uses internal test functionality.
  @Override
  protected void prepareWrite(WritableByteChannel channel) throws Exception {
    dataFileWriter = new DataFileWriter<>(coder.createDatumWriter());
    dataFileWriter.create(coder.getSchema(), Channels.newOutputStream(channel));
  }
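
prepareWrite only opens the Avro container on the channel. A hedged sketch of the companion write() override one would expect from the FileBasedSink writer lifecycle (an assumption, not shown in the source):

  // Assumed companion method: append each element to the open DataFileWriter.
  @Override
  public void write(T value) throws Exception {
    dataFileWriter.append(value);
  }
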

Example 12

  /** Uses an AvroCoder derived from the configured type and schema as the output coder. */
  @Override
  protected Coder<T> getDefaultOutputCoder() {
    return AvroCoder.of(type, schema);
  }