Exemplo n.º 1
0
  /**
   * Checks that a reader which has been created but not started reports a consumed fraction of
   * zero, both for the source over the whole file and for each sub-source returned by
   * {@code splitIntoBundles}.
   */
  @Test
  public void testGetProgressFromUnstartedReader() throws Exception {
    List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
    String filename =
        generateTestFile(
            "tmp.avro",
            records,
            SyncBehavior.SYNC_DEFAULT,
            1000,
            AvroCoder.of(FixedRecord.class),
            DataFileConstants.NULL_CODEC);
    File file = new File(filename);

    // Double.valueOf replaces the deprecated Double(double) constructor. The boxed value keeps
    // the comparison on the assertEquals(Object, Object) overload, as before.
    Double noProgress = Double.valueOf(0.0);

    AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
    try (BoundedSource.BoundedReader<FixedRecord> reader = source.createReader(null)) {
      assertEquals(noProgress, reader.getFractionConsumed());
    }

    // Request bundles of one third of the file length and check every resulting sub-source.
    List<? extends BoundedSource<FixedRecord>> splits =
        source.splitIntoBundles(file.length() / 3, null);
    for (BoundedSource<FixedRecord> subSource : splits) {
      try (BoundedSource.BoundedReader<FixedRecord> reader = subSource.createReader(null)) {
        assertEquals(noProgress, reader.getFractionConsumed());
      }
    }
  }
Exemplo n.º 2
0
  /**
   * Reads one generated file through three differently configured sources (schema object, JSON
   * schema string, and no explicit schema) and checks each against the written records using
   * {@code assertEqualsWithGeneric}.
   */
  @Test
  public void testCreationWithSchema() throws Exception {
    List<Bird> birds = createRandomRecords(100);
    String path =
        generateTestFile(
            "tmp.avro",
            birds,
            SyncBehavior.SYNC_DEFAULT,
            0,
            AvroCoder.of(Bird.class),
            DataFileConstants.NULL_CODEC);

    // Variant 1: the schema is supplied as a Schema object.
    Schema birdSchema = ReflectData.get().getSchema(Bird.class);
    AvroSource<GenericRecord> schemaObjectSource = AvroSource.from(path).withSchema(birdSchema);
    List<GenericRecord> viaSchemaObject = SourceTestUtils.readFromSource(schemaObjectSource, null);
    assertEqualsWithGeneric(birds, viaSchemaObject);

    // Variant 2: the schema is supplied as its JSON string form.
    String birdSchemaJson = ReflectData.get().getSchema(Bird.class).toString();
    AvroSource<GenericRecord> jsonSchemaSource = AvroSource.from(path).withSchema(birdSchemaJson);
    List<GenericRecord> viaJsonSchema = SourceTestUtils.readFromSource(jsonSchemaSource, null);
    assertEqualsWithGeneric(birds, viaJsonSchema);

    // Variant 3: no schema is supplied at all.
    AvroSource<GenericRecord> schemalessSource = AvroSource.from(path);
    List<GenericRecord> viaNoSchema = SourceTestUtils.readFromSource(schemalessSource, null);
    assertEqualsWithGeneric(birds, viaNoSchema);
  }
Exemplo n.º 3
0
  /**
   * Splits a generated file into bundles of roughly a third of its length and, for every
   * sub-source, checks which (records read, fraction) combinations allow a dynamic split and
   * which are rejected.
   */
  @Test
  public void testSplitAtFraction() throws Exception {
    List<FixedRecord> fixedRecords = createFixedRecords(DEFAULT_RECORD_COUNT);
    // Written with SYNC_REGULAR and a block setting of 1000 passed to generateTestFile.
    String path =
        generateTestFile(
            "tmp.avro",
            fixedRecords,
            SyncBehavior.SYNC_REGULAR,
            1000,
            AvroCoder.of(FixedRecord.class),
            DataFileConstants.NULL_CODEC);
    long desiredBundleSize = new File(path).length() / 3;

    AvroSource<FixedRecord> wholeFile = AvroSource.from(path).withSchema(FixedRecord.class);
    List<? extends BoundedSource<FixedRecord>> bundles =
        wholeFile.splitIntoBundles(desiredBundleSize, null);
    for (BoundedSource<FixedRecord> bundle : bundles) {
      int recordsInBundle = SourceTestUtils.readFromSource(bundle, null).size();

      // No records read yet: splitting must be refused whatever the fraction.
      SourceTestUtils.assertSplitAtFractionFails(bundle, 0, 0.0, null);
      SourceTestUtils.assertSplitAtFractionFails(bundle, 0, 0.7, null);

      // Splits expected to succeed after 1, 100 and 1000 records.
      SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(bundle, 1, 0.7, null);
      SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(bundle, 100, 0.7, null);
      SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(bundle, 1000, 0.1, null);

      // Splits expected to be rejected at these positions.
      SourceTestUtils.assertSplitAtFractionFails(bundle, 1001, 0.1, null);
      SourceTestUtils.assertSplitAtFractionFails(bundle, DEFAULT_RECORD_COUNT / 3, 0.3, null);
      SourceTestUtils.assertSplitAtFractionFails(bundle, recordsInBundle, 0.9, null);
      SourceTestUtils.assertSplitAtFractionFails(bundle, recordsInBundle, 1.0, null);

      // With every record of the bundle read, a split at 0.999 is still expected to succeed.
      SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(
          bundle, recordsInBundle, 0.999, null);
    }
  }
Exemplo n.º 4
0
      /**
       * Expands this transform into a bounded read over an {@code AvroSource} for the configured
       * file pattern.
       *
       * @param input the input whose pipeline the read is applied to
       * @return the collection produced by the read, with its coder set to this transform's
       *     default output coder
       * @throws IllegalStateException if the file pattern or the schema has not been set, or, when
       *     validation is enabled, if no file matches the pattern or matching fails with an
       *     {@link IOException}
       */
      @Override
      public PCollection<T> apply(PInput input) {
        if (filepattern == null) {
          throw new IllegalStateException(
              "need to set the filepattern of an AvroIO.Read transform");
        }
        if (schema == null) {
          throw new IllegalStateException("need to set the schema of an AvroIO.Read transform");
        }

        if (validate) {
          boolean anyFileMatches;
          try {
            anyFileMatches = !IOChannelUtils.getFactory(filepattern).match(filepattern).isEmpty();
          } catch (IOException e) {
            throw new IllegalStateException(String.format("Failed to validate %s", filepattern), e);
          }
          checkState(anyFileMatches, "Unable to find any files matching %s", filepattern);
        }

        final Bounded<T> read;
        if (type == GenericRecord.class) {
          // Generic records are read using the schema; the cast to Bounded<T> is unchecked.
          @SuppressWarnings("unchecked")
          Bounded<T> genericRead =
              (Bounded<T>)
                  com.google.cloud.dataflow.sdk.io.Read.from(
                      AvroSource.from(filepattern).withSchema(schema));
          read = genericRead;
        } else {
          // Any other element type is read using the type itself.
          read =
              com.google.cloud.dataflow.sdk.io.Read.from(
                  AvroSource.from(filepattern).withSchema(type));
        }

        PCollection<T> output = input.getPipeline().apply("Read", read);
        // Keep the coder this PTransform would have used by default.
        output.setCoder(getDefaultOutputCoder());
        return output;
      }
Exemplo n.º 5
0
  /**
   * Reads a file written with {@code SYNC_RANDOM} and at most 100 records per block, then checks
   * that splitting it at three desired bundle sizes (the minimum bundle size, a quarter of the
   * file, and the whole file) always yields sub-sources equivalent to the original source.
   */
  @Test
  public void testSplitsWithSmallBlocks() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    // Test reading from an object file with many small random-sized blocks.
    List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);
    String filename =
        generateTestFile(
            "tmp.avro",
            expected,
            SyncBehavior.SYNC_RANDOM,
            100 /* max records/block */,
            AvroCoder.of(Bird.class),
            DataFileConstants.NULL_CODEC);
    File file = new File(filename);

    // Small minimum bundle size
    AvroSource<Bird> source =
        AvroSource.from(filename).withSchema(Bird.class).withMinBundleSize(100L);

    // Assert that the source produces the expected records
    assertEquals(expected, SourceTestUtils.readFromSource(source, options));

    List<? extends BoundedSource<Bird>> splits;

    // Split with the minimum bundle size
    splits = source.splitIntoBundles(100L, options);
    assertTrue(splits.size() > 2);
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
    assertTrue(countNonEmptySplits(splits, options) > 2);

    // Split with larger bundle size
    splits = source.splitIntoBundles(file.length() / 4, options);
    assertTrue(splits.size() > 2);
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
    assertTrue(countNonEmptySplits(splits, options) > 2);

    // Split with the file length; assertEquals reports the actual size on failure.
    splits = source.splitIntoBundles(file.length(), options);
    assertEquals(1, splits.size());
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
  }

  /**
   * Counts how many of the given sub-sources yield at least one record when read in full.
   *
   * @param splits the sub-sources to read
   * @param options the pipeline options passed to each read
   * @return the number of sub-sources that produced one or more records
   */
  private static <T> int countNonEmptySplits(
      List<? extends BoundedSource<T>> splits, PipelineOptions options) throws Exception {
    int nonEmptySplits = 0;
    for (BoundedSource<T> subSource : splits) {
      if (!SourceTestUtils.readFromSource(subSource, options).isEmpty()) {
        nonEmptySplits += 1;
      }
    }
    return nonEmptySplits;
  }
Exemplo n.º 6
0
  /**
   * Checks that a reader which has not been started has no current block, and that asking it for
   * the current element throws {@link NoSuchElementException} with the expected message.
   */
  @Test
  public void testGetCurrentFromUnstartedReader() throws Exception {
    List<FixedRecord> fixedRecords = createFixedRecords(DEFAULT_RECORD_COUNT);
    String path =
        generateTestFile(
            "tmp.avro",
            fixedRecords,
            SyncBehavior.SYNC_DEFAULT,
            1000,
            AvroCoder.of(FixedRecord.class),
            DataFileConstants.NULL_CODEC);

    AvroSource<FixedRecord> avroSource = AvroSource.from(path).withSchema(FixedRecord.class);
    // The cast gives access to getCurrentBlock(), which is called on the block-based reader.
    try (BlockBasedSource.BlockBasedReader<FixedRecord> unstarted =
        (BlockBasedSource.BlockBasedReader<FixedRecord>) avroSource.createReader(null)) {
      assertEquals(null, unstarted.getCurrentBlock());

      // Register the expectation before the call that is meant to throw.
      expectedException.expect(NoSuchElementException.class);
      expectedException.expectMessage("No block has been successfully read from");
      unstarted.getCurrent();
    }
  }
Exemplo n.º 7
0
  /**
   * Runs {@code SourceTestUtils.assertSplitAtFractionExhaustive} against a small file of 50 fixed
   * records written with {@code SYNC_REGULAR} and a block setting of 5.
   */
  @Test
  public void testSplitAtFractionExhaustive() throws Exception {
    List<FixedRecord> fixedRecords = createFixedRecords(50);
    String path =
        generateTestFile(
            "tmp.avro",
            fixedRecords,
            SyncBehavior.SYNC_REGULAR,
            5,
            AvroCoder.of(FixedRecord.class),
            DataFileConstants.NULL_CODEC);

    AvroSource<FixedRecord> avroSource = AvroSource.from(path).withSchema(FixedRecord.class);
    SourceTestUtils.assertSplitAtFractionExhaustive(avroSource, null);
  }
Exemplo n.º 8
0
  /**
   * Writes the same records once per codec (null, bzip2, deflate, snappy, xz) and checks that
   * reading each file back yields exactly those records, in any order.
   */
  @Test
  public void testReadWithDifferentCodecs() throws Exception {
    // Test reading files generated using all codecs.
    String[] codecs = {
      DataFileConstants.NULL_CODEC,
      DataFileConstants.BZIP2_CODEC,
      DataFileConstants.DEFLATE_CODEC,
      DataFileConstants.SNAPPY_CODEC,
      DataFileConstants.XZ_CODEC
    };
    List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);

    for (String codec : codecs) {
      // The codec name doubles as the file name, so each codec gets its own file.
      String filename =
          generateTestFile(
              codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);
      AvroSource<Bird> source = AvroSource.from(filename).withSchema(Bird.class);
      List<Bird> actual = SourceTestUtils.readFromSource(source, null);
      // Actual value first and expected items in the matcher, so that a failure message labels
      // the two sides correctly.
      assertThat(actual, containsInAnyOrder(expected.toArray()));
    }
  }
Exemplo n.º 9
0
  /**
   * Writes ten files sharing a common name prefix, then reads them back through a single source
   * built from a {@code prefix*} pattern under the temporary folder and checks that all written
   * records are returned, in any order.
   */
  @Test
  public void testMultipleFiles() throws Exception {
    final int fileCount = 10;
    final String prefix = "tmp-";

    List<Bird> allBirds = new ArrayList<>();
    for (int fileIndex = 0; fileIndex < fileCount; fileIndex++) {
      List<Bird> birdsInFile = createRandomRecords(DEFAULT_RECORD_COUNT / fileCount);
      allBirds.addAll(birdsInFile);
      generateTestFile(
          prefix + fileIndex,
          birdsInFile,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);
    }

    // Pattern for every file in the temporary folder whose name starts with the prefix.
    String pattern = new File(tmpFolder.getRoot().toString(), prefix + "*").toString();
    AvroSource<Bird> multiFileSource = AvroSource.from(pattern).withSchema(Bird.class);
    List<Bird> readBack = SourceTestUtils.readFromSource(multiFileSource, null);
    assertThat(readBack, containsInAnyOrder(allBirds.toArray()));
  }