@Override
    public WindowedValue<T> next() throws IOException {
      // Read the message under the cursor, then advance the cursor before any decoding happens:
      // after the last message of a bundle the cursor moves to the first message of the next one.
      Windmill.Message current =
          context.getWork().getMessageBundles(bundleIndex).getMessages(messageIndex);
      int lastIndexInBundle =
          context.getWork().getMessageBundles(bundleIndex).getMessagesCount() - 1;
      if (messageIndex < lastIndexInBundle) {
        messageIndex++;
      } else {
        bundleIndex++;
        messageIndex = 0;
      }

      // The message timestamp is treated as microseconds and converted to milliseconds.
      long elementMillis = TimeUnit.MICROSECONDS.toMillis(current.getTimestamp());
      Instant elementTimestamp = new Instant(elementMillis);

      InputStream payload = current.getData().newInput();
      // Opened only so that its available() count can be included in the reported size.
      InputStream metadataIn = current.getMetadata().newInput();
      Collection<? extends BoundedWindow> elementWindows =
          WindmillSink.decodeMetadataWindows(windowsCoder, current.getMetadata());
      PaneInfo paneInfo = WindmillSink.decodeMetadataPane(current.getMetadata());

      if (!(valueCoder instanceof KvCoder)) {
        // Plain element: only the payload is decoded.
        notifyElementRead(payload.available() + metadataIn.available());
        return WindowedValue.of(
            decode(valueCoder, payload), elementTimestamp, elementWindows, paneInfo);
      }

      // Keyed element: the key is decoded from the context's serialized key, not from the message.
      KvCoder<?, ?> kvCoder = (KvCoder<?, ?>) valueCoder;
      InputStream keyIn = context.getSerializedKey().newInput();
      notifyElementRead(keyIn.available() + payload.available() + metadataIn.available());

      @SuppressWarnings("unchecked")
      T keyedElement =
          (T)
              KV.of(
                  decode(kvCoder.getKeyCoder(), keyIn),
                  decode(kvCoder.getValueCoder(), payload));
      return WindowedValue.of(keyedElement, elementTimestamp, elementWindows, paneInfo);
    }
  /**
   * Feeds lists of {@code CountSum} accumulators for keys "a" and "b" through a MERGE-phase
   * combine fn and expects exactly one accumulator per key on the receiver, in input order.
   */
  @Test
  public void testCombineValuesFnMerge() throws Exception {
    MeanInts mean = new MeanInts();
    Combine.KeyedCombineFn<String, Integer, MeanInts.CountSum, String> meanFn = mean.asKeyedFn();
    ParDoFn mergePhaseFn = createCombineValuesFn(CombineValuesFn.CombinePhase.MERGE, meanFn);
    TestReceiver sink = new TestReceiver();

    mergePhaseFn.startBundle(sink);
    // Key "a": three partial accumulators.
    mergePhaseFn.processElement(
        WindowedValue.valueInGlobalWindow(
            KV.of(
                "a",
                Arrays.asList(
                    mean.new CountSum(3, 6), mean.new CountSum(2, 9), mean.new CountSum(1, 12)))));
    // Key "b": two partial accumulators.
    mergePhaseFn.processElement(
        WindowedValue.valueInGlobalWindow(
            KV.of("b", Arrays.asList(mean.new CountSum(2, 20), mean.new CountSum(1, 1)))));
    mergePhaseFn.finishBundle();

    MeanInts.CountSum mergedA = mean.new CountSum(6, 27);
    MeanInts.CountSum mergedB = mean.new CountSum(3, 21);
    assertArrayEquals(
        new Object[] {
          WindowedValue.valueInGlobalWindow(KV.of("a", mergedA)),
          WindowedValue.valueInGlobalWindow(KV.of("b", mergedB)),
        },
        sink.receivedElems.toArray());
  }
Beispiel #3
0
  /**
   * Runs the global {@code Top} transforms (custom comparator, largest, smallest) over {@code
   * COLLECTION} and the per-key largest/smallest transforms over the table produced by {@code
   * createInputTable}, then checks each result.
   */
  @Test
  @SuppressWarnings("unchecked")
  public void testTop() {
    Pipeline pipeline = TestPipeline.create();
    PCollection<String> words =
        pipeline.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

    // Global variants.
    PCollection<List<String>> longestOne = words.apply(Top.of(1, new OrderByLength()));
    PCollection<List<String>> largestTwo = words.apply(Top.<String>largest(2));
    PCollection<List<String>> smallestThree = words.apply(Top.<String>smallest(3));

    // Per-key variants.
    PCollection<KV<String, Integer>> table = createInputTable(pipeline);
    PCollection<KV<String, List<Integer>>> largestTwoPerKey =
        table.apply(Top.<String, Integer>largestPerKey(2));
    PCollection<KV<String, List<Integer>>> smallestTwoPerKey =
        table.apply(Top.<String, Integer>smallestPerKey(2));

    DataflowAssert.thatSingletonIterable(longestOne).containsInAnyOrder(Arrays.asList("bb"));
    DataflowAssert.thatSingletonIterable(largestTwo).containsInAnyOrder("z", "c");
    DataflowAssert.thatSingletonIterable(smallestThree).containsInAnyOrder("a", "bb", "c");
    DataflowAssert.that(largestTwoPerKey)
        .containsInAnyOrder(KV.of("a", Arrays.asList(3, 2)), KV.of("b", Arrays.asList(100, 10)));
    DataflowAssert.that(smallestTwoPerKey)
        .containsInAnyOrder(KV.of("a", Arrays.asList(1, 2)), KV.of("b", Arrays.asList(1, 10)));

    pipeline.run();
  }
/**
 * Unit tests for {@link CombinePerKeyExamples}.
 *
 * <p>Each test drives one of the example's DoFns through a {@link DoFnTester} with fixed input and
 * checks that the expected elements are among the outputs.
 */
@RunWith(JUnit4.class)
public class CombinePerKeyExamplesTest {

  // Input rows: (corpus, word) pairs fed to ExtractLargeWordsFn.
  private static final TableRow INPUT_ROW_1 =
      new TableRow().set("corpus", "king_lear").set("word", "snuffleupaguses");
  private static final TableRow INPUT_ROW_2 =
      new TableRow().set("corpus", "macbeth").set("word", "antidisestablishmentarianism");
  private static final TableRow INPUT_ROW_3 =
      new TableRow().set("corpus", "king_lear").set("word", "antidisestablishmentarianism");
  private static final TableRow INPUT_ROW_4 =
      new TableRow().set("corpus", "macbeth").set("word", "bob");
  private static final TableRow INPUT_ROW_5 =
      new TableRow().set("corpus", "king_lear").set("word", "hi");

  static final TableRow[] ROWS_ARRAY = {
    INPUT_ROW_1, INPUT_ROW_2, INPUT_ROW_3, INPUT_ROW_4, INPUT_ROW_5
  };

  // (word, corpus) pairs expected among the outputs of ExtractLargeWordsFn.
  private static final KV<String, String> EXTRACTED_1 = KV.of("snuffleupaguses", "king_lear");
  private static final KV<String, String> EXTRACTED_2 =
      KV.of("antidisestablishmentarianism", "macbeth");
  private static final KV<String, String> EXTRACTED_3 =
      KV.of("antidisestablishmentarianism", "king_lear");

  // (word, comma-joined corpora) pairs fed to FormatShakespeareOutputFn.
  private static final KV<String, String> COMBINED_1 =
      KV.of("antidisestablishmentarianism", "king_lear,macbeth");
  private static final KV<String, String> COMBINED_2 = KV.of("snuffleupaguses", "king_lear");

  @SuppressWarnings({"unchecked", "rawtypes"})
  static final KV<String, String>[] COMBINED_TUPLES_ARRAY = new KV[] {COMBINED_1, COMBINED_2};

  // Rows expected among the outputs of FormatShakespeareOutputFn.
  private static final TableRow RESULT_ROW_1 =
      new TableRow().set("word", "snuffleupaguses").set("all_plays", "king_lear");
  private static final TableRow RESULT_ROW_2 =
      new TableRow()
          .set("word", "antidisestablishmentarianism")
          .set("all_plays", "king_lear,macbeth");

  /** Checks that the three long-word pairs appear in the output for {@code ROWS_ARRAY}. */
  @Test
  public void testExtractLargeWordsFn() {
    DoFnTester<TableRow, KV<String, String>> tester = DoFnTester.of(new ExtractLargeWordsFn());
    List<KV<String, String>> extracted = tester.processBatch(ROWS_ARRAY);
    Assert.assertThat(extracted, CoreMatchers.hasItem(EXTRACTED_1));
    Assert.assertThat(extracted, CoreMatchers.hasItem(EXTRACTED_2));
    Assert.assertThat(extracted, CoreMatchers.hasItem(EXTRACTED_3));
  }

  /** Checks that both combined tuples are formatted into the expected result rows. */
  @Test
  public void testFormatShakespeareOutputFn() {
    DoFnTester<KV<String, String>, TableRow> tester =
        DoFnTester.of(new FormatShakespeareOutputFn());
    List<TableRow> formatted = tester.processBatch(COMBINED_TUPLES_ARRAY);
    Assert.assertThat(formatted, CoreMatchers.hasItem(RESULT_ROW_1));
    Assert.assertThat(formatted, CoreMatchers.hasItem(RESULT_ROW_2));
  }
}
    /**
     * Applies the combine function to all of the element's values for its key and outputs the key
     * paired with the result.
     */
    @Override
    public void processElement(ProcessContext c) {
      KV<K, Iterable<InputT>> element = c.element();
      K elementKey = element.getKey();
      Iterable<InputT> inputs = element.getValue();
      c.output(KV.of(elementKey, this.combineFn.apply(elementKey, inputs)));
    }
    /**
     * Extracts the output from the element's accumulator and outputs it paired with the element's
     * key.
     */
    @Override
    public void processElement(ProcessContext c) {
      KV<K, AccumT> element = c.element();
      K elementKey = element.getKey();
      AccumT accumulator = element.getValue();
      c.output(KV.of(elementKey, this.combineFn.extractOutput(elementKey, accumulator)));
    }
    /**
     * Merges the element's accumulators into a single accumulator and outputs it paired with the
     * element's key.
     */
    @Override
    public void processElement(ProcessContext c) {
      KV<K, Iterable<AccumT>> element = c.element();
      K elementKey = element.getKey();
      Iterable<AccumT> partials = element.getValue();
      c.output(KV.of(elementKey, this.combineFn.mergeAccumulators(elementKey, partials)));
    }
    /**
     * Creates a fresh accumulator for the element's key, adds every input value to it in iteration
     * order, and outputs the key paired with the resulting accumulator.
     */
    @Override
    public void processElement(ProcessContext c) {
      KV<K, Iterable<InputT>> element = c.element();
      K elementKey = element.getKey();

      // addInput may return a new accumulator instance, so always keep its return value.
      AccumT running = this.combineFn.createAccumulator(elementKey);
      for (InputT value : element.getValue()) {
        running = this.combineFn.addInput(elementKey, running, value);
      }
      c.output(KV.of(elementKey, running));
    }
 /**
  * Removes window {@code w} from both bookkeeping maps and outputs its extracted result.
  *
  * <p>Both entries are removed before the state check, so a failed check still leaves the maps
  * without {@code w}. The output is emitted with the removed timestamp, in window {@code w} only,
  * and with pane {@code PaneInfo.ON_TIME_AND_ONLY_FIRING}.
  *
  * @param key the key whose window is being closed
  * @param w the window to close
  * @param accumulators per-window accumulators; the entry for {@code w} is removed
  * @param minTimestamps per-window timestamps; the entry for {@code w} is removed
  * @param c the context used to emit the windowed output
  */
 private void closeWindow(
     K key, W w, Map<W, AccumT> accumulators, Map<W, Instant> minTimestamps, ProcessContext c) {
   AccumT finalAccum = accumulators.remove(w);
   Instant outputTimestamp = minTimestamps.remove(w);
   // Neither map may be missing the window (or hold null for it).
   checkState(!(finalAccum == null || outputTimestamp == null));
   c.windowingInternals()
       .outputWindowedValue(
           KV.of(key, combineFn.extractOutput(key, finalAccum)),
           outputTimestamp,
           Arrays.asList(w),
           PaneInfo.ON_TIME_AND_ONLY_FIRING);
 }
  /**
   * Feeds raw integer lists for keys "a", "b" and "c" through an ALL-phase combine fn and expects
   * one formatted mean per key on the receiver, in input order.
   */
  @Test
  public void testCombineValuesFnAll() throws Exception {
    Combine.KeyedCombineFn<String, Integer, MeanInts.CountSum, String> meanFn =
        (new MeanInts()).asKeyedFn();
    ParDoFn allPhaseFn = createCombineValuesFn(CombineValuesFn.CombinePhase.ALL, meanFn);
    TestReceiver sink = new TestReceiver();

    allPhaseFn.startBundle(sink);
    allPhaseFn.processElement(
        WindowedValue.valueInGlobalWindow(KV.of("a", Arrays.asList(5, 6, 7))));
    allPhaseFn.processElement(
        WindowedValue.valueInGlobalWindow(KV.of("b", Arrays.asList(1, 3, 7))));
    allPhaseFn.processElement(
        WindowedValue.valueInGlobalWindow(KV.of("c", Arrays.asList(3, 6, 8, 9))));
    allPhaseFn.finishBundle();

    // Expected strings are built with the same format call so they match in any default locale.
    String meanA = String.format("%.1f", 6.0);
    String meanB = String.format("%.1f", 3.7);
    String meanC = String.format("%.1f", 6.5);
    assertArrayEquals(
        new Object[] {
          WindowedValue.valueInGlobalWindow(KV.of("a", meanA)),
          WindowedValue.valueInGlobalWindow(KV.of("b", meanB)),
          WindowedValue.valueInGlobalWindow(KV.of("c", meanC)),
        },
        sink.receivedElems.toArray());
  }
  /**
   * Feeds one {@code CountSum} accumulator per key through an EXTRACT-phase combine fn and expects
   * the formatted mean for each key on the receiver, in input order.
   */
  @Test
  public void testCombineValuesFnExtract() throws Exception {
    MeanInts mean = new MeanInts();
    Combine.KeyedCombineFn<String, Integer, MeanInts.CountSum, String> meanFn = mean.asKeyedFn();
    ParDoFn extractPhaseFn = createCombineValuesFn(CombineValuesFn.CombinePhase.EXTRACT, meanFn);
    TestReceiver sink = new TestReceiver();

    extractPhaseFn.startBundle(sink);
    extractPhaseFn.processElement(
        WindowedValue.valueInGlobalWindow(KV.of("a", mean.new CountSum(6, 27))));
    extractPhaseFn.processElement(
        WindowedValue.valueInGlobalWindow(KV.of("b", mean.new CountSum(3, 21))));
    extractPhaseFn.finishBundle();

    Object[] expectedReceivedElems = {
      WindowedValue.valueInGlobalWindow(KV.of("a", String.format("%.1f", 4.5))),
      WindowedValue.valueInGlobalWindow(KV.of("b", String.format("%.1f", 7.0)))
    };
    assertArrayEquals(expectedReceivedElems, sink.receivedElems.toArray());
  }
  /**
   * Builds 1000 elements for each of the keys 20, 50 and 100, where the values under key {@code k}
   * cycle through {@code 0..k-1}, applies {@code ApproximateUnique.perKey} with a sample size of
   * 100, and hands the per-key estimates to {@code VerifyEstimatePerKeyFn}.
   */
  @Test
  public void testApproximateUniquePerKey() {
    int sampleSize = 100;
    int elementCount = 1000;
    List<Long> keys = ImmutableList.of(20L, 50L, 100L);

    // Each key doubles as the number of distinct values stored under it.
    List<KV<Long, Long>> elements = Lists.newArrayList();
    for (long distinctValues : keys) {
      for (long i = 0; i < elementCount; i++) {
        elements.add(KV.of(distinctValues, i % distinctValues));
      }
    }

    Pipeline pipeline = TestPipeline.create();
    PCollection<KV<Long, Long>> keyedValues = pipeline.apply(Create.of(elements));
    PCollection<KV<Long, Long>> estimates =
        keyedValues.apply(ApproximateUnique.<Long, Long>perKey(sampleSize));

    DataflowAssert.that(estimates).satisfies(new VerifyEstimatePerKeyFn(sampleSize));

    pipeline.run();
  }
 /**
  * Wraps the element's union values in a {@code CoGbkResult} built with {@code schema} and
  * outputs it under the element's key.
  */
 @Override
 public void processElement(ProcessContext c) {
   KV<K, Iterable<RawUnionValue>> grouped = c.element();
   K groupKey = grouped.getKey();
   CoGbkResult joined = new CoGbkResult(schema, grouped.getValue());
   c.output(KV.of(groupKey, joined));
 }
 /**
  * Wraps the element's value in a {@code RawUnionValue} carrying {@code index} and outputs it
  * under the element's key.
  */
 @Override
 public void processElement(ProcessContext c) {
   KV<K, ?> keyed = c.element();
   K elementKey = keyed.getKey();
   RawUnionValue tagged = new RawUnionValue(index, keyed.getValue());
   c.output(KV.of(elementKey, tagged));
 }
 /** Returns the hash code of a {@code KV} built from {@code count} and {@code sum}. */
 @Override
 public int hashCode() {
   Object countAndSum = KV.of(count, sum);
   return countAndSum.hashCode();
 }
/**
 * Runs {@code JoinExamples.joinEvents} on a Flink test pipeline and compares the written text
 * output with {@link #JOINED_EVENTS}.
 *
 * <p>Unfortunately the test code has to be copied from the Dataflow SDK because it is not public
 * there.
 */
public class JoinExamplesITCase extends JavaProgramTestBase {

  protected String resultPath;

  public JoinExamplesITCase() {}

  // Event rows used as the first join input.
  private static final TableRow EVENT_BANGKOK =
      new TableRow()
          .set("ActionGeo_CountryCode", "VM")
          .set("SQLDATE", "20141212")
          .set("Actor1Name", "BANGKOK")
          .set("SOURCEURL", "http://cnn.com");
  private static final TableRow EVENT_LAOS =
      new TableRow()
          .set("ActionGeo_CountryCode", "VM")
          .set("SQLDATE", "20141212")
          .set("Actor1Name", "LAOS")
          .set("SOURCEURL", "http://www.chicagotribune.com");
  private static final TableRow EVENT_AFGHANISTAN =
      new TableRow()
          .set("ActionGeo_CountryCode", "BE")
          .set("SQLDATE", "20141213")
          .set("Actor1Name", "AFGHANISTAN")
          .set("SOURCEURL", "http://cnn.com");
  static final TableRow[] EVENTS = {EVENT_BANGKOK, EVENT_LAOS, EVENT_AFGHANISTAN};
  static final List<TableRow> EVENT_ARRAY = Arrays.asList(EVENTS);

  // Carried over with the copied test data; not referenced anywhere in this class.
  private static final KV<String, String> EVENT_INFO_LAOS =
      KV.of("VM", "Date: 20141212, Actor1: LAOS, url: http://www.chicagotribune.com");
  private static final KV<String, String> EVENT_INFO_AFGHANISTAN =
      KV.of("BE", "Date: 20141213, Actor1: AFGHANISTAN, url: http://cnn.com");
  private static final KV<String, String> COUNTRY_NAME_BELGIUM = KV.of("BE", "Belgium");
  private static final KV<String, String> COUNTRY_NAME_VIETNAM = KV.of("VM", "Vietnam");

  // Country-code rows used as the second join input.
  private static final TableRow COUNTRY_VIETNAM =
      new TableRow().set("FIPSCC", "VM").set("HumanName", "Vietnam");
  private static final TableRow COUNTRY_BELGIUM =
      new TableRow().set("FIPSCC", "BE").set("HumanName", "Belgium");
  static final TableRow[] CCS = {COUNTRY_VIETNAM, COUNTRY_BELGIUM};
  static final List<TableRow> CC_ARRAY = Arrays.asList(CCS);

  // Expected output lines, one per joined event.
  static final String[] JOINED_EVENTS =
      new String[] {
        "Country code: VM, Country name: Vietnam, Event info: Date: 20141212, Actor1: LAOS, "
            + "url: http://www.chicagotribune.com",
        "Country code: VM, Country name: Vietnam, Event info: Date: 20141212, Actor1: BANGKOK, "
            + "url: http://cnn.com",
        "Country code: BE, Country name: Belgium, Event info: Date: 20141213, Actor1: AFGHANISTAN, "
            + "url: http://cnn.com"
      };

  /** Picks the temporary location the pipeline output is written to. */
  @Override
  protected void preSubmit() throws Exception {
    resultPath = getTempDirPath("result");
  }

  /** Compares the lines found at {@code resultPath} with the newline-joined expected lines. */
  @Override
  protected void postSubmit() throws Exception {
    String expectedLines = Joiner.on('\n').join(JOINED_EVENTS);
    compareResultsByLinesInMemory(expectedLines, resultPath);
  }

  /** Builds and runs the pipeline: create both inputs, join them, write the result as text. */
  @Override
  protected void testProgram() throws Exception {
    Pipeline pipeline = FlinkTestPipeline.create();

    PCollection<TableRow> events = pipeline.apply(Create.of(EVENT_ARRAY));
    PCollection<TableRow> countryCodes = pipeline.apply(Create.of(CC_ARRAY));

    PCollection<String> joined = JoinExamples.joinEvents(events, countryCodes);
    joined.apply(TextIO.Write.to(resultPath));

    pipeline.run();
  }
}
Beispiel #17
0
/**
 * Tests for Top.
 *
 * <p>Covers the global and per-key variants on populated and empty inputs, a requested count of
 * zero, rejection of a negative count, transform names, and display data.
 */
@RunWith(JUnit4.class)
public class TopTest {

  @Rule public ExpectedException expectedEx = ExpectedException.none();

  /** Input for the global tests; contains a duplicate ("c") and one two-character string. */
  static final String[] COLLECTION = new String[] {"a", "bb", "c", "c", "z"};

  static final String[] EMPTY_COLLECTION = new String[] {};

  /** Input for the per-key tests; key "b" contains the duplicate value 10. */
  @SuppressWarnings({"rawtypes", "unchecked"})
  static final KV<String, Integer>[] TABLE =
      new KV[] {
        KV.of("a", 1),
        KV.of("a", 2),
        KV.of("a", 3),
        KV.of("b", 1),
        KV.of("b", 10),
        KV.of("b", 10),
        KV.of("b", 100),
      };

  @SuppressWarnings({"rawtypes", "unchecked"})
  static final KV<String, Integer>[] EMPTY_TABLE = new KV[] {};

  /** Creates a keyed input collection from {@link #TABLE} with an explicit KV coder. */
  public PCollection<KV<String, Integer>> createInputTable(Pipeline p) {
    return p.apply(
        "CreateInputTable",
        Create.of(Arrays.asList(TABLE))
            .withCoder(KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of())));
  }

  /** Creates an empty keyed input collection from {@link #EMPTY_TABLE} with an explicit coder. */
  public PCollection<KV<String, Integer>> createEmptyInputTable(Pipeline p) {
    return p.apply(
        "CreateEmptyInputTable",
        Create.of(Arrays.asList(EMPTY_TABLE))
            .withCoder(KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of())));
  }

  /** Checks the global and per-key variants on populated inputs. */
  @Test
  @SuppressWarnings("unchecked")
  public void testTop() {
    Pipeline p = TestPipeline.create();
    PCollection<String> input =
        p.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

    PCollection<List<String>> top1 = input.apply(Top.of(1, new OrderByLength()));
    PCollection<List<String>> top2 = input.apply(Top.<String>largest(2));
    PCollection<List<String>> top3 = input.apply(Top.<String>smallest(3));

    PCollection<KV<String, Integer>> inputTable = createInputTable(p);
    PCollection<KV<String, List<Integer>>> largestPerKey =
        inputTable.apply(Top.<String, Integer>largestPerKey(2));
    PCollection<KV<String, List<Integer>>> smallestPerKey =
        inputTable.apply(Top.<String, Integer>smallestPerKey(2));

    DataflowAssert.thatSingletonIterable(top1).containsInAnyOrder(Arrays.asList("bb"));
    DataflowAssert.thatSingletonIterable(top2).containsInAnyOrder("z", "c");
    DataflowAssert.thatSingletonIterable(top3).containsInAnyOrder("a", "bb", "c");
    DataflowAssert.that(largestPerKey)
        .containsInAnyOrder(KV.of("a", Arrays.asList(3, 2)), KV.of("b", Arrays.asList(100, 10)));
    DataflowAssert.that(smallestPerKey)
        .containsInAnyOrder(KV.of("a", Arrays.asList(1, 2)), KV.of("b", Arrays.asList(1, 10)));

    p.run();
  }

  /** Checks that every variant yields an empty result on empty inputs. */
  @Test
  @SuppressWarnings("unchecked")
  public void testTopEmpty() {
    Pipeline p = TestPipeline.create();
    PCollection<String> input =
        p.apply(Create.of(Arrays.asList(EMPTY_COLLECTION)).withCoder(StringUtf8Coder.of()));

    PCollection<List<String>> top1 = input.apply(Top.of(1, new OrderByLength()));
    PCollection<List<String>> top2 = input.apply(Top.<String>largest(2));
    PCollection<List<String>> top3 = input.apply(Top.<String>smallest(3));

    PCollection<KV<String, Integer>> inputTable = createEmptyInputTable(p);
    PCollection<KV<String, List<Integer>>> largestPerKey =
        inputTable.apply(Top.<String, Integer>largestPerKey(2));
    PCollection<KV<String, List<Integer>>> smallestPerKey =
        inputTable.apply(Top.<String, Integer>smallestPerKey(2));

    DataflowAssert.thatSingletonIterable(top1).empty();
    DataflowAssert.thatSingletonIterable(top2).empty();
    DataflowAssert.thatSingletonIterable(top3).empty();
    DataflowAssert.that(largestPerKey).empty();
    DataflowAssert.that(smallestPerKey).empty();

    p.run();
  }

  /**
   * Expects applying a global Top to an input windowed into fixed windows to fail with an
   * {@link IllegalStateException} whose message mentions the listed terms.
   */
  @Test
  public void testTopEmptyWithIncompatibleWindows() {
    Pipeline p = TestPipeline.create();
    Bound<String> windowingFn = Window.<String>into(FixedWindows.of(Duration.standardDays(10L)));
    PCollection<String> input =
        p.apply(Create.timestamped(Collections.<String>emptyList(), Collections.<Long>emptyList()))
            .apply(windowingFn);

    expectedEx.expect(IllegalStateException.class);
    expectedEx.expectMessage("Top");
    expectedEx.expectMessage("GlobalWindows");
    expectedEx.expectMessage("withoutDefaults");
    expectedEx.expectMessage("asSingletonView");

    // The exception is expected at application time; the pipeline is never run.
    input.apply(Top.of(1, new OrderByLength()));
  }

  /**
   * Checks a requested count of zero: the global results are empty, while the per-key results
   * still contain every key, each paired with an empty list.
   */
  @Test
  @SuppressWarnings("unchecked")
  public void testTopZero() {
    Pipeline p = TestPipeline.create();
    PCollection<String> input =
        p.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

    PCollection<List<String>> top1 = input.apply(Top.of(0, new OrderByLength()));
    PCollection<List<String>> top2 = input.apply(Top.<String>largest(0));
    PCollection<List<String>> top3 = input.apply(Top.<String>smallest(0));

    PCollection<KV<String, Integer>> inputTable = createInputTable(p);
    PCollection<KV<String, List<Integer>>> largestPerKey =
        inputTable.apply(Top.<String, Integer>largestPerKey(0));

    PCollection<KV<String, List<Integer>>> smallestPerKey =
        inputTable.apply(Top.<String, Integer>smallestPerKey(0));

    DataflowAssert.thatSingletonIterable(top1).empty();
    DataflowAssert.thatSingletonIterable(top2).empty();
    DataflowAssert.thatSingletonIterable(top3).empty();
    DataflowAssert.that(largestPerKey)
        .containsInAnyOrder(
            KV.of("a", Arrays.<Integer>asList()), KV.of("b", Arrays.<Integer>asList()));
    DataflowAssert.that(smallestPerKey)
        .containsInAnyOrder(
            KV.of("a", Arrays.<Integer>asList()), KV.of("b", Arrays.<Integer>asList()));

    p.run();
  }

  // This is a purely compile-time test.  If the code compiles, then it worked.
  // Two distinct comparator classes are used for the two perKey applications.
  @Test
  public void testPerKeySerializabilityRequirement() {
    Pipeline p = TestPipeline.create();
    p.apply(
        "CreateCollection", Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

    PCollection<KV<String, Integer>> inputTable = createInputTable(p);
    inputTable.apply(Top.<String, Integer, IntegerComparator>perKey(1, new IntegerComparator()));

    inputTable.apply(
        "PerKey2", Top.<String, Integer, IntegerComparator2>perKey(1, new IntegerComparator2()));
  }

  /** Expects a negative count to be rejected with a message containing "&gt;= 0". */
  @Test
  public void testCountConstraint() {
    Pipeline p = TestPipeline.create();
    PCollection<String> input =
        p.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

    expectedEx.expect(IllegalArgumentException.class);
    expectedEx.expectMessage(Matchers.containsString(">= 0"));

    input.apply(Top.of(-1, new OrderByLength()));
  }

  /** Checks the default transform names of each factory method. */
  @Test
  public void testTopGetNames() {
    assertEquals("Top.Globally", Top.of(1, new OrderByLength()).getName());
    assertEquals("Smallest.Globally", Top.smallest(1).getName());
    assertEquals("Largest.Globally", Top.largest(2).getName());
    assertEquals("Top.PerKey", Top.perKey(1, new IntegerComparator()).getName());
    assertEquals("Smallest.PerKey", Top.<String, Integer>smallestPerKey(1).getName());
    assertEquals("Largest.PerKey", Top.<String, Integer>largestPerKey(2).getName());
  }

  /** Checks that the display data exposes the requested count and the comparer class. */
  @Test
  public void testDisplayData() {
    Top.Largest<Integer> comparer = new Top.Largest<Integer>();
    Combine.Globally<Integer, List<Integer>> top = Top.of(1234, comparer);
    DisplayData displayData = DisplayData.from(top);

    assertThat(displayData, hasDisplayItem("count", 1234));
    assertThat(displayData, hasDisplayItem("comparer", comparer.getClass()));
  }

  /** Orders strings by length, breaking ties with their natural ordering. */
  private static class OrderByLength implements Comparator<String>, Serializable {
    @Override
    public int compare(String a, String b) {
      if (a.length() != b.length()) {
        // Integer.compare instead of subtraction: the idiomatic, overflow-proof comparison.
        return Integer.compare(a.length(), b.length());
      } else {
        return a.compareTo(b);
      }
    }
  }

  /** Natural-order integer comparator. */
  private static class IntegerComparator implements Comparator<Integer>, Serializable {
    @Override
    public int compare(Integer o1, Integer o2) {
      return o1.compareTo(o2);
    }
  }

  /** A second natural-order integer comparator with a distinct type. */
  private static class IntegerComparator2 implements Comparator<Integer>, Serializable {
    @Override
    public int compare(Integer o1, Integer o2) {
      return o1.compareTo(o2);
    }
  }
}