/**
 * Returns the message at the current (bundle, message) cursor as a {@link WindowedValue} and
 * moves the cursor on to the following message.
 *
 * <p>The message timestamp is converted from microseconds to milliseconds; the windows and pane
 * are decoded from the message metadata. When the value coder is a {@link KvCoder}, the key is
 * decoded from the context's serialized key and paired with the decoded message data.
 *
 * @return the decoded element with its timestamp, windows and pane
 * @throws IOException if querying the input streams fails (decoding presumably reports failures
 *     the same way; confirm against {@code decode})
 */
@Override
public WindowedValue<T> next() throws IOException {
  Windmill.Message current =
      context.getWork().getMessageBundles(bundleIndex).getMessages(messageIndex);

  // Advance the cursor: step inside the bundle, or roll over to the first message of the next
  // bundle once the last message of this bundle has been taken.
  int lastIndexInBundle =
      context.getWork().getMessageBundles(bundleIndex).getMessagesCount() - 1;
  if (messageIndex < lastIndexInBundle) {
    messageIndex++;
  } else {
    messageIndex = 0;
    bundleIndex++;
  }

  Instant timestampMillis =
      new Instant(TimeUnit.MICROSECONDS.toMillis(current.getTimestamp()));
  InputStream payload = current.getData().newInput();
  InputStream metadataStream = current.getMetadata().newInput();
  Collection<? extends BoundedWindow> windows =
      WindmillSink.decodeMetadataWindows(windowsCoder, current.getMetadata());
  PaneInfo paneInfo = WindmillSink.decodeMetadataPane(current.getMetadata());

  if (!(valueCoder instanceof KvCoder)) {
    // Plain value: only the payload and metadata count towards the bytes reported as read.
    notifyElementRead(payload.available() + metadataStream.available());
    return WindowedValue.of(decode(valueCoder, payload), timestampMillis, windows, paneInfo);
  }

  // Keyed value: the key comes from the work item's serialized key, not from the message.
  KvCoder<?, ?> keyedCoder = (KvCoder<?, ?>) valueCoder;
  InputStream keyStream = context.getSerializedKey().newInput();
  notifyElementRead(
      keyStream.available() + payload.available() + metadataStream.available());

  @SuppressWarnings("unchecked")
  T keyedValue =
      (T)
          KV.of(
              decode(keyedCoder.getKeyCoder(), keyStream),
              decode(keyedCoder.getValueCoder(), payload));
  return WindowedValue.of(keyedValue, timestampMillis, windows, paneInfo);
}
/** Checks that the MERGE phase folds each key's list of accumulators into one accumulator. */
@Test
public void testCombineValuesFnMerge() throws Exception {
  MeanInts mean = new MeanInts();
  Combine.KeyedCombineFn<String, Integer, MeanInts.CountSum, String> keyedMean =
      mean.asKeyedFn();
  TestReceiver sink = new TestReceiver();

  ParDoFn mergeFn = createCombineValuesFn(CombineValuesFn.CombinePhase.MERGE, keyedMean);

  mergeFn.startBundle(sink);
  mergeFn.processElement(
      WindowedValue.valueInGlobalWindow(
          KV.of(
              "a",
              Arrays.asList(
                  mean.new CountSum(3, 6),
                  mean.new CountSum(2, 9),
                  mean.new CountSum(1, 12)))));
  mergeFn.processElement(
      WindowedValue.valueInGlobalWindow(
          KV.of("b", Arrays.asList(mean.new CountSum(2, 20), mean.new CountSum(1, 1)))));
  mergeFn.finishBundle();

  // One merged accumulator per key, in the order the keys were processed.
  Object mergedA = WindowedValue.valueInGlobalWindow(KV.of("a", mean.new CountSum(6, 27)));
  Object mergedB = WindowedValue.valueInGlobalWindow(KV.of("b", mean.new CountSum(3, 21)));
  Object[] expected = {mergedA, mergedB};

  assertArrayEquals(expected, sink.receivedElems.toArray());
}
/** Exercises the global and per-key Top transforms on the shared sample inputs. */
@Test
@SuppressWarnings("unchecked")
public void testTop() {
  Pipeline pipeline = TestPipeline.create();

  // Global variants over the string collection.
  PCollection<String> words =
      pipeline.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));
  PCollection<List<String>> longestOne = words.apply(Top.of(1, new OrderByLength()));
  PCollection<List<String>> largestTwo = words.apply(Top.<String>largest(2));
  PCollection<List<String>> smallestThree = words.apply(Top.<String>smallest(3));

  // Per-key variants over the keyed integer table.
  PCollection<KV<String, Integer>> table = createInputTable(pipeline);
  PCollection<KV<String, List<Integer>>> largestByKey =
      table.apply(Top.<String, Integer>largestPerKey(2));
  PCollection<KV<String, List<Integer>>> smallestByKey =
      table.apply(Top.<String, Integer>smallestPerKey(2));

  DataflowAssert.thatSingletonIterable(longestOne).containsInAnyOrder(Arrays.asList("bb"));
  DataflowAssert.thatSingletonIterable(largestTwo).containsInAnyOrder("z", "c");
  DataflowAssert.thatSingletonIterable(smallestThree).containsInAnyOrder("a", "bb", "c");
  DataflowAssert.that(largestByKey)
      .containsInAnyOrder(
          KV.of("a", Arrays.asList(3, 2)), KV.of("b", Arrays.asList(100, 10)));
  DataflowAssert.that(smallestByKey)
      .containsInAnyOrder(
          KV.of("a", Arrays.asList(1, 2)), KV.of("b", Arrays.asList(1, 10)));

  pipeline.run();
}
/** Unit tests for {@link CombinePerKeyExamples}. */
@RunWith(JUnit4.class)
public class CombinePerKeyExamplesTest {

  // Input rows fed to ExtractLargeWordsFn.
  private static final TableRow ROW_1 = inputRow("king_lear", "snuffleupaguses");
  private static final TableRow ROW_2 = inputRow("macbeth", "antidisestablishmentarianism");
  private static final TableRow ROW_3 = inputRow("king_lear", "antidisestablishmentarianism");
  private static final TableRow ROW_4 = inputRow("macbeth", "bob");
  private static final TableRow ROW_5 = inputRow("king_lear", "hi");

  static final TableRow[] ROWS_ARRAY = new TableRow[] {ROW_1, ROW_2, ROW_3, ROW_4, ROW_5};

  // (word, corpus) pairs the extraction step is expected to emit.
  private static final KV<String, String> TUPLE_1 = KV.of("snuffleupaguses", "king_lear");
  private static final KV<String, String> TUPLE_2 =
      KV.of("antidisestablishmentarianism", "macbeth");
  private static final KV<String, String> TUPLE_3 =
      KV.of("antidisestablishmentarianism", "king_lear");

  // (word, comma-joined corpora) pairs fed to FormatShakespeareOutputFn.
  private static final KV<String, String> COMBINED_TUPLE_1 =
      KV.of("antidisestablishmentarianism", "king_lear,macbeth");
  private static final KV<String, String> COMBINED_TUPLE_2 =
      KV.of("snuffleupaguses", "king_lear");

  @SuppressWarnings({"unchecked", "rawtypes"})
  static final KV<String, String>[] COMBINED_TUPLES_ARRAY =
      new KV[] {COMBINED_TUPLE_1, COMBINED_TUPLE_2};

  // Rows the formatting step is expected to emit.
  private static final TableRow RESULT_ROW_1 =
      new TableRow().set("word", "snuffleupaguses").set("all_plays", "king_lear");
  private static final TableRow RESULT_ROW_2 =
      new TableRow()
          .set("word", "antidisestablishmentarianism")
          .set("all_plays", "king_lear,macbeth");

  /** Builds an input row with its "corpus" field set first and its "word" field second. */
  private static TableRow inputRow(String corpus, String word) {
    return new TableRow().set("corpus", corpus).set("word", word);
  }

  @Test
  public void testExtractLargeWordsFn() {
    DoFnTester<TableRow, KV<String, String>> tester = DoFnTester.of(new ExtractLargeWordsFn());

    List<KV<String, String>> extracted = tester.processBatch(ROWS_ARRAY);

    Assert.assertThat(extracted, CoreMatchers.hasItem(TUPLE_1));
    Assert.assertThat(extracted, CoreMatchers.hasItem(TUPLE_2));
    Assert.assertThat(extracted, CoreMatchers.hasItem(TUPLE_3));
  }

  @Test
  public void testFormatShakespeareOutputFn() {
    DoFnTester<KV<String, String>, TableRow> tester =
        DoFnTester.of(new FormatShakespeareOutputFn());

    List<TableRow> formatted = tester.processBatch(COMBINED_TUPLES_ARRAY);

    Assert.assertThat(formatted, CoreMatchers.hasItem(RESULT_ROW_1));
    Assert.assertThat(formatted, CoreMatchers.hasItem(RESULT_ROW_2));
  }
}
/**
 * Applies the combine function to all grouped inputs of the element's key and outputs the
 * result paired with that key.
 */
@Override
public void processElement(ProcessContext c) {
  final KV<K, Iterable<InputT>> grouped = c.element();
  final K groupKey = grouped.getKey();
  final Iterable<InputT> inputs = grouped.getValue();
  c.output(KV.of(groupKey, this.combineFn.apply(groupKey, inputs)));
}
/**
 * Extracts the final output from the element's accumulator and outputs it paired with the
 * element's key.
 */
@Override
public void processElement(ProcessContext c) {
  final KV<K, AccumT> keyedAccumulator = c.element();
  final K groupKey = keyedAccumulator.getKey();
  final AccumT accumulator = keyedAccumulator.getValue();
  c.output(KV.of(groupKey, this.combineFn.extractOutput(groupKey, accumulator)));
}
/**
 * Merges all accumulators grouped under the element's key into a single accumulator and
 * outputs it paired with that key.
 */
@Override
public void processElement(ProcessContext c) {
  final KV<K, Iterable<AccumT>> grouped = c.element();
  final K groupKey = grouped.getKey();
  final Iterable<AccumT> partials = grouped.getValue();
  c.output(KV.of(groupKey, this.combineFn.mergeAccumulators(groupKey, partials)));
}
/**
 * Folds every grouped input of the element's key into a freshly created accumulator and
 * outputs that accumulator (not the extracted output) paired with the key.
 */
@Override
public void processElement(ProcessContext c) {
  final KV<K, Iterable<InputT>> grouped = c.element();
  final K groupKey = grouped.getKey();

  AccumT running = this.combineFn.createAccumulator(groupKey);
  for (InputT value : grouped.getValue()) {
    // addInput returns the accumulator to carry forward, so always keep its result.
    running = this.combineFn.addInput(groupKey, running, value);
  }

  c.output(KV.of(groupKey, running));
}
/**
 * Emits the combined result for window {@code w} and forgets its per-window state.
 *
 * <p>The window's accumulator and minimum timestamp are removed from the given maps; the output
 * extracted from the accumulator is emitted for {@code key} in window {@code w} at that
 * timestamp, with pane {@link PaneInfo#ON_TIME_AND_ONLY_FIRING}.
 *
 * @param key the key whose value is being emitted
 * @param w the window being closed
 * @param accumulators per-window accumulators; the entry for {@code w} is removed
 * @param minTimestamps per-window minimum timestamps; the entry for {@code w} is removed
 * @param c the context used to emit the windowed value
 */
private void closeWindow(
    K key, W w, Map<W, AccumT> accumulators, Map<W, Instant> minTimestamps, ProcessContext c) {
  final AccumT windowAccum = accumulators.remove(w);
  final Instant outputTime = minTimestamps.remove(w);

  // Both maps must have been tracking this window.
  checkState(!(windowAccum == null || outputTime == null));

  c.windowingInternals()
      .outputWindowedValue(
          KV.of(key, combineFn.extractOutput(key, windowAccum)),
          outputTime,
          Arrays.asList(w),
          PaneInfo.ON_TIME_AND_ONLY_FIRING);
}
/** Checks that the ALL phase turns each key's list of inputs directly into the final output. */
@Test
public void testCombineValuesFnAll() throws Exception {
  Combine.KeyedCombineFn<String, Integer, MeanInts.CountSum, String> keyedMean =
      (new MeanInts()).asKeyedFn();
  TestReceiver sink = new TestReceiver();

  ParDoFn combineAllFn = createCombineValuesFn(CombineValuesFn.CombinePhase.ALL, keyedMean);

  combineAllFn.startBundle(sink);
  combineAllFn.processElement(
      WindowedValue.valueInGlobalWindow(KV.of("a", Arrays.asList(5, 6, 7))));
  combineAllFn.processElement(
      WindowedValue.valueInGlobalWindow(KV.of("b", Arrays.asList(1, 3, 7))));
  combineAllFn.processElement(
      WindowedValue.valueInGlobalWindow(KV.of("c", Arrays.asList(3, 6, 8, 9))));
  combineAllFn.finishBundle();

  // Expected means are formatted the same way as the expected strings in the original test.
  Object meanA = WindowedValue.valueInGlobalWindow(KV.of("a", String.format("%.1f", 6.0)));
  Object meanB = WindowedValue.valueInGlobalWindow(KV.of("b", String.format("%.1f", 3.7)));
  Object meanC = WindowedValue.valueInGlobalWindow(KV.of("c", String.format("%.1f", 6.5)));
  Object[] expected = {meanA, meanB, meanC};

  assertArrayEquals(expected, sink.receivedElems.toArray());
}
/** Checks that the EXTRACT phase converts each key's accumulator into the final output. */
@Test
public void testCombineValuesFnExtract() throws Exception {
  MeanInts mean = new MeanInts();
  Combine.KeyedCombineFn<String, Integer, MeanInts.CountSum, String> keyedMean =
      mean.asKeyedFn();
  TestReceiver sink = new TestReceiver();

  ParDoFn extractFn = createCombineValuesFn(CombineValuesFn.CombinePhase.EXTRACT, keyedMean);

  extractFn.startBundle(sink);
  extractFn.processElement(
      WindowedValue.valueInGlobalWindow(KV.of("a", mean.new CountSum(6, 27))));
  extractFn.processElement(
      WindowedValue.valueInGlobalWindow(KV.of("b", mean.new CountSum(3, 21))));
  extractFn.finishBundle();

  Object[] expected = {
    WindowedValue.valueInGlobalWindow(KV.of("a", String.format("%.1f", 4.5))),
    WindowedValue.valueInGlobalWindow(KV.of("b", String.format("%.1f", 7.0)))
  };
  assertArrayEquals(expected, sink.receivedElems.toArray());
}
@Test public void testApproximateUniquePerKey() { List<KV<Long, Long>> elements = Lists.newArrayList(); List<Long> keys = ImmutableList.of(20L, 50L, 100L); int elementCount = 1000; int sampleSize = 100; // Use the key as the number of unique values. for (long uniqueCount : keys) { for (long value = 0; value < elementCount; value++) { elements.add(KV.of(uniqueCount, value % uniqueCount)); } } Pipeline p = TestPipeline.create(); PCollection<KV<Long, Long>> input = p.apply(Create.of(elements)); PCollection<KV<Long, Long>> counts = input.apply(ApproximateUnique.<Long, Long>perKey(sampleSize)); DataflowAssert.that(counts).satisfies(new VerifyEstimatePerKeyFn(sampleSize)); p.run(); }
/**
 * Wraps the element's grouped union values in a {@link CoGbkResult} built with this
 * function's schema and outputs it under the element's key.
 */
@Override
public void processElement(ProcessContext c) {
  final KV<K, Iterable<RawUnionValue>> grouped = c.element();
  final K groupKey = grouped.getKey();
  final Iterable<RawUnionValue> unionValues = grouped.getValue();
  c.output(KV.of(groupKey, new CoGbkResult(schema, unionValues)));
}
/**
 * Re-emits the element under the same key with its value wrapped in a {@link RawUnionValue}
 * carrying this function's {@code index}.
 */
@Override
public void processElement(ProcessContext c) {
  final KV<K, ?> keyed = c.element();
  final K elementKey = keyed.getKey();
  final Object rawValue = keyed.getValue();
  c.output(KV.of(elementKey, new RawUnionValue(index, rawValue)));
}
/** Returns the hash code of the ({@code count}, {@code sum}) pair. */
@Override
public int hashCode() {
  // Delegate to KV so the hash is derived from both fields together.
  final int pairHash = KV.of(count, sum).hashCode();
  return pairHash;
}
/** Unfortunately we need to copy the code from the Dataflow SDK because it is not public there. */ public class JoinExamplesITCase extends JavaProgramTestBase { protected String resultPath; public JoinExamplesITCase() {} private static final TableRow row1 = new TableRow() .set("ActionGeo_CountryCode", "VM") .set("SQLDATE", "20141212") .set("Actor1Name", "BANGKOK") .set("SOURCEURL", "http://cnn.com"); private static final TableRow row2 = new TableRow() .set("ActionGeo_CountryCode", "VM") .set("SQLDATE", "20141212") .set("Actor1Name", "LAOS") .set("SOURCEURL", "http://www.chicagotribune.com"); private static final TableRow row3 = new TableRow() .set("ActionGeo_CountryCode", "BE") .set("SQLDATE", "20141213") .set("Actor1Name", "AFGHANISTAN") .set("SOURCEURL", "http://cnn.com"); static final TableRow[] EVENTS = new TableRow[] {row1, row2, row3}; static final List<TableRow> EVENT_ARRAY = Arrays.asList(EVENTS); private static final KV<String, String> kv1 = KV.of("VM", "Date: 20141212, Actor1: LAOS, url: http://www.chicagotribune.com"); private static final KV<String, String> kv2 = KV.of("BE", "Date: 20141213, Actor1: AFGHANISTAN, url: http://cnn.com"); private static final KV<String, String> kv3 = KV.of("BE", "Belgium"); private static final KV<String, String> kv4 = KV.of("VM", "Vietnam"); private static final TableRow cc1 = new TableRow().set("FIPSCC", "VM").set("HumanName", "Vietnam"); private static final TableRow cc2 = new TableRow().set("FIPSCC", "BE").set("HumanName", "Belgium"); static final TableRow[] CCS = new TableRow[] {cc1, cc2}; static final List<TableRow> CC_ARRAY = Arrays.asList(CCS); static final String[] JOINED_EVENTS = new String[] { "Country code: VM, Country name: Vietnam, Event info: Date: 20141212, Actor1: LAOS, " + "url: http://www.chicagotribune.com", "Country code: VM, Country name: Vietnam, Event info: Date: 20141212, Actor1: BANGKOK, " + "url: http://cnn.com", "Country code: BE, Country name: Belgium, Event info: Date: 20141213, Actor1: 
AFGHANISTAN, " + "url: http://cnn.com" }; @Override protected void preSubmit() throws Exception { resultPath = getTempDirPath("result"); } @Override protected void postSubmit() throws Exception { compareResultsByLinesInMemory(Joiner.on('\n').join(JOINED_EVENTS), resultPath); } @Override protected void testProgram() throws Exception { Pipeline p = FlinkTestPipeline.create(); PCollection<TableRow> input1 = p.apply(Create.of(EVENT_ARRAY)); PCollection<TableRow> input2 = p.apply(Create.of(CC_ARRAY)); PCollection<String> output = JoinExamples.joinEvents(input1, input2); output.apply(TextIO.Write.to(resultPath)); p.run(); } }
/** Tests for Top. */
@RunWith(JUnit4.class)
public class TopTest {

  @Rule public ExpectedException expectedEx = ExpectedException.none();

  @SuppressWarnings("unchecked")
  static final String[] COLLECTION = new String[] {"a", "bb", "c", "c", "z"};

  @SuppressWarnings("unchecked")
  static final String[] EMPTY_COLLECTION = new String[] {};

  @SuppressWarnings({"rawtypes", "unchecked"})
  static final KV<String, Integer>[] TABLE =
      new KV[] {
        KV.of("a", 1),
        KV.of("a", 2),
        KV.of("a", 3),
        KV.of("b", 1),
        KV.of("b", 10),
        KV.of("b", 10),
        KV.of("b", 100),
      };

  @SuppressWarnings({"rawtypes", "unchecked"})
  static final KV<String, Integer>[] EMPTY_TABLE = new KV[] {};

  /** Applies a "CreateInputTable" step producing the entries of {@link #TABLE}. */
  public PCollection<KV<String, Integer>> createInputTable(Pipeline p) {
    return p.apply(
        "CreateInputTable",
        Create.of(Arrays.asList(TABLE))
            .withCoder(KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of())));
  }

  /** Applies a "CreateEmptyInputTable" step producing the (empty) {@link #EMPTY_TABLE}. */
  public PCollection<KV<String, Integer>> createEmptyInputTable(Pipeline p) {
    return p.apply(
        "CreateEmptyInputTable",
        Create.of(Arrays.asList(EMPTY_TABLE))
            .withCoder(KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of())));
  }

  @Test
  @SuppressWarnings("unchecked")
  public void testTop() {
    Pipeline pipeline = TestPipeline.create();

    PCollection<String> words =
        pipeline.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));
    PCollection<List<String>> longestOne = words.apply(Top.of(1, new OrderByLength()));
    PCollection<List<String>> largestTwo = words.apply(Top.<String>largest(2));
    PCollection<List<String>> smallestThree = words.apply(Top.<String>smallest(3));

    PCollection<KV<String, Integer>> table = createInputTable(pipeline);
    PCollection<KV<String, List<Integer>>> largestByKey =
        table.apply(Top.<String, Integer>largestPerKey(2));
    PCollection<KV<String, List<Integer>>> smallestByKey =
        table.apply(Top.<String, Integer>smallestPerKey(2));

    DataflowAssert.thatSingletonIterable(longestOne).containsInAnyOrder(Arrays.asList("bb"));
    DataflowAssert.thatSingletonIterable(largestTwo).containsInAnyOrder("z", "c");
    DataflowAssert.thatSingletonIterable(smallestThree).containsInAnyOrder("a", "bb", "c");
    DataflowAssert.that(largestByKey)
        .containsInAnyOrder(
            KV.of("a", Arrays.asList(3, 2)), KV.of("b", Arrays.asList(100, 10)));
    DataflowAssert.that(smallestByKey)
        .containsInAnyOrder(
            KV.of("a", Arrays.asList(1, 2)), KV.of("b", Arrays.asList(1, 10)));

    pipeline.run();
  }

  @Test
  @SuppressWarnings("unchecked")
  public void testTopEmpty() {
    Pipeline pipeline = TestPipeline.create();

    PCollection<String> words =
        pipeline.apply(
            Create.of(Arrays.asList(EMPTY_COLLECTION)).withCoder(StringUtf8Coder.of()));
    PCollection<List<String>> longestOne = words.apply(Top.of(1, new OrderByLength()));
    PCollection<List<String>> largestTwo = words.apply(Top.<String>largest(2));
    PCollection<List<String>> smallestThree = words.apply(Top.<String>smallest(3));

    PCollection<KV<String, Integer>> table = createEmptyInputTable(pipeline);
    PCollection<KV<String, List<Integer>>> largestByKey =
        table.apply(Top.<String, Integer>largestPerKey(2));
    PCollection<KV<String, List<Integer>>> smallestByKey =
        table.apply(Top.<String, Integer>smallestPerKey(2));

    DataflowAssert.thatSingletonIterable(longestOne).empty();
    DataflowAssert.thatSingletonIterable(largestTwo).empty();
    DataflowAssert.thatSingletonIterable(smallestThree).empty();
    DataflowAssert.that(largestByKey).empty();
    DataflowAssert.that(smallestByKey).empty();

    pipeline.run();
  }

  @Test
  public void testTopEmptyWithIncompatibleWindows() {
    Pipeline pipeline = TestPipeline.create();
    Bound<String> tenDayWindows =
        Window.<String>into(FixedWindows.of(Duration.standardDays(10L)));

    PCollection<String> windowedWords =
        pipeline
            .apply(
                Create.timestamped(
                    Collections.<String>emptyList(), Collections.<Long>emptyList()))
            .apply(tenDayWindows);

    // Expectations must be registered before the apply() call that is expected to throw.
    expectedEx.expect(IllegalStateException.class);
    expectedEx.expectMessage("Top");
    expectedEx.expectMessage("GlobalWindows");
    expectedEx.expectMessage("withoutDefaults");
    expectedEx.expectMessage("asSingletonView");

    windowedWords.apply(Top.of(1, new OrderByLength()));
  }

  @Test
  @SuppressWarnings("unchecked")
  public void testTopZero() {
    Pipeline pipeline = TestPipeline.create();

    PCollection<String> words =
        pipeline.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));
    PCollection<List<String>> longestNone = words.apply(Top.of(0, new OrderByLength()));
    PCollection<List<String>> largestNone = words.apply(Top.<String>largest(0));
    PCollection<List<String>> smallestNone = words.apply(Top.<String>smallest(0));

    PCollection<KV<String, Integer>> table = createInputTable(pipeline);
    PCollection<KV<String, List<Integer>>> largestByKey =
        table.apply(Top.<String, Integer>largestPerKey(0));
    PCollection<KV<String, List<Integer>>> smallestByKey =
        table.apply(Top.<String, Integer>smallestPerKey(0));

    DataflowAssert.thatSingletonIterable(longestNone).empty();
    DataflowAssert.thatSingletonIterable(largestNone).empty();
    DataflowAssert.thatSingletonIterable(smallestNone).empty();
    // Per-key results still contain every key, each mapped to an empty list.
    DataflowAssert.that(largestByKey)
        .containsInAnyOrder(
            KV.of("a", Arrays.<Integer>asList()), KV.of("b", Arrays.<Integer>asList()));
    DataflowAssert.that(smallestByKey)
        .containsInAnyOrder(
            KV.of("a", Arrays.<Integer>asList()), KV.of("b", Arrays.<Integer>asList()));

    pipeline.run();
  }

  // This is a purely compile-time test. If the code compiles, then it worked.
  @Test
  public void testPerKeySerializabilityRequirement() {
    Pipeline pipeline = TestPipeline.create();
    pipeline.apply(
        "CreateCollection",
        Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

    PCollection<KV<String, Integer>> table = createInputTable(pipeline);
    table.apply(Top.<String, Integer, IntegerComparator>perKey(1, new IntegerComparator()));
    table.apply(
        "PerKey2",
        Top.<String, Integer, IntegerComparator2>perKey(1, new IntegerComparator2()));
  }

  @Test
  public void testCountConstraint() {
    Pipeline pipeline = TestPipeline.create();
    PCollection<String> words =
        pipeline.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

    expectedEx.expect(IllegalArgumentException.class);
    expectedEx.expectMessage(Matchers.containsString(">= 0"));

    words.apply(Top.of(-1, new OrderByLength()));
  }

  @Test
  public void testTopGetNames() {
    // Global variants.
    assertEquals("Top.Globally", Top.of(1, new OrderByLength()).getName());
    assertEquals("Smallest.Globally", Top.smallest(1).getName());
    assertEquals("Largest.Globally", Top.largest(2).getName());
    // Per-key variants.
    assertEquals("Top.PerKey", Top.perKey(1, new IntegerComparator()).getName());
    assertEquals("Smallest.PerKey", Top.<String, Integer>smallestPerKey(1).getName());
    assertEquals("Largest.PerKey", Top.<String, Integer>largestPerKey(2).getName());
  }

  @Test
  public void testDisplayData() {
    Top.Largest<Integer> largestFirst = new Top.Largest<Integer>();
    Combine.Globally<Integer, List<Integer>> topTransform = Top.of(1234, largestFirst);

    DisplayData shown = DisplayData.from(topTransform);

    assertThat(shown, hasDisplayItem("count", 1234));
    assertThat(shown, hasDisplayItem("comparer", largestFirst.getClass()));
  }

  /** Orders strings by length, breaking ties with their natural ordering. */
  private static class OrderByLength implements Comparator<String>, Serializable {
    @Override
    public int compare(String a, String b) {
      if (a.length() == b.length()) {
        return a.compareTo(b);
      }
      return a.length() - b.length();
    }
  }

  /** Serializable comparator using the natural ordering of integers. */
  private static class IntegerComparator implements Comparator<Integer>, Serializable {
    @Override
    public int compare(Integer o1, Integer o2) {
      final int natural = o1.compareTo(o2);
      return natural;
    }
  }

  /** A second, distinct comparator type with the same natural integer ordering. */
  private static class IntegerComparator2 implements Comparator<Integer>, Serializable {
    @Override
    public int compare(Integer o1, Integer o2) {
      final int natural = o1.compareTo(o2);
      return natural;
    }
  }
}