/**
 * Runs {@code ApproximateUnique.globally(sampleSize)} over {@code TEST_LINES} and hands the
 * estimate, paired with the exact distinct count, to {@code VerifyEstimatePerKeyFn}, which is
 * expected to check that the estimation error stays within the maximum allowed error of
 * {@code 2/sqrt(sampleSize)}.
 *
 * @param sampleSize sample size given both to {@code ApproximateUnique} and to the verifier
 */
private static void runApproximateUniquePipeline(int sampleSize) {
  Pipeline pipeline = TestPipeline.create();

  PCollection<String> lines = pipeline.apply(Create.of(TEST_LINES));
  PCollection<Long> estimated = lines.apply(ApproximateUnique.<String>globally(sampleSize));

  // Exact distinct count, exposed as a singleton side input.
  final PCollectionView<Long> exactView =
      lines
          .apply(RemoveDuplicates.<String>create())
          .apply(Count.<String>globally())
          .apply(View.<Long>asSingleton());

  // Emits (estimate, exact) so the verifier sees both numbers in one element.
  DoFn<Long, KV<Long, Long>> pairWithExact =
      new DoFn<Long, KV<Long, Long>>() {
        @Override
        public void processElement(ProcessContext c) {
          c.output(KV.of(c.element(), c.sideInput(exactView)));
        }
      };

  PCollection<KV<Long, Long>> estimateAndExact =
      estimated.apply(ParDo.of(pairWithExact).withSideInputs(exactView));

  DataflowAssert.that(estimateAndExact).satisfies(new VerifyEstimatePerKeyFn(sampleSize));

  pipeline.run();
}
/**
 * Applies the global and per-key {@code Top} transforms to empty inputs and asserts that every
 * result is empty.
 */
@Test
@SuppressWarnings("unchecked")
public void testTopEmpty() {
  Pipeline pipeline = TestPipeline.create();

  PCollection<String> emptyStrings =
      pipeline.apply(
          Create.of(Arrays.asList(EMPTY_COLLECTION)).withCoder(StringUtf8Coder.of()));

  // Global variants: custom comparator, natural largest, natural smallest.
  PCollection<List<String>> byLength = emptyStrings.apply(Top.of(1, new OrderByLength()));
  PCollection<List<String>> largestTwo = emptyStrings.apply(Top.<String>largest(2));
  PCollection<List<String>> smallestThree = emptyStrings.apply(Top.<String>smallest(3));

  // Per-key variants over an empty key/value table.
  PCollection<KV<String, Integer>> emptyTable = createEmptyInputTable(pipeline);
  PCollection<KV<String, List<Integer>>> perKeyLargest =
      emptyTable.apply(Top.<String, Integer>largestPerKey(2));
  PCollection<KV<String, List<Integer>>> perKeySmallest =
      emptyTable.apply(Top.<String, Integer>smallestPerKey(2));

  DataflowAssert.thatSingletonIterable(byLength).empty();
  DataflowAssert.thatSingletonIterable(largestTwo).empty();
  DataflowAssert.thatSingletonIterable(smallestThree).empty();
  DataflowAssert.that(perKeyLargest).empty();
  DataflowAssert.that(perKeySmallest).empty();

  pipeline.run();
}
/**
 * Applies the global and per-key {@code Top} transforms to {@code COLLECTION} and to the table
 * from {@code createInputTable}, then asserts the expected top elements.
 */
@Test
@SuppressWarnings("unchecked")
public void testTop() {
  Pipeline pipeline = TestPipeline.create();

  PCollection<String> strings =
      pipeline.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

  // Global variants: custom comparator, natural largest, natural smallest.
  PCollection<List<String>> byLength = strings.apply(Top.of(1, new OrderByLength()));
  PCollection<List<String>> largestTwo = strings.apply(Top.<String>largest(2));
  PCollection<List<String>> smallestThree = strings.apply(Top.<String>smallest(3));

  // Per-key variants.
  PCollection<KV<String, Integer>> table = createInputTable(pipeline);
  PCollection<KV<String, List<Integer>>> perKeyLargest =
      table.apply(Top.<String, Integer>largestPerKey(2));
  PCollection<KV<String, List<Integer>>> perKeySmallest =
      table.apply(Top.<String, Integer>smallestPerKey(2));

  DataflowAssert.thatSingletonIterable(byLength).containsInAnyOrder(Arrays.asList("bb"));
  DataflowAssert.thatSingletonIterable(largestTwo).containsInAnyOrder("z", "c");
  DataflowAssert.thatSingletonIterable(smallestThree).containsInAnyOrder("a", "bb", "c");

  List<Integer> largestForA = Arrays.asList(3, 2);
  List<Integer> largestForB = Arrays.asList(100, 10);
  DataflowAssert.that(perKeyLargest)
      .containsInAnyOrder(KV.of("a", largestForA), KV.of("b", largestForB));

  List<Integer> smallestForA = Arrays.asList(1, 2);
  List<Integer> smallestForB = Arrays.asList(1, 10);
  DataflowAssert.that(perKeySmallest)
      .containsInAnyOrder(KV.of("a", smallestForA), KV.of("b", smallestForB));

  pipeline.run();
}
/**
 * Expects an {@link IllegalArgumentException} whose message contains {@code ">= 0"} when
 * {@code Top.of} is used with a count of -1.
 */
@Test
public void testCountConstraint() {
  Pipeline pipeline = TestPipeline.create();

  PCollection<String> strings =
      pipeline.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

  // Expectations must be registered before the offending call below.
  expectedEx.expect(IllegalArgumentException.class);
  expectedEx.expectMessage(Matchers.containsString(">= 0"));

  strings.apply(Top.of(-1, new OrderByLength()));
}
/** Reads a large {@code PCollection<String>}. */ private PCollection<String> readPCollection(Pipeline p) { // TODO: Read PCollection from a set of text files. List<String> page = TestUtils.LINES; final int pages = 1000; ArrayList<String> file = new ArrayList<>(pages * page.size()); for (int i = 0; i < pages; i++) { file.addAll(page); } assert file.size() == pages * page.size(); PCollection<String> words = p.apply(Create.of(file)); return words; }
// This is a purely compile-time test. If the code compiles, then it worked. @Test public void testPerKeySerializabilityRequirement() { Pipeline p = TestPipeline.create(); p.apply( "CreateCollection", Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of())); PCollection<KV<String, Integer>> inputTable = createInputTable(p); inputTable.apply(Top.<String, Integer, IntegerComparator>perKey(1, new IntegerComparator())); inputTable.apply( "PerKey2", Top.<String, Integer, IntegerComparator2>perKey(1, new IntegerComparator2())); }
/**
 * Runs {@code ApproximateUnique.globally(1000)} over the four values 1, 2, 3, 3 and asserts
 * that the result is exactly 3.
 */
@Test
@Category(RunnableOnService.class)
public void testApproximateUniqueWithSmallInput() {
  Pipeline pipeline = TestPipeline.create();

  PCollection<Integer> numbers = pipeline.apply(Create.of(Arrays.asList(1, 2, 3, 3)));
  PCollection<Long> distinctEstimate =
      numbers.apply(ApproximateUnique.<Integer>globally(1000));

  DataflowAssert.thatSingleton(distinctEstimate).isEqualTo(3L);

  pipeline.run();
}
/**
 * Example of testing a PTransform end to end: feeds the in-memory {@code WORDS} through
 * {@code CountWords} and {@code FormatAsTextFn}, then compares the formatted output with
 * {@code COUNTS_ARRAY} (order-insensitive).
 */
@Test
@Category(RunnableOnService.class)
public void testCountWords() throws Exception {
  Pipeline pipeline = TestPipeline.create();

  PCollection<String> words = pipeline.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));

  PCollection<String> formattedCounts =
      words
          .apply(new CountWords())
          .apply(ParDo.of(new FormatAsTextFn()));

  DataflowAssert.that(formattedCounts).containsInAnyOrder(COUNTS_ARRAY);

  pipeline.run();
}
/**
 * Keys each numeric string by its parsed integer value using a lambda, with the key type
 * declared explicitly through {@code withKeyType}, and asserts the resulting pairs.
 */
@Test
@Category(RunnableOnService.class)
public void withLambdaAndTypeDescriptorShouldSucceed() {
  PCollection<String> numerals = p.apply(Create.of("1234", "3210", "0", "-12"));

  PCollection<KV<Integer, String>> keyed =
      numerals.apply(
          WithKeys.of((String text) -> Integer.valueOf(text))
              .withKeyType(TypeDescriptor.of(Integer.class)));

  PAssert.that(keyed)
      .containsInAnyOrder(
          KV.of(1234, "1234"),
          KV.of(0, "0"),
          KV.of(-12, "-12"),
          KV.of(3210, "3210"));

  p.run();
}
/**
 * Applies {@code WithKeys} with a lambda but without a key type descriptor and expects an
 * {@link IllegalStateException} carrying the coder-inference failure messages listed below.
 */
@Test
public void withLambdaAndNoTypeDescriptorShouldThrow() {
  PCollection<String> numerals = p.apply(Create.of("1234", "3210", "0", "-12"));

  numerals.apply(
      "ApplyKeysWithWithKeys", WithKeys.of((String text) -> Integer.valueOf(text)));

  // Expectations are registered after the apply above and before the pipeline is run.
  thrown.expect(IllegalStateException.class);
  thrown.expectMessage("Unable to return a default Coder for ApplyKeysWithWithKeys");
  thrown.expectMessage("No Coder has been manually specified");
  thrown.expectMessage(
      containsString("Building a Coder using a registered CoderFactory failed"));
  thrown.expectMessage(
      containsString("Building a Coder from the @DefaultCoder annotation failed"));
  thrown.expectMessage(
      containsString("Building a Coder from the fallback CoderProvider failed"));

  p.run();
}
/**
 * Runs {@code ApproximateUnique.globally(sampleSize)} over a shuffled list of
 * {@code elementCount} doubles containing {@code uniqueCount} distinct values, and passes the
 * estimate to {@code VerifyEstimateFn}.
 *
 * @param elementCount total number of elements generated; asserted to be at least
 *     {@code uniqueCount}
 * @param uniqueCount number of distinct values the elements cycle through
 * @param sampleSize sample size given to {@code ApproximateUnique} and to the verifier
 */
private void runApproximateUniqueWithDuplicates(
    int elementCount, int uniqueCount, int sampleSize) {
  assert elementCount >= uniqueCount;

  // Values are 1/1, 1/2, ..., 1/uniqueCount, repeated cyclically.
  List<Double> values = Lists.newArrayList();
  for (int index = 0; index < elementCount; index++) {
    int denominator = index % uniqueCount + 1;
    values.add(1.0 / denominator);
  }
  Collections.shuffle(values);

  Pipeline pipeline = TestPipeline.create();
  PCollection<Double> doubles = pipeline.apply(Create.of(values));
  PCollection<Long> distinctEstimate =
      doubles.apply(ApproximateUnique.<Double>globally(sampleSize));

  DataflowAssert.thatSingleton(distinctEstimate)
      .satisfies(new VerifyEstimateFn(uniqueCount, sampleSize));

  pipeline.run();
}
@Test public void testApproximateUniquePerKey() { List<KV<Long, Long>> elements = Lists.newArrayList(); List<Long> keys = ImmutableList.of(20L, 50L, 100L); int elementCount = 1000; int sampleSize = 100; // Use the key as the number of unique values. for (long uniqueCount : keys) { for (long value = 0; value < elementCount; value++) { elements.add(KV.of(uniqueCount, value % uniqueCount)); } } Pipeline p = TestPipeline.create(); PCollection<KV<Long, Long>> input = p.apply(Create.of(elements)); PCollection<KV<Long, Long>> counts = input.apply(ApproximateUnique.<Long, Long>perKey(sampleSize)); DataflowAssert.that(counts).satisfies(new VerifyEstimatePerKeyFn(sampleSize)); p.run(); }
private void runApproximateUniqueWithSkewedDistributions( int elementCount, final int uniqueCount, final int sampleSize) { List<Integer> elements = Lists.newArrayList(); // Zipf distribution with approximately elementCount items. double s = 1 - 1.0 * uniqueCount / elementCount; double maxCount = Math.pow(uniqueCount, s); for (int k = 0; k < uniqueCount; k++) { int count = Math.max(1, (int) Math.round(maxCount * Math.pow(k, -s))); // Element k occurs count times. for (int c = 0; c < count; c++) { elements.add(k); } } Pipeline p = TestPipeline.create(); PCollection<Integer> input = p.apply(Create.of(elements)); PCollection<Long> estimate = input.apply(ApproximateUnique.<Integer>globally(sampleSize)); DataflowAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize)); p.run(); }
/**
 * Creates a {@code PCollection} from {@code EMPTY_TABLE}, encoded with a
 * {@code KvCoder} of UTF-8 strings and big-endian integers.
 *
 * @param p the pipeline the collection is created in
 * @return the collection produced by the "CreateEmptyInputTable" step
 */
public PCollection<KV<String, Integer>> createEmptyInputTable(Pipeline p) {
  KvCoder<String, Integer> tableCoder =
      KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of());
  return p.apply(
      "CreateEmptyInputTable", Create.of(Arrays.asList(EMPTY_TABLE)).withCoder(tableCoder));
}