/**
   * Runs {@code ApproximateUnique.globally(sampleSize)} over {@code TEST_LINES} and pairs the
   * resulting estimate with the exact distinct count, which is computed with
   * {@code RemoveDuplicates} followed by {@code Count} and delivered as a singleton side input.
   * The pairs are handed to {@code VerifyEstimatePerKeyFn}, which is expected to enforce the
   * maximum allowed estimation error of {@code 2/sqrt(sampleSize)}.
   */
  private static void runApproximateUniquePipeline(int sampleSize) {
    Pipeline pipeline = TestPipeline.create();
    PCollection<String> lines = pipeline.apply(Create.of(TEST_LINES));

    // Estimated number of distinct lines.
    PCollection<Long> estimatedCount =
        lines.apply(ApproximateUnique.<String>globally(sampleSize));

    // Exact number of distinct lines, exposed as a side input.
    PCollection<String> distinctLines = lines.apply(RemoveDuplicates.<String>create());
    PCollection<Long> distinctCount = distinctLines.apply(Count.<String>globally());
    final PCollectionView<Long> exactCountView = distinctCount.apply(View.<Long>asSingleton());

    // Emits KV(estimate, exact count) for the single estimate element.
    DoFn<Long, KV<Long, Long>> pairWithExactCountFn =
        new DoFn<Long, KV<Long, Long>>() {
          @Override
          public void processElement(ProcessContext c) {
            Long estimate = c.element();
            Long exactCount = c.sideInput(exactCountView);
            c.output(KV.of(estimate, exactCount));
          }
        };
    PCollection<KV<Long, Long>> estimateAndExact =
        estimatedCount.apply(ParDo.of(pairWithExactCountFn).withSideInputs(exactCountView));

    DataflowAssert.that(estimateAndExact).satisfies(new VerifyEstimatePerKeyFn(sampleSize));

    pipeline.run();
  }
Exemplo n.º 2
0
  /**
   * Applies the global and per-key {@code Top} transforms to empty inputs and asserts that the
   * global results are empty singleton iterables and the per-key results are empty collections.
   */
  @Test
  @SuppressWarnings("unchecked")
  public void testTopEmpty() {
    Pipeline pipeline = TestPipeline.create();

    // Global variants over an empty collection of strings.
    PCollection<String> emptyStrings =
        pipeline.apply(
            Create.of(Arrays.asList(EMPTY_COLLECTION)).withCoder(StringUtf8Coder.of()));
    PCollection<List<String>> topByLength = emptyStrings.apply(Top.of(1, new OrderByLength()));
    PCollection<List<String>> twoLargest = emptyStrings.apply(Top.<String>largest(2));
    PCollection<List<String>> threeSmallest = emptyStrings.apply(Top.<String>smallest(3));

    // Per-key variants over an empty table.
    PCollection<KV<String, Integer>> emptyTable = createEmptyInputTable(pipeline);
    PCollection<KV<String, List<Integer>>> largestByKey =
        emptyTable.apply(Top.<String, Integer>largestPerKey(2));
    PCollection<KV<String, List<Integer>>> smallestByKey =
        emptyTable.apply(Top.<String, Integer>smallestPerKey(2));

    DataflowAssert.thatSingletonIterable(topByLength).empty();
    DataflowAssert.thatSingletonIterable(twoLargest).empty();
    DataflowAssert.thatSingletonIterable(threeSmallest).empty();
    DataflowAssert.that(largestByKey).empty();
    DataflowAssert.that(smallestByKey).empty();

    pipeline.run();
  }
Exemplo n.º 3
0
  /**
   * Applies the global and per-key {@code Top} transforms to {@code COLLECTION} and the table
   * returned by {@code createInputTable}, then asserts the expected top elements for each variant.
   */
  @Test
  @SuppressWarnings("unchecked")
  public void testTop() {
    Pipeline pipeline = TestPipeline.create();

    // Global variants over the string collection.
    PCollection<String> strings =
        pipeline.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));
    PCollection<List<String>> topByLength = strings.apply(Top.of(1, new OrderByLength()));
    PCollection<List<String>> twoLargest = strings.apply(Top.<String>largest(2));
    PCollection<List<String>> threeSmallest = strings.apply(Top.<String>smallest(3));

    // Per-key variants over the key/value table.
    PCollection<KV<String, Integer>> table = createInputTable(pipeline);
    PCollection<KV<String, List<Integer>>> largestByKey =
        table.apply(Top.<String, Integer>largestPerKey(2));
    PCollection<KV<String, List<Integer>>> smallestByKey =
        table.apply(Top.<String, Integer>smallestPerKey(2));

    DataflowAssert.thatSingletonIterable(topByLength).containsInAnyOrder(Arrays.asList("bb"));
    DataflowAssert.thatSingletonIterable(twoLargest).containsInAnyOrder("z", "c");
    DataflowAssert.thatSingletonIterable(threeSmallest).containsInAnyOrder("a", "bb", "c");
    DataflowAssert.that(largestByKey)
        .containsInAnyOrder(
            KV.of("a", Arrays.asList(3, 2)),
            KV.of("b", Arrays.asList(100, 10)));
    DataflowAssert.that(smallestByKey)
        .containsInAnyOrder(
            KV.of("a", Arrays.asList(1, 2)),
            KV.of("b", Arrays.asList(1, 10)));

    pipeline.run();
  }
Exemplo n.º 4
0
  /**
   * Expects an {@code IllegalArgumentException} whose message contains {@code ">= 0"} when
   * {@code Top.of} is given a count of {@code -1}.
   */
  @Test
  public void testCountConstraint() {
    Pipeline pipeline = TestPipeline.create();
    PCollection<String> strings =
        pipeline.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

    // Register the expectation before building the transform with the invalid count.
    expectedEx.expect(IllegalArgumentException.class);
    expectedEx.expectMessage(Matchers.containsString(">= 0"));

    OrderByLength byLength = new OrderByLength();
    strings.apply(Top.of(-1, byLength));
  }
 /**
  * Builds a large in-memory {@code PCollection<String>} by repeating {@code TestUtils.LINES}
  * 1000 times and creating a collection from the result in the given pipeline.
  */
 private PCollection<String> readPCollection(Pipeline p) {
   // TODO: Read PCollection from a set of text files.
   final int repetitions = 1000;
   List<String> singlePage = TestUtils.LINES;
   int expectedSize = repetitions * singlePage.size();

   List<String> allLines = new ArrayList<>(expectedSize);
   for (int remaining = repetitions; remaining > 0; remaining--) {
     allLines.addAll(singlePage);
   }
   assert allLines.size() == expectedSize;

   return p.apply(Create.of(allLines));
 }
Exemplo n.º 6
0
  /**
   * Purely compile-time test: if the {@code Top.perKey} calls below compile with both comparator
   * types, the test has succeeded. The pipeline is never run.
   */
  @Test
  public void testPerKeySerializabilityRequirement() {
    Pipeline pipeline = TestPipeline.create();
    pipeline.apply(
        "CreateCollection",
        Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

    PCollection<KV<String, Integer>> table = createInputTable(pipeline);

    IntegerComparator firstComparator = new IntegerComparator();
    table.apply(Top.<String, Integer, IntegerComparator>perKey(1, firstComparator));

    IntegerComparator2 secondComparator = new IntegerComparator2();
    table.apply(
        "PerKey2", Top.<String, Integer, IntegerComparator2>perKey(1, secondComparator));
  }
  /**
   * Runs {@code ApproximateUnique.globally(1000)} over the four integers 1, 2, 3, 3 and asserts
   * that the single resulting estimate equals 3.
   */
  @Test
  @Category(RunnableOnService.class)
  public void testApproximateUniqueWithSmallInput() {
    Pipeline pipeline = TestPipeline.create();

    // Four elements, three of them distinct.
    PCollection<Integer> numbers = pipeline.apply(Create.of(Arrays.asList(1, 2, 3, 3)));
    PCollection<Long> distinctEstimate =
        numbers.apply(ApproximateUnique.<Integer>globally(1000));

    DataflowAssert.thatSingleton(distinctEstimate).isEqualTo(3L);

    pipeline.run();
  }
Exemplo n.º 8
0
  /**
   * Example of testing a PTransform with in-memory input: feeds {@code WORDS} through
   * {@code CountWords} and {@code FormatAsTextFn}, then checks the output against
   * {@code COUNTS_ARRAY} in any order.
   */
  @Test
  @Category(RunnableOnService.class)
  public void testCountWords() throws Exception {
    Pipeline pipeline = TestPipeline.create();

    PCollection<String> words =
        pipeline.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));

    // Count the words, then render each count as a line of text.
    PCollection<String> formattedCounts =
        words
            .apply(new CountWords())
            .apply(ParDo.of(new FormatAsTextFn()));

    DataflowAssert.that(formattedCounts).containsInAnyOrder(COUNTS_ARRAY);

    pipeline.run();
  }
  /**
   * Keys four numeric strings by their parsed integer value using a lambda together with an
   * explicit {@code Integer} key type descriptor, and asserts the resulting key/value pairs.
   */
  @Test
  @Category(RunnableOnService.class)
  public void withLambdaAndTypeDescriptorShouldSucceed() {

    PCollection<String> numerals = p.apply(Create.of("1234", "3210", "0", "-12"));

    // The key type is supplied explicitly alongside the lambda.
    WithKeys<Integer, String> keyByParsedValue =
        WithKeys.of((String numeral) -> Integer.valueOf(numeral))
            .withKeyType(TypeDescriptor.of(Integer.class));
    PCollection<KV<Integer, String>> keyed = numerals.apply(keyByParsedValue);

    PAssert.that(keyed)
        .containsInAnyOrder(
            KV.of(1234, "1234"),
            KV.of(0, "0"),
            KV.of(-12, "-12"),
            KV.of(3210, "3210"));

    p.run();
  }
  /**
   * Applies {@code WithKeys} with a lambda but no key type descriptor and expects an
   * {@code IllegalStateException} whose message reports that no default coder could be
   * determined for the step named {@code ApplyKeysWithWithKeys}.
   */
  @Test
  public void withLambdaAndNoTypeDescriptorShouldThrow() {

    PCollection<String> numerals = p.apply(Create.of("1234", "3210", "0", "-12"));

    numerals.apply(
        "ApplyKeysWithWithKeys", WithKeys.of((String numeral) -> Integer.valueOf(numeral)));

    // NOTE(review): the expectations are registered only after the transform has been applied,
    // so (assuming `thrown` is a JUnit ExpectedException rule) the failure has to surface from
    // p.run() below for this test to pass.
    thrown.expect(IllegalStateException.class);
    thrown.expectMessage("Unable to return a default Coder for ApplyKeysWithWithKeys");
    thrown.expectMessage("No Coder has been manually specified");
    thrown.expectMessage(
        containsString("Building a Coder using a registered CoderFactory failed"));
    thrown.expectMessage(
        containsString("Building a Coder from the @DefaultCoder annotation failed"));
    thrown.expectMessage(
        containsString("Building a Coder from the fallback CoderProvider failed"));

    p.run();
  }
  /**
   * Generates {@code elementCount} doubles that cycle through {@code uniqueCount} distinct values
   * ({@code 1.0 / 1} through {@code 1.0 / uniqueCount}), shuffles them, runs
   * {@code ApproximateUnique.globally(sampleSize)} and checks the estimate with
   * {@code VerifyEstimateFn}.
   *
   * @param elementCount total number of elements to generate; asserted to be at least
   *     {@code uniqueCount}
   * @param uniqueCount number of distinct values among the generated elements
   * @param sampleSize sample size passed to {@code ApproximateUnique} and to the verifier
   */
  private void runApproximateUniqueWithDuplicates(
      int elementCount, int uniqueCount, int sampleSize) {

    assert elementCount >= uniqueCount;

    List<Double> values = Lists.newArrayList();
    int index = 0;
    while (index < elementCount) {
      // The divisor cycles through 1..uniqueCount, giving uniqueCount distinct quotients.
      int divisor = index % uniqueCount + 1;
      values.add(1.0 / divisor);
      index++;
    }
    Collections.shuffle(values);

    Pipeline pipeline = TestPipeline.create();
    PCollection<Double> input = pipeline.apply(Create.of(values));
    PCollection<Long> distinctEstimate =
        input.apply(ApproximateUnique.<Double>globally(sampleSize));

    DataflowAssert.thatSingleton(distinctEstimate)
        .satisfies(new VerifyEstimateFn(uniqueCount, sampleSize));

    pipeline.run();
  }
  /**
   * Builds 1000 values for each of the keys 20, 50 and 100, where each key equals the number of
   * distinct values generated for it, runs {@code ApproximateUnique.perKey(100)} and checks the
   * per-key estimates with {@code VerifyEstimatePerKeyFn}.
   */
  @Test
  public void testApproximateUniquePerKey() {
    final int valuesPerKey = 1000;
    final int sampleSize = 100;
    List<Long> distinctCounts = ImmutableList.of(20L, 50L, 100L);

    // Each key doubles as the number of distinct values generated for it.
    List<KV<Long, Long>> keyedValues = Lists.newArrayList();
    for (long distinctCount : distinctCounts) {
      long index = 0;
      while (index < valuesPerKey) {
        keyedValues.add(KV.of(distinctCount, index % distinctCount));
        index++;
      }
    }

    Pipeline pipeline = TestPipeline.create();
    PCollection<KV<Long, Long>> input = pipeline.apply(Create.of(keyedValues));
    PCollection<KV<Long, Long>> estimatesByKey =
        input.apply(ApproximateUnique.<Long, Long>perKey(sampleSize));

    DataflowAssert.that(estimatesByKey).satisfies(new VerifyEstimatePerKeyFn(sampleSize));

    pipeline.run();
  }
  /**
   * Runs {@code ApproximateUnique.globally(sampleSize)} over integers whose frequencies follow a
   * Zipf-like distribution and checks the estimate with {@code VerifyEstimateFn}.
   *
   * <p>The value {@code k} (for {@code k} in {@code 0..uniqueCount-1}) is assigned the 1-based
   * Zipf rank {@code k + 1} and occurs {@code round(maxCount * (k + 1)^-s)} times, but never
   * fewer than once, so exactly {@code uniqueCount} distinct values are generated.
   *
   * @param elementCount approximate total number of elements to generate
   * @param uniqueCount number of distinct values to generate
   * @param sampleSize sample size passed to {@code ApproximateUnique} and to the verifier
   */
  private void runApproximateUniqueWithSkewedDistributions(
      int elementCount, final int uniqueCount, final int sampleSize) {
    List<Integer> elements = Lists.newArrayList();
    // Zipf distribution with approximately elementCount items.
    double s = 1 - 1.0 * uniqueCount / elementCount;
    double maxCount = Math.pow(uniqueCount, s);
    for (int k = 0; k < uniqueCount; k++) {
      // Zipf ranks are 1-based. Using k directly would evaluate Math.pow(0, -s) == Infinity for
      // the first element; Math.round then yields Long.MAX_VALUE, the int cast turns that into
      // -1, and the clamp below silently reduces the most frequent element to one occurrence.
      int rank = k + 1;
      int count = Math.max(1, (int) Math.round(maxCount * Math.pow(rank, -s)));
      // Element k occurs count times.
      for (int c = 0; c < count; c++) {
        elements.add(k);
      }
    }

    Pipeline p = TestPipeline.create();
    PCollection<Integer> input = p.apply(Create.of(elements));
    PCollection<Long> estimate = input.apply(ApproximateUnique.<Integer>globally(sampleSize));

    DataflowAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize));

    p.run();
  }
Exemplo n.º 14
0
 /**
  * Creates a {@code PCollection} of string/integer pairs from {@code EMPTY_TABLE} in the given
  * pipeline, under the step name {@code CreateEmptyInputTable} and with an explicit key/value
  * coder.
  */
 public PCollection<KV<String, Integer>> createEmptyInputTable(Pipeline p) {
   // The coder is given explicitly rather than inferred from the elements.
   KvCoder<String, Integer> tableCoder =
       KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of());
   return p.apply(
       "CreateEmptyInputTable", Create.of(Arrays.asList(EMPTY_TABLE)).withCoder(tableCoder));
 }