@Test(dataProvider = "bases")
  public void addContextDataTest(
      List<GATKRead> reads,
      List<Variant> variantList,
      List<KV<GATKRead, ReferenceBases>> kvReadRefBases,
      List<KV<GATKRead, ReadContextData>> kvReadContextData,
      List<SimpleInterval> intervals,
      List<KV<GATKRead, Iterable<Variant>>> kvReadiVariant) {
    Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);

    PCollection<GATKRead> pReads =
        DataflowTestUtils.pCollectionCreateAndVerify(p, reads, new GATKReadCoder());
    PCollection<KV<GATKRead, ReferenceBases>> pReadRef =
        DataflowTestUtils.pCollectionCreateAndVerify(
            p,
            kvReadRefBases,
            KvCoder.of(new GATKReadCoder(), SerializableCoder.of(ReferenceBases.class)));

    PCollection<KV<GATKRead, Iterable<Variant>>> pReadVariants =
        p.apply(
            Create.of(kvReadiVariant)
                .withCoder(KvCoder.of(new GATKReadCoder(), IterableCoder.of(new VariantCoder()))));

    PCollection<KV<GATKRead, ReadContextData>> joinedResults =
        AddContextDataToRead.join(pReads, pReadRef, pReadVariants);
    PCollection<KV<GATKRead, ReadContextData>> pkvReadContextData =
        p.apply(
            Create.of(kvReadContextData)
                .withCoder(KvCoder.of(new GATKReadCoder(), new ReadContextDataCoder())));
    DataflowTestUtils.keyReadContextDataMatcher(joinedResults, pkvReadContextData);
    p.run();
  }
  /**
   * Runs {@code JoinExamples.joinEvents} over the two in-memory inputs on a Flink test pipeline and
   * writes the joined lines to {@code resultPath}.
   */
  @Override
  protected void testProgram() throws Exception {
    final Pipeline pipeline = FlinkTestPipeline.create();

    final PCollection<TableRow> events = pipeline.apply(Create.of(EVENT_ARRAY));
    final PCollection<TableRow> codes = pipeline.apply(Create.of(CC_ARRAY));

    final PCollection<String> joined = JoinExamples.joinEvents(events, codes);
    joined.apply(TextIO.Write.to(resultPath));

    pipeline.run();
  }
  /**
   * Writes three lines to a gzip-compressed temporary file and asserts that {@code TextIO.Read}
   * with {@code CompressionType.GZIP} yields exactly those lines.
   */
  @Test
  public void testCompressedRead() throws Exception {
    final String[] lines = {"Irritable eagle", "Optimistic jay", "Fanciful hawk"};
    final File gzipFile = tmpFolder.newFile("test");
    final String gzipPath = gzipFile.getPath();

    // Every line written to the file is also what the pipeline is expected to read back.
    final List<String> expectedLines = new ArrayList<>();
    try (PrintStream gzipWriter =
        new PrintStream(new GZIPOutputStream(new FileOutputStream(gzipFile)))) {
      for (int i = 0; i < lines.length; i++) {
        gzipWriter.println(lines[i]);
        expectedLines.add(lines[i]);
      }
    }

    final Pipeline pipeline = TestPipeline.create();

    final TextIO.Read.Bound<String> gzipRead =
        TextIO.Read.from(gzipPath).withCompressionType(CompressionType.GZIP);
    final PCollection<String> decompressed = pipeline.apply(gzipRead);

    DataflowAssert.that(decompressed).containsInAnyOrder(expectedLines);
    pipeline.run();

    gzipFile.delete();
  }
  /**
   * Writes each element of {@code expected}, encoded with {@code coder}, as one line of a temporary
   * file and asserts that {@code TextIO.Read} produces the same elements.
   *
   * @param expected the elements to write and then expect back, in any order
   * @param coder the coder used both to encode the file contents and to decode them when reading
   * @throws Exception if the temporary file cannot be created or written, or encoding fails
   */
  <T> void runTestRead(T[] expected, Coder<T> coder) throws Exception {
    File tmpFile = tmpFolder.newFile("file.txt");
    String filename = tmpFile.getPath();

    // Write the coder's bytes verbatim. Turning them into a String and printing that String would
    // decode and re-encode them with the platform default charset, which can corrupt the data.
    try (FileOutputStream out = new FileOutputStream(tmpFile)) {
      for (T elem : expected) {
        out.write(CoderUtils.encodeToByteArray(coder, elem));
        out.write('\n');
      }
    }

    Pipeline p = TestPipeline.create();

    TextIO.Read.Bound<T> read;
    if (coder.equals(StringUtf8Coder.of())) {
      TextIO.Read.Bound<String> readStrings = TextIO.Read.from(filename);
      // T==String
      @SuppressWarnings("unchecked")
      TextIO.Read.Bound<T> castRead = (TextIO.Read.Bound<T>) readStrings;
      read = castRead;
    } else {
      read = TextIO.Read.from(filename).withCoder(coder);
    }

    PCollection<T> output = p.apply(read);

    DataflowAssert.that(output).containsInAnyOrder(expected);
    p.run();
  }
  /**
   * Checks {@code GATKReadCoder} by placing {@code reads} in a {@code PCollection}, both directly
   * and after an identity {@code ParDo}, and asserting that the contents still match the list.
   */
  @Test(dataProvider = "reads")
  public void testGATKReadCoding(final List<GATKRead> reads) {
    final Pipeline pipeline = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(pipeline);

    // The coder is set explicitly: Create cannot infer one when the list mixes different
    // GATKRead implementations.
    final PCollection<GATKRead> created =
        pipeline.apply(Create.of(reads).withCoder(new GATKReadCoder()));
    DataflowAssert.that(created).containsInAnyOrder(reads);

    // Pass-through function: emits every element unchanged.
    final DoFn<GATKRead, GATKRead> identity =
        new DoFn<GATKRead, GATKRead>() {
          private static final long serialVersionUID = 1L;

          @Override
          public void processElement(ProcessContext c) throws Exception {
            c.output(c.element());
          }
        };

    final PCollection<GATKRead> passedThrough =
        created.apply(ParDo.of(identity)).setCoder(new GATKReadCoder());
    DataflowAssert.that(passedThrough).containsInAnyOrder(reads);

    pipeline.run();
  }
  /**
   * Runs {@code ApproximateUnique.globally(sampleSize)} over {@code TEST_LINES}, pairs the estimate
   * with the exact distinct count (computed with {@code RemoveDuplicates} and {@code Count} and
   * passed as a side input), and hands the pair to {@code VerifyEstimatePerKeyFn}, which is
   * expected to enforce the maximum allowed error of {@code 2/sqrt(sampleSize)}.
   */
  private static void runApproximateUniquePipeline(int sampleSize) {
    final Pipeline pipeline = TestPipeline.create();

    final PCollection<String> lines = pipeline.apply(Create.of(TEST_LINES));
    final PCollection<Long> estimate =
        lines.apply(ApproximateUnique.<String>globally(sampleSize));
    final PCollectionView<Long> exact =
        lines
            .apply(RemoveDuplicates.<String>create())
            .apply(Count.<String>globally())
            .apply(View.<Long>asSingleton());

    // Emits (estimate, exact) so that both numbers reach the verifier together.
    final DoFn<Long, KV<Long, Long>> pairWithExact =
        new DoFn<Long, KV<Long, Long>>() {
          @Override
          public void processElement(ProcessContext c) {
            c.output(KV.of(c.element(), c.sideInput(exact)));
          }
        };

    final PCollection<KV<Long, Long>> estimateAndExact =
        estimate.apply(ParDo.of(pairWithExact).withSideInputs(exact));

    DataflowAssert.that(estimateAndExact).satisfies(new VerifyEstimatePerKeyFn(sampleSize));

    pipeline.run();
  }
Beispiel #7
0
  /**
   * Exercises {@code Top.of}, {@code Top.largest}, {@code Top.smallest} and their per-key variants
   * on {@code COLLECTION} and on the table built by {@code createInputTable}.
   */
  @Test
  @SuppressWarnings("unchecked")
  public void testTop() {
    final Pipeline pipeline = TestPipeline.create();
    final PCollection<String> strings =
        pipeline.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

    // Global variants.
    final PCollection<List<String>> longest = strings.apply(Top.of(1, new OrderByLength()));
    final PCollection<List<String>> largestTwo = strings.apply(Top.<String>largest(2));
    final PCollection<List<String>> smallestThree = strings.apply(Top.<String>smallest(3));

    // Per-key variants.
    final PCollection<KV<String, Integer>> table = createInputTable(pipeline);
    final PCollection<KV<String, List<Integer>>> largestTwoPerKey =
        table.apply(Top.<String, Integer>largestPerKey(2));
    final PCollection<KV<String, List<Integer>>> smallestTwoPerKey =
        table.apply(Top.<String, Integer>smallestPerKey(2));

    DataflowAssert.thatSingletonIterable(longest).containsInAnyOrder(Arrays.asList("bb"));
    DataflowAssert.thatSingletonIterable(largestTwo).containsInAnyOrder("z", "c");
    DataflowAssert.thatSingletonIterable(smallestThree).containsInAnyOrder("a", "bb", "c");
    DataflowAssert.that(largestTwoPerKey)
        .containsInAnyOrder(KV.of("a", Arrays.asList(3, 2)), KV.of("b", Arrays.asList(100, 10)));
    DataflowAssert.that(smallestTwoPerKey)
        .containsInAnyOrder(KV.of("a", Arrays.asList(1, 2)), KV.of("b", Arrays.asList(1, 10)));

    pipeline.run();
  }
Beispiel #8
0
    /**
     * Reads every document in {@code uris} with its own {@code TextIO.Read} and flattens the
     * results into a single collection of lines, each keyed by the URI it was read from.
     *
     * @param input used only to obtain the pipeline the reads are applied to
     * @return one {@code KV<URI, String>} per line of every document
     */
    @Override
    public PCollection<KV<URI, String>> apply(PInput input) {
      Pipeline pipeline = input.getPipeline();

      // Create one TextIO.Read transform for each document
      // and add its output to a PCollectionList
      PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline);

      // TextIO.Read supports:
      //  - file: URIs and paths locally
      //  - gs: URIs on the service
      for (final URI uri : uris) {
        // Constant-first comparison: URI.getScheme() returns null for a relative (scheme-less)
        // URI, which would otherwise throw a NullPointerException here.
        String uriString;
        if ("file".equals(uri.getScheme())) {
          uriString = new File(uri).getPath();
        } else {
          uriString = uri.toString();
        }

        PCollection<KV<URI, String>> oneUriToLines =
            pipeline
                .apply(TextIO.Read.from(uriString).named("TextIO.Read(" + uriString + ")"))
                .apply(WithKeys.<URI, String>of(uri).setName("WithKeys(" + uriString + ")"));

        urisToLines = urisToLines.and(oneUriToLines);
      }

      return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
    }
Beispiel #9
0
  /**
   * Applies the global and per-key {@code Top} transforms to empty inputs and asserts that every
   * result is empty.
   */
  @Test
  @SuppressWarnings("unchecked")
  public void testTopEmpty() {
    final Pipeline pipeline = TestPipeline.create();
    final PCollection<String> noStrings =
        pipeline.apply(
            Create.of(Arrays.asList(EMPTY_COLLECTION)).withCoder(StringUtf8Coder.of()));

    // Global variants.
    final PCollection<List<String>> longest = noStrings.apply(Top.of(1, new OrderByLength()));
    final PCollection<List<String>> largestTwo = noStrings.apply(Top.<String>largest(2));
    final PCollection<List<String>> smallestThree = noStrings.apply(Top.<String>smallest(3));

    // Per-key variants.
    final PCollection<KV<String, Integer>> emptyTable = createEmptyInputTable(pipeline);
    final PCollection<KV<String, List<Integer>>> largestTwoPerKey =
        emptyTable.apply(Top.<String, Integer>largestPerKey(2));
    final PCollection<KV<String, List<Integer>>> smallestTwoPerKey =
        emptyTable.apply(Top.<String, Integer>smallestPerKey(2));

    DataflowAssert.thatSingletonIterable(longest).empty();
    DataflowAssert.thatSingletonIterable(largestTwo).empty();
    DataflowAssert.thatSingletonIterable(smallestThree).empty();
    DataflowAssert.that(largestTwoPerKey).empty();
    DataflowAssert.that(smallestTwoPerKey).empty();

    pipeline.run();
  }
  /**
   * Checks the output collection's name for an unnamed {@code TextIO.Read} and for reads named
   * before and after {@code from(...)}. The pipeline is only constructed, never run.
   */
  @Test
  public void testReadNamed() {
    final Pipeline pipeline = TestPipeline.create();

    // No explicit name: the transform's default name is used.
    final PCollection<String> defaultNamed = pipeline.apply(TextIO.Read.from("/tmp/file.txt"));
    assertEquals("TextIO.Read.out", defaultNamed.getName());

    // Name given before the source.
    final PCollection<String> namedFirst =
        pipeline.apply(TextIO.Read.named("MyRead").from("/tmp/file.txt"));
    assertEquals("MyRead.out", namedFirst.getName());

    // Name given after the source.
    final PCollection<String> namedLast =
        pipeline.apply(TextIO.Read.from("/tmp/file.txt").named("HerRead"));
    assertEquals("HerRead.out", namedLast.getName());
  }
Beispiel #11
0
  /**
   * Asserts that applying {@code Top.of} with a negative count fails with an
   * {@code IllegalArgumentException} whose message contains {@code ">= 0"}.
   */
  @Test
  public void testCountConstraint() {
    final Pipeline pipeline = TestPipeline.create();
    final PCollection<String> strings =
        pipeline.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

    // The expectation has to be registered before the call that throws.
    expectedEx.expect(IllegalArgumentException.class);
    expectedEx.expectMessage(Matchers.containsString(">= 0"));

    final int negativeCount = -1;
    strings.apply(Top.of(negativeCount, new OrderByLength()));
  }
  /**
   * Recursive wildcards ("**") are not supported: running a pipeline that reads from such a
   * pattern must fail with an {@code IllegalArgumentException} mentioning "wildcard".
   */
  @Test
  public void testBadWildcardRecursive() throws Exception {
    final String recursivePattern = "gs://bucket/foo**/baz";
    final Pipeline p = TestPipeline.create();

    p.apply(TextIO.Read.from(recursivePattern));

    // The failure is expected from run(), so the expectation is registered just before it.
    expectedException.expect(IllegalArgumentException.class);
    expectedException.expectMessage("wildcard");
    p.run();
  }
  /**
   * Runs {@code AddContextDataToRead.add} against mocked variant and reference sources and
   * compares its output with the expected read/{@code ReadContextData} pairs.
   *
   * <p>The variants source mock returns {@code variantList}; the reference source mock returns
   * {@code FakeReferenceSource.bases(i)} for each interval {@code i} in {@code intervals}.
   * {@code kvReadRefBases} and {@code kvReadiVariant} are part of the provider's row but are not
   * used here.
   */
  @Test(dataProvider = "bases")
  public void fullTest(
      List<GATKRead> reads,
      List<Variant> variantList,
      List<KV<GATKRead, ReferenceBases>> kvReadRefBases,
      List<KV<GATKRead, ReadContextData>> kvReadContextData,
      List<SimpleInterval> intervals,
      List<KV<GATKRead, Iterable<Variant>>> kvReadiVariant) {
    final Pipeline pipeline = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(pipeline);

    final PCollection<GATKRead> readsInput =
        DataflowTestUtils.pCollectionCreateAndVerify(pipeline, reads, new GATKReadCoder());

    // Variants: a mocked source that hands back the provided list as a PCollection.
    final PCollection<Variant> variantsInput = pipeline.apply(Create.of(variantList));
    final VariantsDataflowSource variantsSource = mock(VariantsDataflowSource.class);

    when(variantsSource.getAllVariants()).thenReturn(variantsInput);

    // Reference: a serializable mock that answers each known interval with fake bases.
    final RefAPISource referenceSource = mock(RefAPISource.class, withSettings().serializable());
    for (final SimpleInterval interval : intervals) {
      when(referenceSource.getReferenceBases(
              any(PipelineOptions.class), any(RefAPIMetadata.class), eq(interval)))
          .thenReturn(FakeReferenceSource.bases(interval));
    }

    final String referenceName = "refName";
    final String refId = "0xbjfjd23f";
    final Map<String, String> nameToId = Maps.newHashMap();
    nameToId.put(referenceName, refId);
    final RefAPIMetadata refAPIMetadata = new RefAPIMetadata(referenceName, nameToId);

    // The mock is installed through the static setter instead of being passed to add().
    RefAPISource.setRefAPISource(referenceSource);
    final PCollection<KV<GATKRead, ReadContextData>> actual =
        AddContextDataToRead.add(readsInput, refAPIMetadata, variantsSource);

    final PCollection<KV<GATKRead, ReadContextData>> expected =
        pipeline.apply(
            Create.of(kvReadContextData)
                .withCoder(KvCoder.of(new GATKReadCoder(), new ReadContextDataCoder())));
    DataflowTestUtils.keyReadContextDataMatcher(actual, expected);
    pipeline.run();
  }
 /**
  * Builds a large {@code PCollection<String>} holding 1000 back-to-back copies of
  * {@code TestUtils.LINES}.
  */
 private PCollection<String> readPCollection(Pipeline p) {
   // TODO: Read PCollection from a set of text files.
   final List<String> onePage = TestUtils.LINES;
   final int pageCount = 1000;
   final int expectedSize = pageCount * onePage.size();

   final List<String> allLines = new ArrayList<>(expectedSize);
   int remaining = pageCount;
   while (remaining-- > 0) {
     allLines.addAll(onePage);
   }
   assert allLines.size() == expectedSize;

   return p.apply(Create.of(allLines));
 }
Beispiel #15
0
  /**
   * Purely a compile-time test: if the two {@code Top.perKey} applications below compile with
   * their respective comparator types, the test has done its job. The pipeline is never run.
   */
  @Test
  public void testPerKeySerializabilityRequirement() {
    final Pipeline pipeline = TestPipeline.create();
    pipeline.apply(
        "CreateCollection", Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

    final PCollection<KV<String, Integer>> table = createInputTable(pipeline);

    // First comparator type, applied under the transform's default name.
    table.apply(Top.<String, Integer, IntegerComparator>perKey(1, new IntegerComparator()));

    // Second comparator type, applied under an explicit name.
    table.apply(
        "PerKey2", Top.<String, Integer, IntegerComparator2>perKey(1, new IntegerComparator2()));
  }
Beispiel #16
0
  /**
   * Entry point: parses and validates {@code args} into {@code Options}, registers a string-based
   * coder for {@code URI}, then reads the input documents, computes TF-IDF and writes the result
   * to the configured output.
   */
  public static void main(String[] args) throws Exception {
    final Options opts = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    final Pipeline p = Pipeline.create(opts);

    // URI keys travel through the pipeline, so they need a coder.
    p.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

    p.apply(new ReadDocuments(listInputDocuments(opts)))
        .apply(new ComputeTfIdf())
        .apply(new WriteTfIdf(opts.getOutput()));

    p.run();
  }
  /**
   * With four elements of which three are distinct and a sample size of 1000,
   * {@code ApproximateUnique.globally} is expected to report exactly 3.
   */
  @Test
  @Category(RunnableOnService.class)
  public void testApproximateUniqueWithSmallInput() {
    final Pipeline pipeline = TestPipeline.create();

    final PCollection<Integer> numbers = pipeline.apply(Create.of(Arrays.asList(1, 2, 3, 3)));
    final PCollection<Long> distinctEstimate =
        numbers.apply(ApproximateUnique.<Integer>globally(1000));

    DataflowAssert.thatSingleton(distinctEstimate).isEqualTo(3L);

    pipeline.run();
  }
  /**
   * Recursive wildcards ("**") are not supported: translating a pipeline that reads from such a
   * pattern must fail with an {@code IllegalArgumentException} saying "Unsupported wildcard usage".
   */
  @Test
  public void testBadWildcardRecursive() throws Exception {
    final DataflowPipelineOptions pipelineOptions = buildPipelineOptions();
    final Pipeline p = DataflowPipeline.create(pipelineOptions);
    final DataflowPipelineTranslator translator =
        DataflowPipelineTranslator.fromOptions(pipelineOptions);

    final String recursivePattern = "gs://bucket/foo**/baz";
    p.apply(TextIO.Read.from(recursivePattern));

    // The failure is expected from translate(), so the expectation is registered just before it.
    thrown.expect(IllegalArgumentException.class);
    thrown.expectMessage("Unsupported wildcard usage");
    translator.translate(p, Collections.<DataflowPackage>emptyList());
  }
  /**
   * Sets up and starts the injector pipeline: reads the configured input text file and publishes
   * it to the configured output topic with {@code PubsubFileInjector}, using at most 20 parallel
   * calls per bundle.
   */
  public static void main(String[] args) {
    final PubsubFileInjectorOptions opts =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(PubsubFileInjectorOptions.class);

    final Pipeline injector = Pipeline.create(opts);

    injector
        .apply(TextIO.Read.from(opts.getInput()))
        .apply(
            IntraBundleParallelization.of(PubsubFileInjector.publish(opts.getOutputTopic()))
                .withMaxParallelism(20));

    injector.run();
  }
  /**
   * Asserts that {@code TextIO.Write.to} rejects an output name containing a disallowed component
   * ({@code "output@5"}) with an {@code IllegalArgumentException}.
   */
  @Test
  public void testUnsupportedFilePattern() throws IOException {
    final File outputDir = tmpFolder.newFolder();
    // Windows doesn't like resolving paths with * in them.
    final String badOutputName = outputDir.toPath().resolve("output@5").toString();

    final Pipeline pipeline = TestPipeline.create();
    final PCollection<String> lines =
        pipeline.apply(Create.of(Arrays.asList(LINES_ARRAY)).withCoder(StringUtf8Coder.of()));

    // The expectation has to be registered before the call that throws.
    expectedException.expect(IllegalArgumentException.class);
    expectedException.expectMessage("Output name components are not allowed to contain");
    lines.apply(TextIO.Write.to(badOutputName));
  }
 /**
  * Runs the batch injector for the streaming pipeline.
  *
  * <p>The injector is a separate, non-streaming pipeline built from a copy of this object's
  * options, with "-injector" appended to the job name. It reads {@code inputFile} and publishes
  * its contents to the Google Cloud Pub/Sub {@code topic}. The resulting job is added to
  * {@code jobsToCancel}.
  */
 public void runInjectorPipeline(String inputFile, String topic) {
   // Derive the injector's options from the main pipeline's options.
   final DataflowPipelineOptions injectorOptions = options.cloneAs(DataflowPipelineOptions.class);
   injectorOptions.setStreaming(false);
   final ExamplePubsubTopicOptions topicOptions = options.as(ExamplePubsubTopicOptions.class);
   injectorOptions.setNumWorkers(topicOptions.getInjectorNumWorkers());
   injectorOptions.setJobName(options.getJobName() + "-injector");

   final Pipeline injector = Pipeline.create(injectorOptions);
   injector
       .apply(TextIO.Read.from(inputFile))
       .apply(
           IntraBundleParallelization.of(PubsubFileInjector.publish(topic))
               .withMaxParallelism(20));

   jobsToCancel.add((DataflowPipelineJob) injector.run());
 }
  /**
   * Configures and runs a streaming word count on the Flink runner: reads strings from a Kafka
   * topic, extracts words, groups them into fixed windows of {@code getWindowSize()} seconds,
   * counts each word per window and writes the formatted counts to {@code ./outputKafka.txt}.
   */
  public static void main(String[] args) {
    PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
    final KafkaStreamingWordCountOptions opts =
        PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);

    // Fixed job settings: streaming on Flink with checkpointing and retries.
    opts.setJobName("KafkaExample - WindowSize: " + opts.getWindowSize() + " seconds");
    opts.setStreaming(true);
    opts.setCheckpointingInterval(1000L);
    opts.setNumberOfExecutionRetries(5);
    opts.setExecutionRetryDelay(3000L);
    opts.setRunner(FlinkPipelineRunner.class);

    // Echo the Kafka connection settings.
    final String connectionSummary =
        opts.getKafkaTopic()
            + " "
            + opts.getZookeeper()
            + " "
            + opts.getBroker()
            + " "
            + opts.getGroup();
    System.out.println(connectionSummary);
    final Pipeline pipeline = Pipeline.create(opts);

    final Properties kafkaProps = new Properties();
    kafkaProps.setProperty("zookeeper.connect", opts.getZookeeper());
    kafkaProps.setProperty("bootstrap.servers", opts.getBroker());
    kafkaProps.setProperty("group.id", opts.getGroup());

    // The Flink consumer that feeds the program with records from the Kafka topic.
    final FlinkKafkaConsumer08<String> consumer =
        new FlinkKafkaConsumer08<>(opts.getKafkaTopic(), new SimpleStringSchema(), kafkaProps);

    final PCollection<String> windowedWords =
        pipeline
            .apply(Read.from(new UnboundedFlinkSource<>(consumer)).named("StreamingWordCount"))
            .apply(ParDo.of(new ExtractWordsFn()))
            .apply(
                Window.<String>into(
                        FixedWindows.of(Duration.standardSeconds(opts.getWindowSize())))
                    .triggering(AfterWatermark.pastEndOfWindow())
                    .withAllowedLateness(Duration.ZERO)
                    .discardingFiredPanes());

    final PCollection<KV<String, Long>> countsPerWord =
        windowedWords.apply(Count.<String>perElement());

    countsPerWord
        .apply(ParDo.of(new FormatAsStringFn()))
        .apply(TextIO.Write.to("./outputKafka.txt"));

    pipeline.run();
  }
Beispiel #23
0
  /**
   * Asserts that applying {@code Top.of} to an empty input windowed into 10-day fixed windows
   * fails with an {@code IllegalStateException} whose message mentions "Top", "GlobalWindows",
   * "withoutDefaults" and "asSingletonView".
   */
  @Test
  public void testTopEmptyWithIncompatibleWindows() {
    final Pipeline pipeline = TestPipeline.create();
    final Bound<String> tenDayWindows =
        Window.<String>into(FixedWindows.of(Duration.standardDays(10L)));
    final PCollection<String> windowedEmpty =
        pipeline
            .apply(
                Create.timestamped(Collections.<String>emptyList(), Collections.<Long>emptyList()))
            .apply(tenDayWindows);

    // All expectations have to be registered before the call that throws.
    expectedEx.expect(IllegalStateException.class);
    expectedEx.expectMessage("Top");
    expectedEx.expectMessage("GlobalWindows");
    expectedEx.expectMessage("withoutDefaults");
    expectedEx.expectMessage("asSingletonView");

    windowedEmpty.apply(Top.of(1, new OrderByLength()));
  }
  /**
   * Builds {@code elementCount} doubles that cycle through {@code uniqueCount} distinct values,
   * shuffles them, and checks the {@code ApproximateUnique.globally(sampleSize)} estimate with
   * {@code VerifyEstimateFn(uniqueCount, sampleSize)}.
   */
  private void runApproximateUniqueWithDuplicates(
      int elementCount, int uniqueCount, int sampleSize) {

    assert elementCount >= uniqueCount;

    // Element i is 1/(i mod uniqueCount + 1), so the values repeat with period uniqueCount.
    final List<Double> values = Lists.newArrayList();
    int index = 0;
    while (index < elementCount) {
      values.add(1.0 / (index % uniqueCount + 1));
      index++;
    }
    Collections.shuffle(values);

    final Pipeline pipeline = TestPipeline.create();
    final PCollection<Long> distinctEstimate =
        pipeline.apply(Create.of(values)).apply(ApproximateUnique.<Double>globally(sampleSize));

    DataflowAssert.thatSingleton(distinctEstimate)
        .satisfies(new VerifyEstimateFn(uniqueCount, sampleSize));

    pipeline.run();
  }
  /**
   * Writes {@code elems} through an unsharded {@code TextIO.Write} using {@code coder} and asserts
   * that the output file holds exactly one line per encoded element, in any order.
   *
   * <p>Both the file contents and the expected lines are decoded as UTF-8 rather than with the
   * platform default charset, so the comparison does not depend on the JVM's locale settings.
   *
   * @param elems the elements to write
   * @param coder the coder used by the write and to compute the expected lines
   * @throws Exception if the temporary file cannot be created or read, or encoding fails
   */
  <T> void runTestWrite(T[] elems, Coder<T> coder) throws Exception {
    File tmpFile = tmpFolder.newFile("file.txt");
    String filename = tmpFile.getPath();

    Pipeline p = TestPipeline.create();

    PCollection<T> input = p.apply(Create.of(Arrays.asList(elems)).withCoder(coder));

    TextIO.Write.Bound<T> write;
    if (coder.equals(StringUtf8Coder.of())) {
      TextIO.Write.Bound<String> writeStrings = TextIO.Write.to(filename).withoutSharding();
      // T==String
      @SuppressWarnings("unchecked")
      TextIO.Write.Bound<T> castWrite = (TextIO.Write.Bound<T>) writeStrings;
      write = castWrite;
    } else {
      write = TextIO.Write.to(filename).withCoder(coder).withoutSharding();
    }

    input.apply(write);

    p.run();

    // Read the file back with an explicit charset; FileReader would use the platform default.
    List<String> actual = new ArrayList<>();
    try (BufferedReader reader =
        new BufferedReader(
            new java.io.InputStreamReader(
                new java.io.FileInputStream(tmpFile), java.nio.charset.StandardCharsets.UTF_8))) {
      String line;
      while ((line = reader.readLine()) != null) {
        actual.add(line);
      }
    }

    // The expected line for each element is its encoded form, decoded with the same charset.
    String[] expected = new String[elems.length];
    for (int i = 0; i < elems.length; i++) {
      byte[] encodedElem = CoderUtils.encodeToByteArray(coder, elems[i]);
      expected[i] = new String(encodedElem, java.nio.charset.StandardCharsets.UTF_8);
    }

    assertThat(actual, containsInAnyOrder(expected));
  }
Beispiel #26
0
  /**
   * Writes a small set of reads to a file. The reads need not be sorted on input; the BAM file
   * will be. All the reads must fit into a single worker's memory, so this is unsuitable for
   * large inputs.
   *
   * <p>When {@code destPath} is a Hadoop URL, or the pipeline's runner is
   * {@code SparkPipelineRunner}, the work is delegated to {@code writeToHadoop}. Otherwise the
   * reads are passed as an iterable side input to a single {@code SaveToBAMFile} invocation whose
   * main input is the destination path.
   *
   * @param pipeline the pipeline to add this operation to.
   * @param reads the reads to write (they don't need to be sorted).
   * @param header the header that corresponds to the reads.
   * @param destPath the GCS or local path to write to (must start with "gs://" if writing to GCS).
   * @param parquet whether to write out BAM or Parquet data (BDG AlignmentRecords); only applies
   *     when writing to Hadoop
   */
  public static void writeToFile(
      Pipeline pipeline,
      PCollection<GATKRead> reads,
      final SAMFileHeader header,
      final String destPath,
      final boolean parquet) {
    // Short-circuit on purpose: the runner is only inspected when the path is not a Hadoop URL.
    final boolean viaHadoop =
        BucketUtils.isHadoopUrl(destPath)
            || pipeline.getRunner().getClass().equals(SparkPipelineRunner.class);
    if (viaHadoop) {
      writeToHadoop(pipeline, reads, header, destPath, parquet);
      return;
    }

    final PCollectionView<Iterable<GATKRead>> allReads =
        reads.apply(View.<GATKRead>asIterable());

    // A one-element collection holding the destination path drives the writing step.
    final PCollection<String> destination =
        pipeline.apply("output file name", Create.<String>of(destPath));

    destination.apply(
        ParDo.named("save to BAM file")
            .withSideInputs(allReads)
            .of(new SaveToBAMFile(header, allReads)));
  }
  /**
   * Writes {@code LINES_ARRAY} with two shards and a ".txt" suffix and asserts that the output
   * folder contains exactly the two expected shard files.
   */
  @Test
  public void testWriteSharded() throws IOException {
    final File outputDir = tmpFolder.newFolder();
    final String outputPrefix = outputDir.toPath().resolve("output").toString();

    final Pipeline pipeline = TestPipeline.create();
    final PCollection<String> lines =
        pipeline.apply(Create.of(Arrays.asList(LINES_ARRAY)).withCoder(StringUtf8Coder.of()));

    lines.apply(TextIO.Write.to(outputPrefix).withNumShards(2).withSuffix(".txt"));

    pipeline.run();

    final String[] producedFiles = outputDir.list();

    assertThat(
        Arrays.asList(producedFiles),
        containsInAnyOrder("output-00000-of-00002.txt", "output-00001-of-00002.txt"));
  }
  /**
   * For each key in {20, 50, 100}, creates 1000 values with exactly "key" distinct values, then
   * checks the {@code ApproximateUnique.perKey(100)} estimates with
   * {@code VerifyEstimatePerKeyFn}.
   */
  @Test
  public void testApproximateUniquePerKey() {
    final int elementCount = 1000;
    final int sampleSize = 100;
    final List<Long> keys = ImmutableList.of(20L, 50L, 100L);

    // Each key doubles as the number of unique values stored under it.
    final List<KV<Long, Long>> keyedValues = Lists.newArrayList();
    for (long uniqueCount : keys) {
      long value = 0;
      while (value < elementCount) {
        keyedValues.add(KV.of(uniqueCount, value % uniqueCount));
        value++;
      }
    }

    final Pipeline pipeline = TestPipeline.create();
    final PCollection<KV<Long, Long>> estimates =
        pipeline
            .apply(Create.of(keyedValues))
            .apply(ApproximateUnique.<Long, Long>perKey(sampleSize));

    DataflowAssert.that(estimates).satisfies(new VerifyEstimatePerKeyFn(sampleSize));

    pipeline.run();
  }
  /**
   * Builds a skewed (Zipf-like) multiset of the integers {@code 0 .. uniqueCount - 1} and checks
   * the {@code ApproximateUnique.globally(sampleSize)} estimate with
   * {@code VerifyEstimateFn(uniqueCount, sampleSize)}.
   *
   * <p>Element {@code k} has rank {@code k + 1} and occurs
   * {@code max(1, round(maxCount * (k + 1)^-s))} times, where
   * {@code s = 1 - uniqueCount / elementCount} and {@code maxCount = uniqueCount^s}.
   *
   * @param elementCount the approximate total number of elements to generate
   * @param uniqueCount the exact number of distinct elements
   * @param sampleSize the sample size passed to {@code ApproximateUnique}
   */
  private void runApproximateUniqueWithSkewedDistributions(
      int elementCount, final int uniqueCount, final int sampleSize) {
    List<Integer> elements = Lists.newArrayList();
    // Zipf distribution with approximately elementCount items.
    double s = 1 - 1.0 * uniqueCount / elementCount;
    double maxCount = Math.pow(uniqueCount, s);
    for (int k = 0; k < uniqueCount; k++) {
      // Zipf ranks are 1-based. Using k itself made the first term Math.pow(0, -s) = Infinity,
      // which only ended up as a count of 1 through long-to-int narrowing overflow.
      int count = Math.max(1, (int) Math.round(maxCount * Math.pow(k + 1, -s)));
      // Element k occurs count times.
      for (int c = 0; c < count; c++) {
        elements.add(k);
      }
    }

    Pipeline p = TestPipeline.create();
    PCollection<Integer> input = p.apply(Create.of(elements));
    PCollection<Long> estimate = input.apply(ApproximateUnique.<Integer>globally(sampleSize));

    DataflowAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize));

    p.run();
  }
 /** Applies a {@code TextIO.Read} from {@code path} to the pipeline under the name "Read(path)". */
 private void applyRead(Pipeline pipeline, String path) {
   final String transformName = "Read(" + path + ")";
   pipeline.apply(transformName, TextIO.Read.from(path));
 }