Example #1
  @Test
  @SuppressWarnings("unchecked")
  public void testTopEmpty() {
    Pipeline p = TestPipeline.create();
    PCollection<String> input =
        p.apply(Create.of(Arrays.asList(EMPTY_COLLECTION)).withCoder(StringUtf8Coder.of()));

    PCollection<List<String>> top1 = input.apply(Top.of(1, new OrderByLength()));
    PCollection<List<String>> top2 = input.apply(Top.<String>largest(2));
    PCollection<List<String>> top3 = input.apply(Top.<String>smallest(3));

    PCollection<KV<String, Integer>> inputTable = createEmptyInputTable(p);
    PCollection<KV<String, List<Integer>>> largestPerKey =
        inputTable.apply(Top.<String, Integer>largestPerKey(2));
    PCollection<KV<String, List<Integer>>> smallestPerKey =
        inputTable.apply(Top.<String, Integer>smallestPerKey(2));

    DataflowAssert.thatSingletonIterable(top1).empty();
    DataflowAssert.thatSingletonIterable(top2).empty();
    DataflowAssert.thatSingletonIterable(top3).empty();
    DataflowAssert.that(largestPerKey).empty();
    DataflowAssert.that(smallestPerKey).empty();

    p.run();
  }
  /**
   * Applies {@code ApproximateUnique(sampleSize)}, verifying that the estimation error falls within
   * the maximum allowed error of {@code 2/sqrt(sampleSize)}.
   */
  private void runApproximateUniquePipeline(int sampleSize) {
    Pipeline p = TestPipeline.create();
    PCollection<String> collection = readPCollection(p);

    final PCollectionView<Long> exact =
        collection
            .apply(RemoveDuplicates.<String>create())
            .apply(Combine.globally(new CountElements<String>()))
            .apply(View.<Long>asSingleton());

    PCollection<Long> approximate =
        collection.apply(ApproximateUnique.<String>globally(sampleSize));

    PCollection<KV<Long, Long>> approximateAndExact =
        approximate.apply(
            ParDo.of(
                    new DoFn<Long, KV<Long, Long>>() {
                      @Override
                      public void processElement(ProcessContext c) {
                        c.output(KV.of(c.element(), c.sideInput(exact)));
                      }
                    })
                .withSideInputs(exact));

    DataflowAssert.that(approximateAndExact).satisfies(new VerifyEstimatePerKeyFn(sampleSize));

    p.run();
  }
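For reference, a verifier like the VerifyEstimatePerKeyFn used above could simply assert that each (estimate, exact) pair produced by the ParDo stays within the 2/sqrt(sampleSize) bound described in the Javadoc. The sketch below is hypothetical: the class name VerifyEstimateErrorFn and the use of JUnit's Assert are assumptions, not the SDK's actual implementation.
  // Hypothetical sketch of an error-bound verifier for DataflowAssert.satisfies().
  static class VerifyEstimateErrorFn
      implements SerializableFunction<Iterable<KV<Long, Long>>, Void> {
    private final int sampleSize;

    VerifyEstimateErrorFn(int sampleSize) {
      this.sampleSize = sampleSize;
    }

    @Override
    public Void apply(Iterable<KV<Long, Long>> estimateAndExactPairs) {
      // Maximum allowed relative error, per the Javadoc above.
      double maxError = 2.0 / Math.sqrt(sampleSize);
      for (KV<Long, Long> pair : estimateAndExactPairs) {
        double error = Math.abs(pair.getKey() - pair.getValue()) / (double) pair.getValue();
        Assert.assertTrue("estimation error " + error + " exceeds " + maxError, error <= maxError);
      }
      return null;
    }
  }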
Example #3
  @Test
  @SuppressWarnings("unchecked")
  public void testTop() {
    Pipeline p = TestPipeline.create();
    PCollection<String> input =
        p.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

    PCollection<List<String>> top1 = input.apply(Top.of(1, new OrderByLength()));
    PCollection<List<String>> top2 = input.apply(Top.<String>largest(2));
    PCollection<List<String>> top3 = input.apply(Top.<String>smallest(3));

    PCollection<KV<String, Integer>> inputTable = createInputTable(p);
    PCollection<KV<String, List<Integer>>> largestPerKey =
        inputTable.apply(Top.<String, Integer>largestPerKey(2));
    PCollection<KV<String, List<Integer>>> smallestPerKey =
        inputTable.apply(Top.<String, Integer>smallestPerKey(2));

    DataflowAssert.thatSingletonIterable(top1).containsInAnyOrder(Arrays.asList("bb"));
    DataflowAssert.thatSingletonIterable(top2).containsInAnyOrder("z", "c");
    DataflowAssert.thatSingletonIterable(top3).containsInAnyOrder("a", "bb", "c");
    DataflowAssert.that(largestPerKey)
        .containsInAnyOrder(KV.of("a", Arrays.asList(3, 2)), KV.of("b", Arrays.asList(100, 10)));
    DataflowAssert.that(smallestPerKey)
        .containsInAnyOrder(KV.of("a", Arrays.asList(1, 2)), KV.of("b", Arrays.asList(1, 10)));

    p.run();
  }
  @Test(dataProvider = "reads")
  public void testGATKReadCoding(final List<GATKRead> reads) {
    // The simplest way to figure out if a class is coded correctly is to create a PCollection
    // of that type and see if it matches the List version.
    final Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);

    // Need to explicitly set the coder to GATKReadCoder, otherwise Create fails to infer
    // a coder properly in the case where the List contains a mix of different GATKRead
    // implementations.
    final PCollection<GATKRead> dataflowReads =
        p.apply(Create.of(reads).withCoder(new GATKReadCoder()));
    DataflowAssert.that(dataflowReads).containsInAnyOrder(reads);

    final PCollection<GATKRead> dataflowReadsAfterTransform =
        dataflowReads
            .apply(
                ParDo.of(
                    new DoFn<GATKRead, GATKRead>() {
                      private static final long serialVersionUID = 1L;

                      @Override
                      public void processElement(ProcessContext c) throws Exception {
                        c.output(c.element());
                      }
                    }))
            .setCoder(new GATKReadCoder());
    DataflowAssert.that(dataflowReadsAfterTransform).containsInAnyOrder(reads);

    p.run();
  }
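The coder round trip hinted at in the comment above can also be checked without running a pipeline. The helper below is a hypothetical sketch (not part of the original test) using the SDK's CoderUtils, the same utility runTestWrite relies on later in this listing; it assumes the GATKRead implementations define a meaningful equals().
  // Hypothetical helper: encode/decode one read through GATKReadCoder and check
  // that it survives the round trip.
  private static void assertReadSurvivesCoding(final GATKRead read) throws CoderException {
    final byte[] encoded = CoderUtils.encodeToByteArray(new GATKReadCoder(), read);
    final GATKRead decoded = CoderUtils.decodeFromByteArray(new GATKReadCoder(), encoded);
    Assert.assertEquals(read, decoded);
  }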
  /**
   * Merge the statistics from each block. The resulting "collection" contains a single element,
   * with the answer.
   */
  private static PCollection<RecalibrationTables> aggregateStatistics(
      final PCollection<RecalibrationTables> tables) {
    return tables
        // aggregate
        .apply(Combine.globally(new RecalibrationTablesMerger()))
        // call finalize on the result
        .apply(
            ParDo.named("finalizeRecalTables")
                .of(
                    new DoFnWLog<RecalibrationTables, RecalibrationTables>("finalizeRecalTables") {
                      private static final long serialVersionUID = 1L;

                      @Override
                      public void processElement(ProcessContext c) throws Exception {
                        RecalibrationTables tables = c.element();
                        if (null == tables) {
                          // The merger may return null when there are no inputs at all. In that
                          // case we don't want to crash (though it's really an edge case).
                          log.warn("No recalibration tables!");
                        } else {
                          // normal case: recalibrate
                          BaseRecalibrationEngine.finalizeRecalibrationTables(tables);
                        }
                        c.output(tables);
                      }
                    }));
  }
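To make the single-element behavior concrete, here is a hypothetical, toy use of Combine.globally on integers; the PCollection named numbers is made up and nothing here comes from the GATK code. The output PCollection holds exactly one element, just as the merged RecalibrationTables above does.
  // Toy illustration only: summing a PCollection<Integer> globally yields a
  // PCollection<Integer> containing a single element.
  PCollection<Integer> total =
      numbers.apply(
          Combine.globally(
              new SerializableFunction<Iterable<Integer>, Integer>() {
                @Override
                public Integer apply(Iterable<Integer> values) {
                  int sum = 0;
                  for (Integer value : values) {
                    sum += value;
                  }
                  return sum;
                }
              }));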
Example #6
 @Override
 public PCollectionView<Iterable<T>> apply(PCollection<T> input) {
   return input.apply(
       CreatePCollectionView.<T, Iterable<T>>of(
           PCollectionViews.iterableView(
               input.getPipeline(), input.getWindowingStrategy(), input.getCoder())));
 }
    @Override
    public PDone apply(PCollection<Integer> input) {
      // Apply an operation so that this is a composite transform.
      input.apply(Count.<Integer>perElement());

      return PDone.in(input.getPipeline());
    }
  @Test
  public void testMultiGraphPipelineSerialization() throws IOException {
    Pipeline p = DataflowPipeline.create(buildPipelineOptions());

    PCollection<Integer> input = p.begin().apply(Create.of(1, 2, 3));

    input.apply(new UnrelatedOutputCreator());
    input.apply(new UnboundOutputCreator());

    DataflowPipelineTranslator t =
        DataflowPipelineTranslator.fromOptions(
            PipelineOptionsFactory.as(DataflowPipelineOptions.class));

    // Check that translation doesn't fail.
    t.translate(p, Collections.<DataflowPackage>emptyList());
  }
    @Override
    public PCollection<Integer> apply(PCollection<Integer> input) {
      // Apply an operation so that this is a composite transform.
      input.apply(Count.<Integer>perElement());

      // Return a value unrelated to the input.
      return input.getPipeline().apply(Create.of(1, 2, 3, 4));
    }
 @Override
 public PCollection<RecalibrationTables> apply(
     PCollection<AddContextDataToReadOptimized.ContextShard> input) {
   PCollection<RecalibrationTables> oneStatPerWorker =
       input.apply(
           ParDo.named("BaseRecalibrator")
               .withSideInputs(headerView, refDictionary)
               .of(new BaseRecalibratorOptimizedFn(headerView, refDictionary, recalArgs)));
   return aggregateStatistics(oneStatPerWorker);
 }
  @Test
  public void testPartiallyBoundFailure() throws IOException {
    Pipeline p = DataflowPipeline.create(buildPipelineOptions());

    PCollection<Integer> input = p.begin().apply(Create.of(1, 2, 3));

    thrown.expect(IllegalStateException.class);
    input.apply(new PartiallyBoundOutputCreator());

    Assert.fail("Failure expected from use of partially bound output");
  }
Example #12
 @Override
 public PCollectionView<T> apply(PCollection<T> input) {
   return input.apply(
       CreatePCollectionView.<T, T>of(
           PCollectionViews.singletonView(
               input.getPipeline(),
               input.getWindowingStrategy(),
               hasDefault,
               defaultValue,
               input.getCoder())));
 }
Example #13
  @Test
  public void testCountConstraint() {
    Pipeline p = TestPipeline.create();
    PCollection<String> input =
        p.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

    expectedEx.expect(IllegalArgumentException.class);
    expectedEx.expectMessage(Matchers.containsString(">= 0"));

    input.apply(Top.of(-1, new OrderByLength()));
  }
    @Override
    public PCollectionTuple apply(PCollection<Integer> input) {
      PCollection<Integer> sum = input.apply(Sum.integersGlobally());

      // Fails here when attempting to construct a tuple with an unbound object.
      return PCollectionTuple.of(sumTag, sum)
          .and(
              doneTag,
              PCollection.<Void>createPrimitiveOutputInternal(
                  input.getPipeline(), WindowingStrategy.globalDefault(), input.isBounded()));
    }
Example #15
  /**
   * Takes a few Reads and writes them to a BAM file. The Reads don't have to be sorted
   * initially; the BAM file will be. All the reads must fit into a single worker's memory, so
   * this won't go well if you have too many.
   *
   * @param pipeline the pipeline to add this operation to.
   * @param reads the reads to write (they don't need to be sorted).
   * @param header the header that corresponds to the reads.
   * @param destPath the GCS or local path to write to (must start with "gs://" if writing to GCS).
   * @param parquet whether to write out BAM or Parquet data (BDG AlignmentRecords); only applies
   *     when writing to Hadoop
   */
  public static void writeToFile(
      Pipeline pipeline,
      PCollection<GATKRead> reads,
      final SAMFileHeader header,
      final String destPath,
      final boolean parquet) {
    if (BucketUtils.isHadoopUrl(destPath)
        || pipeline.getRunner().getClass().equals(SparkPipelineRunner.class)) {
      writeToHadoop(pipeline, reads, header, destPath, parquet);
    } else {
      PCollectionView<Iterable<GATKRead>> iterableView = reads.apply(View.<GATKRead>asIterable());

      PCollection<String> dummy = pipeline.apply("output file name", Create.<String>of(destPath));

      dummy.apply(
          ParDo.named("save to BAM file")
              .withSideInputs(iterableView)
              .of(new SaveToBAMFile(header, iterableView)));
    }
  }
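A hypothetical call site for the method above, assuming it lives in SmallBamWriter as the 4-argument call later in this listing suggests; the bucket path and variable names are placeholders.
  // Placeholder usage: write reads as BAM to GCS (the path starts with "gs://",
  // so the non-Hadoop branch above is taken), with parquet disabled.
  SmallBamWriter.writeToFile(pipeline, readsToSave, readsHeader, "gs://my-bucket/output.bam", false);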
  @Override
  protected void setupPipeline(Pipeline pipeline) {
    // Load the reads.
    final ReadsDataflowSource readsDataflowSource = new ReadsDataflowSource(bam, pipeline);
    final SAMFileHeader readsHeader = readsDataflowSource.getHeader();
    final List<SimpleInterval> intervals =
        intervalArgumentCollection.intervalsSpecified()
            ? intervalArgumentCollection.getIntervals(readsHeader.getSequenceDictionary())
            : IntervalUtils.getAllIntervalsForReference(readsHeader.getSequenceDictionary());

    final PCollectionView<SAMFileHeader> headerSingleton =
        ReadsDataflowSource.getHeaderView(pipeline, readsHeader);
    final PCollection<GATKRead> initialReads = readsDataflowSource.getReadPCollection(intervals);

    // Apply MarkDuplicates to produce updated GATKReads.
    final PCollection<GATKRead> markedReads =
        initialReads.apply(new MarkDuplicates(headerSingleton));

    // Load the Variants and the Reference and join them to reads.
    final VariantsDataflowSource variantsDataflowSource =
        new VariantsDataflowSource(baseRecalibrationKnownVariants, pipeline);

    Map<String, String> referenceNameToIdTable =
        RefAPISource.buildReferenceNameToIdTable(pipeline.getOptions(), referenceName);
    RefAPIMetadata refAPIMetadata = new RefAPIMetadata(referenceName, referenceNameToIdTable);

    final PCollection<KV<GATKRead, ReadContextData>> readsWithContext =
        AddContextDataToRead.add(markedReads, refAPIMetadata, variantsDataflowSource);

    // Apply BQSR.
    final PCollection<RecalibrationTables> recalibrationReports =
        readsWithContext.apply(new BaseRecalibratorStub(headerSingleton));
    final PCollectionView<RecalibrationTables> mergedRecalibrationReport =
        recalibrationReports.apply(View.<RecalibrationTables>asSingleton());

    final PCollection<GATKRead> finalReads =
        markedReads.apply(new ApplyBQSRStub(headerSingleton, mergedRecalibrationReport));
    SmallBamWriter.writeToFile(pipeline, finalReads, readsHeader, output);
  }
  @Test
  @Category(RunnableOnService.class)
  public void testApproximateUniqueWithSmallInput() {
    Pipeline p = TestPipeline.create();

    PCollection<Integer> input = p.apply(Create.of(Arrays.asList(1, 2, 3, 3)));

    PCollection<Long> estimate = input.apply(ApproximateUnique.<Integer>globally(1000));

    DataflowAssert.thatSingleton(estimate).isEqualTo(3L);

    p.run();
  }
Example #18
  @Override
  protected void testProgram() throws Exception {

    Pipeline p = FlinkTestPipeline.create();

    PCollection<TableRow> input1 = p.apply(Create.of(EVENT_ARRAY));
    PCollection<TableRow> input2 = p.apply(Create.of(CC_ARRAY));

    PCollection<String> output = JoinExamples.joinEvents(input1, input2);

    output.apply(TextIO.Write.to(resultPath));

    p.run();
  }
    @Override
    public PCollection<GATKRead> apply(PCollection<GATKRead> input) {
      return input.apply(
          ParDo.named("ApplyBQSR")
              .of(
                  new DoFnWLog<GATKRead, GATKRead>("ApplyBQSRStub") {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public void processElement(ProcessContext c) throws Exception {
                      c.output(c.element());
                    }
                  })
              .withSideInputs(header, recalibrationReport));
    }
Example #20
  @Test
  public void testUnsupportedFilePattern() throws IOException {
    File outFolder = tmpFolder.newFolder();
    // Windows doesn't like resolving paths with * in them.
    String filename = outFolder.toPath().resolve("output@5").toString();

    Pipeline p = TestPipeline.create();

    PCollection<String> input =
        p.apply(Create.of(Arrays.asList(LINES_ARRAY)).withCoder(StringUtf8Coder.of()));

    expectedException.expect(IllegalArgumentException.class);
    expectedException.expectMessage("Output name components are not allowed to contain");
    input.apply(TextIO.Write.to(filename));
  }
  public static void main(String[] args) {
    PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
    KafkaStreamingWordCountOptions options =
        PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
    options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
    options.setStreaming(true);
    options.setCheckpointingInterval(1000L);
    options.setNumberOfExecutionRetries(5);
    options.setExecutionRetryDelay(3000L);
    options.setRunner(FlinkPipelineRunner.class);

    System.out.println(
        options.getKafkaTopic()
            + " "
            + options.getZookeeper()
            + " "
            + options.getBroker()
            + " "
            + options.getGroup());
    Pipeline pipeline = Pipeline.create(options);

    Properties p = new Properties();
    p.setProperty("zookeeper.connect", options.getZookeeper());
    p.setProperty("bootstrap.servers", options.getBroker());
    p.setProperty("group.id", options.getGroup());

    // this is the Flink consumer that reads the input to
    // the program from a kafka topic.
    FlinkKafkaConsumer08<String> kafkaConsumer =
        new FlinkKafkaConsumer08<>(options.getKafkaTopic(), new SimpleStringSchema(), p);

    PCollection<String> words =
        pipeline
            .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
            .apply(ParDo.of(new ExtractWordsFn()))
            .apply(
                Window.<String>into(
                        FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
                    .triggering(AfterWatermark.pastEndOfWindow())
                    .withAllowedLateness(Duration.ZERO)
                    .discardingFiredPanes());

    PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

    wordCounts.apply(ParDo.of(new FormatAsStringFn())).apply(TextIO.Write.to("./outputKafka.txt"));

    pipeline.run();
  }
Example #22
  @Test
  public void testTopEmptyWithIncompatibleWindows() {
    Pipeline p = TestPipeline.create();
    Bound<String> windowingFn = Window.<String>into(FixedWindows.of(Duration.standardDays(10L)));
    PCollection<String> input =
        p.apply(Create.timestamped(Collections.<String>emptyList(), Collections.<Long>emptyList()))
            .apply(windowingFn);

    expectedEx.expect(IllegalStateException.class);
    expectedEx.expectMessage("Top");
    expectedEx.expectMessage("GlobalWindows");
    expectedEx.expectMessage("withoutDefaults");
    expectedEx.expectMessage("asSingletonView");

    input.apply(Top.of(1, new OrderByLength()));
  }
Example #23
  private static void writeToHadoop(
      Pipeline pipeline,
      PCollection<GATKRead> reads,
      final SAMFileHeader header,
      final String destPath,
      final boolean parquet) {
    if (destPath.equals("/dev/null")) {
      return;
    }

    String headerString =
        Base64.getEncoder().encodeToString(SerializableUtils.serializeToByteArray(header));

    @SuppressWarnings("unchecked")
    Class<? extends FileOutputFormat<NullWritable, SAMRecordWritable>> outputFormatClass =
        (Class<? extends FileOutputFormat<NullWritable, SAMRecordWritable>>)
            (Class<?>) TemplatedKeyIgnoringBAMOutputFormat.class;
    @SuppressWarnings("unchecked")
    HadoopIO.Write.Bound<NullWritable, SAMRecordWritable> write =
        HadoopIO.Write.to(destPath, outputFormatClass, NullWritable.class, SAMRecordWritable.class)
            .withConfigurationProperty(
                TemplatedKeyIgnoringBAMOutputFormat.SAM_HEADER_PROPERTY_NAME, headerString);

    PCollection<KV<NullWritable, SAMRecordWritable>> samReads =
        reads
            .apply(
                ParDo.of(
                    new DoFn<GATKRead, KV<NullWritable, SAMRecordWritable>>() {
                      private static final long serialVersionUID = 1L;

                      @Override
                      public void processElement(ProcessContext c) throws Exception {
                        SAMRecord samRecord = c.element().convertToSAMRecord(header);
                        SAMRecordWritable samRecordWritable = new SAMRecordWritable();
                        samRecordWritable.set(samRecord);
                        c.output(KV.of(NullWritable.get(), samRecordWritable));
                      }
                    }))
            .setCoder(
                KvCoder.of(
                    WritableCoder.of(NullWritable.class),
                    WritableCoder.of(SAMRecordWritable.class)));

    // write as a single (unsharded) file
    samReads.apply(write.withoutSharding());
  }
  private void runApproximateUniqueWithDuplicates(
      int elementCount, int uniqueCount, int sampleSize) {

    assert elementCount >= uniqueCount;
    List<Double> elements = Lists.newArrayList();
    for (int i = 0; i < elementCount; i++) {
      elements.add(1.0 / (i % uniqueCount + 1));
    }
    Collections.shuffle(elements);

    Pipeline p = TestPipeline.create();
    PCollection<Double> input = p.apply(Create.of(elements));
    PCollection<Long> estimate = input.apply(ApproximateUnique.<Double>globally(sampleSize));

    DataflowAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize));

    p.run();
  }
  /**
   * addQuantizationInfo takes the computed RecalibrationTables and adds the QuantizationInfo and
   * RequestedCovariates objects. We call this triplet "BaseRecalOutput". It contains everything we
   * need from phase 1 to continue on to phase 2 of BQSR.
   */
  private static PCollection<BaseRecalOutput> addQuantizationInfo(
      PCollectionView<SAMFileHeader> headerView,
      RecalibrationArgumentCollection recalArgs,
      PCollection<RecalibrationTables> recal) {
    return recal.apply(
        ParDo.named("addQuantizationInfo")
            .withSideInputs(headerView)
            .of(
                new DoFnWLog<RecalibrationTables, BaseRecalOutput>("addQuantizationInfo") {
                  private static final long serialVersionUID = 1L;

                  @Override
                  public void processElement(ProcessContext c) throws IOException {
                    RecalibrationTables rt = c.element();
                    SAMFileHeader header = c.sideInput(headerView);
                    // BaseRecalOutput ret = new BaseRecalOutput(rt,
                    // baseRecalibratorWorker.getQuantizationInfo(rt),
                    // baseRecalibratorWorker.getRequestedCovariates());
                    // Saving and loading back the report actually changes it. So we have to do it.
                    // TODO(issue#799): Figure out what it changes, and just do that instead of
                    // doing the whole rigamarole.
                    File temp = IOUtils.createTempFile("temp-recalibrationtable-", ".tmp");
                    if (null == rt) {
                      // special case where we have zero reads in the input. Create a valid empty
                      // report.
                      log.debug("Special case: zero reads in input.");
                      BaseRecalibrationEngine recalibrationEngine =
                          new BaseRecalibrationEngine(recalArgs, header);
                      rt = recalibrationEngine.getRecalibrationTables();
                      BaseRecalibrationEngine.finalizeRecalibrationTables(rt);
                    }
                    try {
                      BaseRecalibratorOptimizedFn.saveTextualReport(temp, header, rt, recalArgs);
                      BaseRecalOutput ret = new BaseRecalOutput(temp);
                      c.output(ret);
                    } catch (FileNotFoundException e) {
                      throw new GATKException("can't find my own temporary file", e);
                    } catch (IOException e) {
                      throw new GATKException(
                          "unable to save temporary report to " + temp.getPath(), e);
                    }
                  }
                }));
  }
Example #26
      @Override
      public PDone apply(PCollection<T> input) {
        if (filenamePrefix == null) {
          throw new IllegalStateException(
              "need to set the filename prefix of an AvroIO.Write transform");
        }
        if (schema == null) {
          throw new IllegalStateException("need to set the schema of an AvroIO.Write transform");
        }

        // Note that custom sinks currently do not expose sharding controls.
        // Thus pipeline runner writers need to individually add support internally to
        // apply user requested sharding limits.
        return input.apply(
            "Write",
            com.google.cloud.dataflow.sdk.io.Write.to(
                new AvroSink<>(
                    filenamePrefix, filenameSuffix, shardTemplate, AvroCoder.of(type, schema))));
      }
Example #27
  <T> void runTestWrite(T[] elems, Coder<T> coder) throws Exception {
    File tmpFile = tmpFolder.newFile("file.txt");
    String filename = tmpFile.getPath();

    Pipeline p = TestPipeline.create();

    PCollection<T> input = p.apply(Create.of(Arrays.asList(elems)).withCoder(coder));

    TextIO.Write.Bound<T> write;
    if (coder.equals(StringUtf8Coder.of())) {
      TextIO.Write.Bound<String> writeStrings = TextIO.Write.to(filename).withoutSharding();
      // T==String
      write = (TextIO.Write.Bound<T>) writeStrings;
    } else {
      write = TextIO.Write.to(filename).withCoder(coder).withoutSharding();
    }

    input.apply(write);

    p.run();

    List<String> actual = new ArrayList<>();
    try (BufferedReader reader = new BufferedReader(new FileReader(tmpFile))) {
      for (; ; ) {
        String line = reader.readLine();
        if (line == null) {
          break;
        }
        actual.add(line);
      }
    }

    String[] expected = new String[elems.length];
    for (int i = 0; i < elems.length; i++) {
      T elem = elems[i];
      byte[] encodedElem = CoderUtils.encodeToByteArray(coder, elem);
      String line = new String(encodedElem);
      expected[i] = line;
    }

    assertThat(actual, containsInAnyOrder(expected));
  }
Example #28
  @Test
  public void testWriteSharded() throws IOException {
    File outFolder = tmpFolder.newFolder();
    String filename = outFolder.toPath().resolve("output").toString();

    Pipeline p = TestPipeline.create();

    PCollection<String> input =
        p.apply(Create.of(Arrays.asList(LINES_ARRAY)).withCoder(StringUtf8Coder.of()));

    input.apply(TextIO.Write.to(filename).withNumShards(2).withSuffix(".txt"));

    p.run();

    String[] files = outFolder.list();

    assertThat(
        Arrays.asList(files),
        containsInAnyOrder("output-00000-of-00002.txt", "output-00001-of-00002.txt"));
  }
Example #29
  @Override
  public PCollection<KV<ReferenceShard, Iterable<GATKRead>>> apply(PCollection<GATKRead> input) {
    PCollection<KV<ReferenceShard, GATKRead>> keyReadByReferenceShard =
        input.apply(
            ParDo.of(
                    new DoFn<GATKRead, KV<ReferenceShard, GATKRead>>() {
                      private static final long serialVersionUID = 1L;

                      @Override
                      public void processElement(ProcessContext c) throws Exception {
                        // Apply our reference window function to each read before deciding which
                        // reference shard it belongs to.
                        ReferenceShard shard =
                            ReferenceShard.getShardNumberFromInterval(
                                referenceWindowFunction.apply(c.element()));
                        c.output(KV.of(shard, c.element()));
                      }
                    })
                .named("KeyReadByRefShard"));
    return keyReadByReferenceShard.apply(GroupByKey.<ReferenceShard, GATKRead>create());
  }
  private void runApproximateUniqueWithSkewedDistributions(
      int elementCount, final int uniqueCount, final int sampleSize) {
    List<Integer> elements = Lists.newArrayList();
    // Zipf distribution with approximately elementCount items.
    double s = 1 - 1.0 * uniqueCount / elementCount;
    double maxCount = Math.pow(uniqueCount, s);
    for (int k = 0; k < uniqueCount; k++) {
      int count = Math.max(1, (int) Math.round(maxCount * Math.pow(k, -s)));
      // Element k occurs count times.
      for (int c = 0; c < count; c++) {
        elements.add(k);
      }
    }

    Pipeline p = TestPipeline.create();
    PCollection<Integer> input = p.apply(Create.of(elements));
    PCollection<Long> estimate = input.apply(ApproximateUnique.<Integer>globally(sampleSize));

    DataflowAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize));

    p.run();
  }