@Test @SuppressWarnings("unchecked") public void testTop() { Pipeline p = TestPipeline.create(); PCollection<String> input = p.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of())); PCollection<List<String>> top1 = input.apply(Top.of(1, new OrderByLength())); PCollection<List<String>> top2 = input.apply(Top.<String>largest(2)); PCollection<List<String>> top3 = input.apply(Top.<String>smallest(3)); PCollection<KV<String, Integer>> inputTable = createInputTable(p); PCollection<KV<String, List<Integer>>> largestPerKey = inputTable.apply(Top.<String, Integer>largestPerKey(2)); PCollection<KV<String, List<Integer>>> smallestPerKey = inputTable.apply(Top.<String, Integer>smallestPerKey(2)); DataflowAssert.thatSingletonIterable(top1).containsInAnyOrder(Arrays.asList("bb")); DataflowAssert.thatSingletonIterable(top2).containsInAnyOrder("z", "c"); DataflowAssert.thatSingletonIterable(top3).containsInAnyOrder("a", "bb", "c"); DataflowAssert.that(largestPerKey) .containsInAnyOrder(KV.of("a", Arrays.asList(3, 2)), KV.of("b", Arrays.asList(100, 10))); DataflowAssert.that(smallestPerKey) .containsInAnyOrder(KV.of("a", Arrays.asList(1, 2)), KV.of("b", Arrays.asList(1, 10))); p.run(); }
@Test(dataProvider = "reads") public void testGATKReadCoding(final List<GATKRead> reads) { // The simplest way to figure out if a class is coded correctly is to create a PCollection // of that type and see if it matches the List version. final Pipeline p = GATKTestPipeline.create(); DataflowUtils.registerGATKCoders(p); // Need to explicitly set the coder to GATKReadCoder, otherwise Create fails to infer // a coder properly in the case where the List contains a mix of different GATKRead // implementations. final PCollection<GATKRead> dataflowReads = p.apply(Create.of(reads).withCoder(new GATKReadCoder())); DataflowAssert.that(dataflowReads).containsInAnyOrder(reads); final PCollection<GATKRead> dataflowReadsAfterTransform = dataflowReads .apply( ParDo.of( new DoFn<GATKRead, GATKRead>() { private static final long serialVersionUID = 1L; @Override public void processElement(ProcessContext c) throws Exception { c.output(c.element()); } })) .setCoder(new GATKReadCoder()); DataflowAssert.that(dataflowReadsAfterTransform).containsInAnyOrder(reads); p.run(); }
@Test @SuppressWarnings("unchecked") public void testTopEmpty() { Pipeline p = TestPipeline.create(); PCollection<String> input = p.apply(Create.of(Arrays.asList(EMPTY_COLLECTION)).withCoder(StringUtf8Coder.of())); PCollection<List<String>> top1 = input.apply(Top.of(1, new OrderByLength())); PCollection<List<String>> top2 = input.apply(Top.<String>largest(2)); PCollection<List<String>> top3 = input.apply(Top.<String>smallest(3)); PCollection<KV<String, Integer>> inputTable = createEmptyInputTable(p); PCollection<KV<String, List<Integer>>> largestPerKey = inputTable.apply(Top.<String, Integer>largestPerKey(2)); PCollection<KV<String, List<Integer>>> smallestPerKey = inputTable.apply(Top.<String, Integer>smallestPerKey(2)); DataflowAssert.thatSingletonIterable(top1).empty(); DataflowAssert.thatSingletonIterable(top2).empty(); DataflowAssert.thatSingletonIterable(top3).empty(); DataflowAssert.that(largestPerKey).empty(); DataflowAssert.that(smallestPerKey).empty(); p.run(); }
@Override public PDone apply(PCollection<Integer> input) { // Apply an operation so that this is a composite transform. input.apply(Count.<Integer>perElement()); return PDone.in(input.getPipeline()); }
@Override public PCollectionView<Iterable<T>> apply(PCollection<T> input) { return input.apply( CreatePCollectionView.<T, Iterable<T>>of( PCollectionViews.iterableView( input.getPipeline(), input.getWindowingStrategy(), input.getCoder()))); }
@Override public PCollection<T> apply(PInput input) { if (filepattern == null) { throw new IllegalStateException( "need to set the filepattern of an AvroIO.Read transform"); } if (schema == null) { throw new IllegalStateException("need to set the schema of an AvroIO.Read transform"); } if (validate) { try { checkState( !IOChannelUtils.getFactory(filepattern).match(filepattern).isEmpty(), "Unable to find any files matching %s", filepattern); } catch (IOException e) { throw new IllegalStateException(String.format("Failed to validate %s", filepattern), e); } } @SuppressWarnings("unchecked") Bounded<T> read = type == GenericRecord.class ? (Bounded<T>) com.google.cloud.dataflow.sdk.io.Read.from( AvroSource.from(filepattern).withSchema(schema)) : com.google.cloud.dataflow.sdk.io.Read.from( AvroSource.from(filepattern).withSchema(type)); PCollection<T> pcol = input.getPipeline().apply("Read", read); // Honor the default output coder that would have been used by this PTransform. pcol.setCoder(getDefaultOutputCoder()); return pcol; }
/** * Applies {@code ApproximateUnique(sampleSize)} verifying that the estimation error falls within * the maximum allowed error of {@code 2/sqrt(sampleSize)}. */ private void runApproximateUniquePipeline(int sampleSize) { Pipeline p = TestPipeline.create(); PCollection<String> collection = readPCollection(p); final PCollectionView<Long> exact = collection .apply(RemoveDuplicates.<String>create()) .apply(Combine.globally(new CountElements<String>())) .apply(View.<Long>asSingleton()); PCollection<Long> approximate = collection.apply(ApproximateUnique.<String>globally(sampleSize)); PCollection<KV<Long, Long>> approximateAndExact = approximate.apply( ParDo.of( new DoFn<Long, KV<Long, Long>>() { @Override public void processElement(ProcessContext c) { c.output(KV.of(c.element(), c.sideInput(exact))); } }) .withSideInputs(exact)); DataflowAssert.that(approximateAndExact).satisfies(new VerifyEstimatePerKeyFn(sampleSize)); p.run(); }
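// A minimal sketch of the error bound described in the Javadoc above, assuming VerifyEstimatePerKeyFn
// applies the same check: an estimate passes when its relative error does not exceed 2 / sqrt(sampleSize).
// The helper below is illustrative only and is not part of the source.
static void checkEstimateWithinBound(long exactCount, long estimatedCount, int sampleSize) {
  // Maximum relative error allowed for a sample of this size.
  double maxRelativeError = 2.0 / Math.sqrt(sampleSize);
  double relativeError = Math.abs(estimatedCount - exactCount) / (double) exactCount;
  if (relativeError > maxRelativeError) {
    throw new AssertionError(
        "estimate " + estimatedCount + " differs from exact count " + exactCount
            + " by a relative error of " + relativeError
            + ", which exceeds the allowed " + maxRelativeError);
  }
}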
@Override public PCollection<Integer> apply(PCollection<Integer> input) { // Apply an operation so that this is a composite transform. input.apply(Count.<Integer>perElement()); // Return a value unrelated to the input. return input.getPipeline().apply(Create.of(1, 2, 3, 4)); }
@Override public PCollectionTuple apply(PCollection<Integer> input) { PCollection<Integer> sum = input.apply(Sum.integersGlobally()); // Fails here when attempting to construct a tuple with an unbound object. return PCollectionTuple.of(sumTag, sum) .and( doneTag, PCollection.<Void>createPrimitiveOutputInternal( input.getPipeline(), WindowingStrategy.globalDefault(), input.isBounded())); }
@Test public void testPartiallyBoundFailure() throws IOException { Pipeline p = DataflowPipeline.create(buildPipelineOptions()); PCollection<Integer> input = p.begin().apply(Create.of(1, 2, 3)); thrown.expect(IllegalStateException.class); input.apply(new PartiallyBoundOutputCreator()); Assert.fail("Failure expected from use of partially bound output"); }
@Override public PCollectionView<T> apply(PCollection<T> input) { return input.apply( CreatePCollectionView.<T, T>of( PCollectionViews.singletonView( input.getPipeline(), input.getWindowingStrategy(), hasDefault, defaultValue, input.getCoder()))); }
@Test public void testCountConstraint() { Pipeline p = TestPipeline.create(); PCollection<String> input = p.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of())); expectedEx.expect(IllegalArgumentException.class); expectedEx.expectMessage(Matchers.containsString(">= 0")); input.apply(Top.of(-1, new OrderByLength())); }
@Test @Category(RunnableOnService.class) public void testApproximateUniqueWithSmallInput() { Pipeline p = TestPipeline.create(); PCollection<Integer> input = p.apply(Create.of(Arrays.asList(1, 2, 3, 3))); PCollection<Long> estimate = input.apply(ApproximateUnique.<Integer>globally(1000)); DataflowAssert.thatSingleton(estimate).isEqualTo(3L); p.run(); }
@Override protected void testProgram() throws Exception { Pipeline p = FlinkTestPipeline.create(); PCollection<TableRow> input1 = p.apply(Create.of(EVENT_ARRAY)); PCollection<TableRow> input2 = p.apply(Create.of(CC_ARRAY)); PCollection<String> output = JoinExamples.joinEvents(input1, input2); output.apply(TextIO.Write.to(resultPath)); p.run(); }
@Test public void testUnsupportedFilePattern() throws IOException { File outFolder = tmpFolder.newFolder(); // Windows doesn't like resolving paths with * in them. String filename = outFolder.toPath().resolve("output@5").toString(); Pipeline p = TestPipeline.create(); PCollection<String> input = p.apply(Create.of(Arrays.asList(LINES_ARRAY)).withCoder(StringUtf8Coder.of())); expectedException.expect(IllegalArgumentException.class); expectedException.expectMessage("Output name components are not allowed to contain"); input.apply(TextIO.Write.to(filename)); }
public static void main(String[] args) { PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class); KafkaStreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class); options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds"); options.setStreaming(true); options.setCheckpointingInterval(1000L); options.setNumberOfExecutionRetries(5); options.setExecutionRetryDelay(3000L); options.setRunner(FlinkPipelineRunner.class); System.out.println( options.getKafkaTopic() + " " + options.getZookeeper() + " " + options.getBroker() + " " + options.getGroup()); Pipeline pipeline = Pipeline.create(options); Properties p = new Properties(); p.setProperty("zookeeper.connect", options.getZookeeper()); p.setProperty("bootstrap.servers", options.getBroker()); p.setProperty("group.id", options.getGroup()); // this is the Flink consumer that reads the input to // the program from a kafka topic. FlinkKafkaConsumer08<String> kafkaConsumer = new FlinkKafkaConsumer08<>(options.getKafkaTopic(), new SimpleStringSchema(), p); PCollection<String> words = pipeline .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount")) .apply(ParDo.of(new ExtractWordsFn())) .apply( Window.<String>into( FixedWindows.of(Duration.standardSeconds(options.getWindowSize()))) .triggering(AfterWatermark.pastEndOfWindow()) .withAllowedLateness(Duration.ZERO) .discardingFiredPanes()); PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement()); wordCounts.apply(ParDo.of(new FormatAsStringFn())).apply(TextIO.Write.to("./outputKafka.txt")); pipeline.run(); }
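// Hedged invocation sketch (not from the source): PipelineOptionsFactory derives flag names from
// the option getters used above, so a run of this example presumably takes arguments along these
// lines, with placeholder topic/host/port values:
//   --kafkaTopic=words --zookeeper=localhost:2181 --broker=localhost:9092 --group=wordcount --windowSize=10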
@Test public void testMultiGraphPipelineSerialization() throws IOException { Pipeline p = DataflowPipeline.create(buildPipelineOptions()); PCollection<Integer> input = p.begin().apply(Create.of(1, 2, 3)); input.apply(new UnrelatedOutputCreator()); input.apply(new UnboundOutputCreator()); DataflowPipelineTranslator t = DataflowPipelineTranslator.fromOptions( PipelineOptionsFactory.as(DataflowPipelineOptions.class)); // Check that translation doesn't fail. t.translate(p, Collections.<DataflowPackage>emptyList()); }
/** * Merge the statistics from each block. The resulting "collection" contains a single element * with the answer. */ private static PCollection<RecalibrationTables> aggregateStatistics( final PCollection<RecalibrationTables> tables) { return tables // aggregate .apply(Combine.globally(new RecalibrationTablesMerger())) // call finalize on the result .apply( ParDo.named("finalizeRecalTables") .of( new DoFnWLog<RecalibrationTables, RecalibrationTables>("finalizeRecalTables") { private static final long serialVersionUID = 1L; @Override public void processElement(ProcessContext c) throws Exception { RecalibrationTables tables = c.element(); if (null == tables) { // the merger may return null when there are no inputs at all. In that // case we don't want to // crash (though it's really an edge case). log.warn("No recalibration tables!"); } else { // normal case: finalize the recalibration tables BaseRecalibrationEngine.finalizeRecalibrationTables(tables); } c.output(tables); } })); }
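// A minimal, self-contained illustration (not part of the source) of the Combine.globally pattern
// used in aggregateStatistics above: many input elements collapse into a single-element
// PCollection, which can then be inspected with DataflowAssert.thatSingleton.
@Test
public void testGloballyCombinedSum() {
  Pipeline p = TestPipeline.create();
  PCollection<Integer> total = p.apply(Create.of(1, 2, 3)).apply(Sum.integersGlobally());
  DataflowAssert.thatSingleton(total).isEqualTo(6);
  p.run();
}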
@Override public PDone apply(PCollection<T> input) { if (topic == null) { throw new IllegalStateException("need to set the topic of a PubsubIO.Write transform"); } return PDone.in(input.getPipeline()); }
@Test public void testTopEmptyWithIncompatibleWindows() { Pipeline p = TestPipeline.create(); Bound<String> windowingFn = Window.<String>into(FixedWindows.of(Duration.standardDays(10L))); PCollection<String> input = p.apply(Create.timestamped(Collections.<String>emptyList(), Collections.<Long>emptyList())) .apply(windowingFn); expectedEx.expect(IllegalStateException.class); expectedEx.expectMessage("Top"); expectedEx.expectMessage("GlobalWindows"); expectedEx.expectMessage("withoutDefaults"); expectedEx.expectMessage("asSingletonView"); input.apply(Top.of(1, new OrderByLength())); }
private void runApproximateUniqueWithDuplicates( int elementCount, int uniqueCount, int sampleSize) { assert elementCount >= uniqueCount; List<Double> elements = Lists.newArrayList(); for (int i = 0; i < elementCount; i++) { elements.add(1.0 / (i % uniqueCount + 1)); } Collections.shuffle(elements); Pipeline p = TestPipeline.create(); PCollection<Double> input = p.apply(Create.of(elements)); PCollection<Long> estimate = input.apply(ApproximateUnique.<Double>globally(sampleSize)); DataflowAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize)); p.run(); }
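// Worked example of the duplicate construction above (illustrative numbers only): with
// uniqueCount = 4, the expression 1.0 / (i % uniqueCount + 1) cycles through 1.0, 0.5, 0.333..., 0.25,
// so the generated list holds exactly uniqueCount distinct values, given the asserted
// elementCount >= uniqueCount.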
@Override public PCollection<RecalibrationTables> apply( PCollection<AddContextDataToReadOptimized.ContextShard> input) { PCollection<RecalibrationTables> oneStatPerWorker = input.apply( ParDo.named("BaseRecalibrator") .withSideInputs(headerView, refDictionary) .of(new BaseRecalibratorOptimizedFn(headerView, refDictionary, recalArgs))); return aggregateStatistics(oneStatPerWorker); }
<T> void runTestWrite(T[] elems, Coder<T> coder) throws Exception { File tmpFile = tmpFolder.newFile("file.txt"); String filename = tmpFile.getPath(); Pipeline p = TestPipeline.create(); PCollection<T> input = p.apply(Create.of(Arrays.asList(elems)).withCoder(coder)); TextIO.Write.Bound<T> write; if (coder.equals(StringUtf8Coder.of())) { TextIO.Write.Bound<String> writeStrings = TextIO.Write.to(filename).withoutSharding(); // T==String write = (TextIO.Write.Bound<T>) writeStrings; } else { write = TextIO.Write.to(filename).withCoder(coder).withoutSharding(); } input.apply(write); p.run(); List<String> actual = new ArrayList<>(); try (BufferedReader reader = new BufferedReader(new FileReader(tmpFile))) { for (; ; ) { String line = reader.readLine(); if (line == null) { break; } actual.add(line); } } String[] expected = new String[elems.length]; for (int i = 0; i < elems.length; i++) { T elem = elems[i]; byte[] encodedElem = CoderUtils.encodeToByteArray(coder, elem); String line = new String(encodedElem); expected[i] = line; } assertThat(actual, containsInAnyOrder(expected)); }
/** * Takes a few Reads and writes them to a BAM file. The Reads don't have to be sorted * initially; the output BAM file will be sorted. All the reads must fit into a single worker's memory, so this * won't work well when there are too many reads. * * @param pipeline the pipeline to add this operation to. * @param reads the reads to write (they don't need to be sorted). * @param header the header that corresponds to the reads. * @param destPath the GCS or local path to write to (must start with "gs://" if writing to GCS). * @param parquet whether to write out BAM or Parquet data (BDG AlignmentRecords); only applies * when writing to Hadoop */ public static void writeToFile( Pipeline pipeline, PCollection<GATKRead> reads, final SAMFileHeader header, final String destPath, final boolean parquet) { if (BucketUtils.isHadoopUrl(destPath) || pipeline.getRunner().getClass().equals(SparkPipelineRunner.class)) { writeToHadoop(pipeline, reads, header, destPath, parquet); } else { PCollectionView<Iterable<GATKRead>> iterableView = reads.apply(View.<GATKRead>asIterable()); PCollection<String> dummy = pipeline.apply("output file name", Create.<String>of(destPath)); dummy.apply( ParDo.named("save to BAM file") .withSideInputs(iterableView) .of(new SaveToBAMFile(header, iterableView))); } }
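// Hedged usage sketch: the wrapper below and the origin of its arguments are assumptions; only the
// writeToFile signature above comes from the source. A Hadoop destPath (or the Spark runner) would
// route the write through writeToHadoop instead of the local SaveToBAMFile branch.
public static void writeExample(
    Pipeline pipeline, PCollection<GATKRead> reads, SAMFileHeader header) {
  // Write plain BAM (parquet == false) to a local path; all reads are gathered on one worker.
  writeToFile(pipeline, reads, header, "/tmp/example-output.bam", false);
  pipeline.run();
}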
@Test public void testWriteSharded() throws IOException { File outFolder = tmpFolder.newFolder(); String filename = outFolder.toPath().resolve("output").toString(); Pipeline p = TestPipeline.create(); PCollection<String> input = p.apply(Create.of(Arrays.asList(LINES_ARRAY)).withCoder(StringUtf8Coder.of())); input.apply(TextIO.Write.to(filename).withNumShards(2).withSuffix(".txt")); p.run(); String[] files = outFolder.list(); assertThat( Arrays.asList(files), containsInAnyOrder("output-00000-of-00002.txt", "output-00001-of-00002.txt")); }
private void runApproximateUniqueWithSkewedDistributions( int elementCount, final int uniqueCount, final int sampleSize) { List<Integer> elements = Lists.newArrayList(); // Zipf distribution with approximately elementCount items. double s = 1 - 1.0 * uniqueCount / elementCount; double maxCount = Math.pow(uniqueCount, s); for (int k = 0; k < uniqueCount; k++) { int count = Math.max(1, (int) Math.round(maxCount * Math.pow(k, -s))); // Element k occurs count times. for (int c = 0; c < count; c++) { elements.add(k); } } Pipeline p = TestPipeline.create(); PCollection<Integer> input = p.apply(Create.of(elements)); PCollection<Long> estimate = input.apply(ApproximateUnique.<Integer>globally(sampleSize)); DataflowAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize)); p.run(); }
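// Worked example of the skew above (illustrative numbers, not from the source): with
// elementCount = 1000 and uniqueCount = 100, s = 1 - 100/1000 = 0.9 and maxCount = 100^0.9,
// roughly 63, so element k = 1 is added about 63 times, k = 2 about 34 times, k = 10 about
// 8 times, and the largest k values only once each.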
@Override public PCollection<GATKRead> apply(PCollection<GATKRead> input) { return input.apply( ParDo.named("ApplyBQSR") .of( new DoFnWLog<GATKRead, GATKRead>("ApplyBQSRStub") { private static final long serialVersionUID = 1L; @Override public void processElement(ProcessContext c) throws Exception { c.output(c.element()); } }) .withSideInputs(header, recalibrationReport)); }
@Override public PCollection<T> apply(PInput input) { if (topic == null && subscription == null) { throw new IllegalStateException( "need to set either the topic or the subscription for " + "a PubsubIO.Read transform"); } if (topic != null && subscription != null) { throw new IllegalStateException( "Can't set both the topic and the subscription for a " + "PubsubIO.Read transform"); } return PCollection.<T>createPrimitiveOutputInternal( input.getPipeline(), WindowingStrategy.globalDefault(), IsBounded.UNBOUNDED) .setCoder(coder); }
private static void writeToHadoop( Pipeline pipeline, PCollection<GATKRead> reads, final SAMFileHeader header, final String destPath, final boolean parquet) { if (destPath.equals("/dev/null")) { return; } String headerString = Base64.getEncoder().encodeToString(SerializableUtils.serializeToByteArray(header)); @SuppressWarnings("unchecked") Class<? extends FileOutputFormat<NullWritable, SAMRecordWritable>> outputFormatClass = (Class<? extends FileOutputFormat<NullWritable, SAMRecordWritable>>) (Class<?>) TemplatedKeyIgnoringBAMOutputFormat.class; @SuppressWarnings("unchecked") HadoopIO.Write.Bound<NullWritable, SAMRecordWritable> write = HadoopIO.Write.to(destPath, outputFormatClass, NullWritable.class, SAMRecordWritable.class) .withConfigurationProperty( TemplatedKeyIgnoringBAMOutputFormat.SAM_HEADER_PROPERTY_NAME, headerString); PCollection<KV<NullWritable, SAMRecordWritable>> samReads = reads .apply( ParDo.of( new DoFn<GATKRead, KV<NullWritable, SAMRecordWritable>>() { private static final long serialVersionUID = 1L; @Override public void processElement(ProcessContext c) throws Exception { SAMRecord samRecord = c.element().convertToSAMRecord(header); SAMRecordWritable samRecordWritable = new SAMRecordWritable(); samRecordWritable.set(samRecord); c.output(KV.of(NullWritable.get(), samRecordWritable)); } })) .setCoder( KvCoder.of( WritableCoder.of(NullWritable.class), WritableCoder.of(SAMRecordWritable.class))); // write as a single (unsharded) file samReads.apply(write.withoutSharding()); }
/** * addQuantizationInfo takes the computed RecalibrationTables and adds the QuantizationInfo and * RequestedCovariates objects. We call this triplet "BaseRecalOutput". It contains everything we * need from phase 1 to continue on to phase 2 of BQSR. */ private static PCollection<BaseRecalOutput> addQuantizationInfo( PCollectionView<SAMFileHeader> headerView, RecalibrationArgumentCollection recalArgs, PCollection<RecalibrationTables> recal) { return recal.apply( ParDo.named("addQuantizationInfo") .withSideInputs(headerView) .of( new DoFnWLog<RecalibrationTables, BaseRecalOutput>("addQuantizationInfo") { private static final long serialVersionUID = 1L; @Override public void processElement(ProcessContext c) throws IOException { RecalibrationTables rt = c.element(); SAMFileHeader header = c.sideInput(headerView); // BaseRecalOutput ret = new BaseRecalOutput(rt, // baseRecalibratorWorker.getQuantizationInfo(rt), // baseRecalibratorWorker.getRequestedCovariates()); // Saving and loading back the report actually changes it, so we have to do the save/reload round trip here. // TODO(issue#799): Figure out what it changes, and just do that instead of // doing the whole rigamarole. File temp = IOUtils.createTempFile("temp-recalibrationtable-", ".tmp"); if (null == rt) { // special case where we have zero reads in the input. Create a valid empty // report. log.debug("Special case: zero reads in input."); BaseRecalibrationEngine recalibrationEngine = new BaseRecalibrationEngine(recalArgs, header); rt = recalibrationEngine.getRecalibrationTables(); BaseRecalibrationEngine.finalizeRecalibrationTables(rt); } try { BaseRecalibratorOptimizedFn.saveTextualReport(temp, header, rt, recalArgs); BaseRecalOutput ret = new BaseRecalOutput(temp); c.output(ret); } catch (FileNotFoundException e) { throw new GATKException("can't find my own temporary file", e); } catch (IOException e) { throw new GATKException( "unable to save temporary report to " + temp.getPath(), e); } } })); }