@Override
public void processElement(ProcessContext c) throws Exception {
    String dest = c.element();
    logger.info("Saving to " + dest);
    // Materialize the whole collection of reads via the side-input view on this single worker.
    Iterable<GATKRead> reads = c.sideInput(iterableView);
    OutputStream outputStream = BucketUtils.createFile(dest, c.getPipelineOptions());
    // presorted=false: the writer sorts the records according to the header's sort order.
    try (SAMFileWriter writer = new SAMFileWriterFactory().makeBAMWriter(header, false, outputStream)) {
        for (GATKRead r : reads) {
            final SAMRecord sr = r.convertToSAMRecord(header);
            writer.addAlignment(sr);
        }
    }
}
/**
 * Takes a few Reads and writes them to a BAM file. The reads don't have to be sorted
 * initially; the BAM file will be. All the reads must fit into a single worker's memory,
 * so this won't go well if you have too many.
 *
 * @param pipeline the pipeline to add this operation to.
 * @param reads    the reads to write (they don't need to be sorted).
 * @param header   the header that corresponds to the reads.
 * @param destPath the GCS or local path to write to (must start with "gs://" if writing to GCS).
 * @param parquet  whether to write out BAM or Parquet data (BDG AlignmentRecords); only applies
 *                 when writing to Hadoop
 */
public static void writeToFile(
        Pipeline pipeline, PCollection<GATKRead> reads, final SAMFileHeader header,
        final String destPath, final boolean parquet) {
    if (BucketUtils.isHadoopUrl(destPath)
            || pipeline.getRunner().getClass().equals(SparkPipelineRunner.class)) {
        // Hadoop destinations (and the Spark runner) are handled separately and may emit Parquet.
        writeToHadoop(pipeline, reads, header, destPath, parquet);
    } else {
        // Collect every read into one side-input view, then have a single worker write the BAM.
        PCollectionView<Iterable<GATKRead>> iterableView = reads.apply(View.<GATKRead>asIterable());
        PCollection<String> dummy = pipeline.apply("output file name", Create.<String>of(destPath));
        dummy.apply(ParDo.named("save to BAM file")
                .withSideInputs(iterableView)
                .of(new SaveToBAMFile(header, iterableView)));
    }
}
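// A minimal usage sketch, not part of the original class: it shows how writeToFile might be
// invoked from caller code. The helper name exampleWriteToFile, the GCS output path, and the
// assumption that a PCollection of reads and a matching header are already in hand are all
// illustrative; only writeToFile and its parameters come from the method above.
private static void exampleWriteToFile(final PCollection<GATKRead> reads, final SAMFileHeader header) {
    final Pipeline pipeline = reads.getPipeline();
    // A gs:// path is not a Hadoop URL, so (assuming the runner is not SparkPipelineRunner)
    // this takes the side-input BAM branch above; parquet only applies to Hadoop output,
    // so it is passed as false here.
    writeToFile(pipeline, reads, header, "gs://my-bucket/output/reads.bam", false);
    pipeline.run();
}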