@BeforeTest
  /**
   * Prepares the BAM fixtures shared by the tests: one built from three read groups (multi-level
   * collection) and one built from a single read group (window-count comparison).
   *
   * @throws IOException if a temporary file cannot be created or a fixture cannot be built
   */
  void setupBuilder() throws IOException {
    // Placeholder files for the two fixtures; both are scheduled for removal at JVM exit.
    tempSamFileChrM_O = File.createTempFile("CollectGcBias", ".bam", TEST_DIR);
    tempSamFileAllChr = File.createTempFile("CollectGcBias", ".bam", TEST_DIR);
    tempSamFileChrM_O.deleteOnExit();
    tempSamFileAllChr.deleteOnExit();

    // Scratch file handed to build() for both fixtures.
    final File unsortedScratchFile = File.createTempFile("CollectGcBias", ".bam", TEST_DIR);
    unsortedScratchFile.deleteOnExit();

    // One header is shared by every record set builder below.
    final SAMFileHeader sharedHeader = new SAMFileHeader();
    try {
      sharedHeader.setSequenceDictionary(SAMSequenceDictionaryExtractor.extractDictionary(dict));
      sharedHeader.setSortOrder(SAMFileHeader.SortOrder.unsorted);
    } catch (final SAMException e) {
      // Failure is only reported; setup continues with whatever the header holds at this point.
      e.printStackTrace();
    }

    // Three levels that end up in the same BAM file for testing multi-level collection.
    final List<SAMRecordSetBuilder> multiLevelBuilders = new ArrayList<SAMRecordSetBuilder>();

    // Sample 1, Library 1, RG 1
    setupTest1(1, readGroupId1, readGroupRecord1, sample1, library1, sharedHeader, setBuilder1);
    multiLevelBuilders.add(setBuilder1);

    // Sample 1, Library 2, RG 2
    setupTest1(2, readGroupId2, readGroupRecord2, sample1, library2, sharedHeader, setBuilder2);
    multiLevelBuilders.add(setBuilder2);

    // Sample 2, Library 3, RG 3
    setupTest1(3, readGroupId3, readGroupRecord3, sample2, library3, sharedHeader, setBuilder3);
    multiLevelBuilders.add(setBuilder3);

    // One more read group, used to check that the window count stays the same whether or not all
    // contigs are used.
    final List<SAMRecordSetBuilder> allContigBuilders = new ArrayList<SAMRecordSetBuilder>();
    setupTest2(1, readGroupId1, readGroupRecord1, sample1, library1, sharedHeader, setBuilder4);
    allContigBuilders.add(setBuilder4);

    // The fixture fields are replaced by whatever build() returns.
    tempSamFileChrM_O = build(multiLevelBuilders, unsortedScratchFile, sharedHeader);
    tempSamFileAllChr = build(allContigBuilders, unsortedScratchFile, sharedHeader);
  }
Exemplo n.º 2
0
  /**
   * Opens the BAM file behind the given split, positions the decoder at the split's start
   * virtual offset, and resets the pairing and interval-filter state.
   *
   * @param spl the split to read; cast to {@code FileVirtualSplit}
   * @param ctx task context supplying the configuration
   * @throws IOException if the file cannot be opened, read, or seeked
   */
  @Override
  public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    // Per the Hadoop API this should run only once, but some callers invoke initialize()
    // themselves while Hadoop-BAM's own code relies on {@link BAMInputFormat} doing it when the
    // reader is created. Tolerate a repeated call by closing the previous state first.
    if (isInitialized) {
      close();
    }
    isInitialized = true;

    final Configuration configuration = ctx.getConfiguration();
    final FileVirtualSplit virtualSplit = (FileVirtualSplit) spl;
    final Path bamPath = virtualSplit.getPath();
    final FileSystem fileSystem = bamPath.getFileSystem(configuration);

    this.stringency = SAMHeaderReader.getValidationStringency(configuration);

    final FSDataInputStream rawStream = fileSystem.open(bamPath);

    // The header is read first so the record codec can be built from it.
    final SAMFileHeader samHeader = SAMHeaderReader.readSAMHeaderFrom(rawStream, configuration);
    codec = new BAMRecordCodec(samHeader);

    // Rewind, then wrap the raw stream for block-compressed access.
    rawStream.seek(0);
    final long fileLength = fileSystem.getFileStatus(bamPath).getLen();
    bci =
        new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(rawStream, fileLength, bamPath));

    virtualStart = virtualSplit.getStartVirtualOffset();
    virtualEnd = virtualSplit.getEndVirtualOffset();
    // Everything above the low 16 bits of the virtual offset is kept as the file byte offset.
    fileStart = virtualStart >>> 16;

    bci.seek(virtualStart);
    codec.setInputStream(bci);

    if (BAMInputFormat.DEBUG_BAM_SPLITTER) {
      // The low 16 bits are reported as the record offset.
      final long recordStart = virtualStart & 0xffff;
      System.err.println(
          "XXX inizialized BAMRecordReader byte offset: "
              + fileStart
              + " record offset: "
              + recordStart);
    }

    // Pairs are only kept together for queryname-sorted input, and only when requested.
    final boolean querynameSorted = SortOrder.queryname.equals(samHeader.getSortOrder());
    keepReadPairsTogether =
        querynameSorted
            && configuration.getBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, false);
    readPair = false;
    lastOfPair = false;

    // Build the overlap detector only when intervals were configured.
    intervals = BAMInputFormat.getIntervals(configuration);
    if (intervals != null) {
      overlapDetector = new OverlapDetector<>(0, 0);
      overlapDetector.addAll(intervals, intervals);
    }
  }
  /**
   * Resolves the four output files, collects sample and library names from the header's read
   * groups, loads the optional interval and dbSNP masks, assembles the record filter, and creates
   * one artifact counter per library.
   *
   * @param header header of the input being processed
   * @param samFile the input file (not used by this method)
   */
  @Override
  protected void setup(final SAMFileHeader header, final File samFile) {
    // Optional suffix appended to each output file name.
    final String suffix = (FILE_EXTENSION == null) ? "" : FILE_EXTENSION;

    preAdapterSummaryOut =
        new File(OUTPUT + SequencingArtifactMetrics.PRE_ADAPTER_SUMMARY_EXT + suffix);
    preAdapterDetailsOut =
        new File(OUTPUT + SequencingArtifactMetrics.PRE_ADAPTER_DETAILS_EXT + suffix);
    baitBiasSummaryOut =
        new File(OUTPUT + SequencingArtifactMetrics.BAIT_BIAS_SUMMARY_EXT + suffix);
    baitBiasDetailsOut =
        new File(OUTPUT + SequencingArtifactMetrics.BAIT_BIAS_DETAILS_EXT + suffix);

    // Check every output is writable before doing any further work.
    IOUtil.assertFileIsWritable(preAdapterSummaryOut);
    IOUtil.assertFileIsWritable(preAdapterDetailsOut);
    IOUtil.assertFileIsWritable(baitBiasSummaryOut);
    IOUtil.assertFileIsWritable(baitBiasDetailsOut);

    // Read groups lacking a sample or library fall back to the UNKNOWN_* placeholders.
    for (final SAMReadGroupRecord readGroup : header.getReadGroups()) {
      samples.add(getOrElse(readGroup.getSample(), UNKNOWN_SAMPLE));
      libraries.add(getOrElse(readGroup.getLibrary(), UNKNOWN_LIBRARY));
    }

    if (INTERVALS != null) {
      IOUtil.assertFileIsReadable(INTERVALS);
      intervalMask =
          new IntervalListReferenceSequenceMask(IntervalList.fromFile(INTERVALS).uniqued());
    }

    if (DB_SNP != null) {
      IOUtil.assertFileIsReadable(DB_SNP);
      dbSnpMask = new DbSnpBitSetUtil(DB_SNP, header.getSequenceDictionary());
    }

    // Record-level filters, combined into a single aggregate filter.
    final List<SamRecordFilter> recordFilters = new ArrayList<SamRecordFilter>();
    recordFilters.add(new FailsVendorReadQualityFilter());
    recordFilters.add(new NotPrimaryAlignmentFilter());
    recordFilters.add(new DuplicateReadFilter());
    recordFilters.add(new AlignedFilter(true)); // discard unmapped reads
    recordFilters.add(new MappingQualityFilter(MINIMUM_MAPPING_QUALITY));
    if (!INCLUDE_UNPAIRED) {
      // A maximum of 0 is treated as "no upper bound".
      int maxInsertSize = MAXIMUM_INSERT_SIZE;
      if (maxInsertSize == 0) {
        maxInsertSize = Integer.MAX_VALUE;
      }
      recordFilters.add(new InsertSizeFilter(MINIMUM_INSERT_SIZE, maxInsertSize));
    }
    recordFilter = new AggregateFilter(recordFilters);

    // Every library's counter carries the same comma-joined list of all samples.
    final String joinedSamples = StringUtil.join(",", new ArrayList<String>(samples));
    for (final String libraryName : libraries) {
      final ArtifactCounter counter =
          new ArtifactCounter(joinedSamples, libraryName, CONTEXT_SIZE, TANDEM_READS);
      artifactCounters.put(libraryName, counter);
    }
  }
  /**
   * Ensure that basic read group splitting works: seven reads are tagged with one of two read
   * groups, and each per-read-group pileup must contain exactly its own reads, in input order.
   */
  @Test
  public void testSplitByReadGroup() {
    final SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1");
    final SAMReadGroupRecord readGroupTwo = new SAMReadGroupRecord("rg2");

    final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
    header.addReadGroup(readGroupOne);
    header.addReadGroup(readGroupTwo);

    // Read group owning read1..read7, in that order.
    final SAMReadGroupRecord[] owners = {
      readGroupOne, readGroupTwo, readGroupOne, readGroupTwo, readGroupTwo, readGroupOne,
      readGroupOne
    };
    final GATKSAMRecord[] reads = new GATKSAMRecord[owners.length];
    for (int i = 0; i < owners.length; i++) {
      reads[i] = ArtificialSAMUtils.createArtificialRead(header, "read" + (i + 1), 0, 1, 10);
      reads[i].setAttribute("RG", owners[i].getId());
    }

    final ReadBackedPileup pileup =
        new ReadBackedPileupImpl(
            null, Arrays.asList(reads), Arrays.asList(1, 1, 1, 1, 1, 1, 1));

    // rg1 must hold read1, read3, read6 and read7.
    final int[] expectedInRg1 = {0, 2, 5, 6};
    final List<GATKSAMRecord> rg1Reads = pileup.getPileupForReadGroup("rg1").getReads();
    Assert.assertEquals(rg1Reads.size(), 4, "Wrong number of reads in read group rg1");
    for (int i = 0; i < expectedInRg1.length; i++) {
      final GATKSAMRecord expected = reads[expectedInRg1[i]];
      Assert.assertEquals(
          rg1Reads.get(i),
          expected,
          "Read " + expected.getReadName() + " should be in rg1 but isn't");
    }

    // rg2 must hold read2, read4 and read5.
    final int[] expectedInRg2 = {1, 3, 4};
    final List<GATKSAMRecord> rg2Reads = pileup.getPileupForReadGroup("rg2").getReads();
    Assert.assertEquals(rg2Reads.size(), 3, "Wrong number of reads in read group rg2");
    for (int i = 0; i < expectedInRg2.length; i++) {
      final GATKSAMRecord expected = reads[expectedInRg2[i]];
      Assert.assertEquals(
          rg2Reads.get(i),
          expected,
          "Read " + expected.getReadName() + " should be in rg2 but isn't");
    }
  }
  /**
   * Ensure that splitting read groups still works when dealing with a sample-split pileup: two
   * per-sample pileups are combined into a composite pileup, which must then yield the right
   * reads for each read group.
   */
  @Test
  public void testSplitBySample() {
    SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1");
    readGroupOne.setSample("sample1");
    SAMReadGroupRecord readGroupTwo = new SAMReadGroupRecord("rg2");
    readGroupTwo.setSample("sample2");

    SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
    header.addReadGroup(readGroupOne);
    header.addReadGroup(readGroupTwo);

    // read1 and read3 belong to rg1/sample1; read2 and read4 belong to rg2/sample2.
    GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, 10);
    read1.setAttribute("RG", readGroupOne.getId());
    GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 1, 10);
    read2.setAttribute("RG", readGroupTwo.getId());
    GATKSAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header, "read3", 0, 1, 10);
    read3.setAttribute("RG", readGroupOne.getId());
    GATKSAMRecord read4 = ArtificialSAMUtils.createArtificialRead(header, "read4", 0, 1, 10);
    read4.setAttribute("RG", readGroupTwo.getId());

    ReadBackedPileupImpl sample1Pileup =
        new ReadBackedPileupImpl(null, Arrays.asList(read1, read3), Arrays.asList(1, 1));
    ReadBackedPileupImpl sample2Pileup =
        new ReadBackedPileupImpl(null, Arrays.asList(read2, read4), Arrays.asList(1, 1));
    Map<String, ReadBackedPileupImpl> sampleToPileupMap =
        new HashMap<String, ReadBackedPileupImpl>();
    sampleToPileupMap.put(readGroupOne.getSample(), sample1Pileup);
    sampleToPileupMap.put(readGroupTwo.getSample(), sample2Pileup);

    ReadBackedPileup compositePileup = new ReadBackedPileupImpl(null, sampleToPileupMap);

    ReadBackedPileup rg1Pileup = compositePileup.getPileupForReadGroup("rg1");
    List<GATKSAMRecord> rg1Reads = rg1Pileup.getReads();

    Assert.assertEquals(rg1Reads.size(), 2, "Wrong number of reads in read group rg1");
    Assert.assertEquals(
        rg1Reads.get(0), read1, "Read " + read1.getReadName() + " should be in rg1 but isn't");
    Assert.assertEquals(
        rg1Reads.get(1), read3, "Read " + read3.getReadName() + " should be in rg1 but isn't");

    ReadBackedPileup rg2Pileup = compositePileup.getPileupForReadGroup("rg2");
    List<GATKSAMRecord> rg2Reads = rg2Pileup.getReads();

    // Check the rg2 list itself (previously this re-checked the rg1 list under an rg2 message).
    Assert.assertEquals(rg2Reads.size(), 2, "Wrong number of reads in read group rg2");
    Assert.assertEquals(
        rg2Reads.get(0), read2, "Read " + read2.getReadName() + " should be in rg2 but isn't");
    Assert.assertEquals(
        rg2Reads.get(1), read4, "Read " + read4.getReadName() + " should be in rg2 but isn't");
  }
Exemplo n.º 6
0
  /**
   * Looks up the index, within the given header, of the contig the read's mate is on.
   *
   * @param read read whose mate's reference index to look up
   * @param header SAM header defining contig indices
   * @return {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX} when the read reports its mate as
   *     unmapped; otherwise whatever the header returns as the sequence index of the mate's
   *     contig
   */
  public static int getMateReferenceIndex(final GATKRead read, final SAMFileHeader header) {
    return read.mateIsUnmapped()
        ? SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
        : header.getSequenceIndex(read.getMateContig());
  }
Exemplo n.º 7
0
  /**
   * Saves the metrics and, when histograms exist and a chart output is configured, runs the R
   * script that plots them.
   *
   * @param metrics metrics to save and chart
   * @param readsHeader header of the reads; its read groups determine the plot subtitle
   * @param inputFileName input name passed through to the R script
   */
  private void saveResults(
      final MetricsFile<?, Integer> metrics,
      final SAMFileHeader readsHeader,
      final String inputFileName) {
    MetricsUtils.saveMetrics(metrics, out, getAuthHolder());

    // Nothing to plot without histograms.
    if (metrics.getAllHistograms().isEmpty()) {
      logger.warn("No valid bases found in input file.");
      return;
    }
    // Charting is optional.
    if (chartOutput == null) {
      return;
    }

    // With exactly one read group, its library name (or "" if absent) becomes the subtitle;
    // otherwise the subtitle stays empty.
    final List<SAMReadGroupRecord> readGroups = readsHeader.getReadGroups();
    final String plotSubtitle =
        (readGroups.size() == 1)
            ? StringUtil.asEmptyIfNull(readGroups.get(0).getLibrary())
            : "";

    // Run R to generate the chart.
    final RScriptExecutor rRunner = new RScriptExecutor();
    rRunner.addScript(new Resource(MeanQualityByCycle.R_SCRIPT, MeanQualityByCycle.class));
    rRunner.addArgs(out, chartOutput.getAbsolutePath(), inputFileName, plotSubtitle);
    rRunner.exec();
  }
  /**
   * Construct an artificial SAM file reader with the given SAM file header.
   *
   * <p>The superclass is initialized from {@code createEmptyInputStream()}; the header and reads
   * supplied here are stored on this instance instead.
   *
   * @param customHeader Header that should be returned by calls to getFileHeader() on this reader.
   *     Must not be null: its sequence dictionary is read immediately.
   * @param reads Reads to use as backing data source.
   */
  public ArtificialSAMFileReader(SAMFileHeader customHeader, SAMRecord... reads) {
    // NOTE(review): the meaning of the boolean flag is defined by the superclass constructor,
    // which is not visible here -- confirm before changing it.
    super(createEmptyInputStream(), true);

    this.customHeader = customHeader;
    // The parser is built from the custom header's sequence dictionary.
    this.genomeLocParser = new GenomeLocParser(customHeader.getSequenceDictionary());
    // Arrays.asList wraps the varargs array in a fixed-size list rather than copying it.
    this.reads = Arrays.asList(reads);
  }
Exemplo n.º 9
0
 /**
  * Returns the names of the sequences in this reader's file header, in dictionary order.
  *
  * <p>The list is computed on first use and cached in {@code sequenceNames}. The cache is only
  * assigned once the list is fully built, so a failure part-way through does not leave a
  * partial result behind.
  *
  * @return the cached list of sequence names (empty if the dictionary has no sequences), or
  *     {@code null} if no file header is available
  */
 public List<String> getSequenceNames() {
   if (sequenceNames == null) {
     final SAMFileHeader header = getFileHeader();
     if (header == null) {
       return null;
     }
     // Fetch the records once and reuse them for both sizing and iteration.
     final List<SAMSequenceRecord> records = header.getSequenceDictionary().getSequences();
     final List<String> names = new ArrayList<String>(records.size());
     for (final SAMSequenceRecord rec : records) {
       names.add(rec.getSequenceName());
     }
     sequenceNames = names;
   }
   return sequenceNames;
 }
  /**
   * Adds the test comments to a BAM file and checks that the rewritten header contains exactly
   * those comments, each carrying the header codec's comment prefix.
   *
   * @throws Exception if the temporary output file cannot be created or the tool run fails
   */
  @Test
  public void testAddCommentsToBam() throws Exception {
    final File outputFile =
        File.createTempFile("addCommentsToBamTest.", BamFileIoUtils.BAM_FILE_EXTENSION);
    // Do not leave the temporary BAM behind after the test run.
    outputFile.deleteOnExit();
    runIt(BAM_FILE, outputFile, commentList);

    final SAMFileHeader newHeader = SamReaderFactory.makeDefault().getFileHeader(outputFile);

    // The original comments are massaged when they're added to the header. Perform the same
    // massaging here, and then compare the lists.
    final List<String> massagedComments = new LinkedList<>();
    for (final String comment : commentList) {
      massagedComments.add(SAMTextHeaderCodec.COMMENT_PREFIX + comment);
    }

    Assert.assertEquals(newHeader.getComments(), massagedComments);
  }
Exemplo n.º 11
0
 /**
  * Checks {@code IndelRealigner.realignmentProducesBadAlignment} for a two-base read starting
  * one position before the contig end: a "2M" CIGAR is accepted, while "1M1D1M" is flagged as a
  * bad alignment.
  */
 @Test
 public void realignAtContigBorderTest() {
   final int contigEnd = header.getSequence(0).getSequenceLength();
   final GATKSAMRecord borderRead =
       ArtificialSAMUtils.createArtificialRead(header, "goodRead", 0, contigEnd - 1, 2);

   borderRead.setCigarString("2M");
   Assert.assertFalse(IndelRealigner.realignmentProducesBadAlignment(borderRead, contigEnd));

   borderRead.setCigarString("1M1D1M");
   Assert.assertTrue(IndelRealigner.realignmentProducesBadAlignment(borderRead, contigEnd));
 }
Exemplo n.º 12
0
 /**
  * Checks that a read's alignment is consistent with the contents of the header.
  *
  * <p>Unmapped reads always agree. A mapped read agrees when its contig resolves to a reference
  * index in the header and its start position does not lie past that contig's length.
  *
  * @param header The SAM file header.
  * @param read The read to verify.
  * @return true if alignment agrees with header, false otherwise.
  */
 public static boolean alignmentAgreesWithHeader(final SAMFileHeader header, final GATKRead read) {
   final int referenceIndex = getReferenceIndex(read, header);
   final boolean mapped = !read.isUnmapped();

   // A mapped read whose contig has no index in the header points at a nonexistent contig.
   if (mapped && referenceIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
     return false;
   }

   final SAMSequenceRecord contigHeader = header.getSequence(referenceIndex);
   if (!mapped) {
     return true;
   }
   // A mapped read must start at or before the end of its contig.
   return read.getStart() <= contigHeader.getSequenceLength();
 }
Exemplo n.º 13
0
  /**
   * Builds a composite pileup from two single-read, per-sample pileups and checks that a known
   * sample yields its own read while unknown sample names yield {@code null}.
   */
  @Test
  public void testGetPileupForSample() {
    final String firstSample = "sample1";
    final String secondSample = "sample2";

    final SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1");
    readGroupOne.setSample(firstSample);
    final SAMReadGroupRecord readGroupTwo = new SAMReadGroupRecord("rg2");
    readGroupTwo.setSample(secondSample);

    final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
    header.addReadGroup(readGroupOne);
    header.addReadGroup(readGroupTwo);

    // One read per read group, and therefore one per sample.
    final GATKSAMRecord firstRead =
        ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, 10);
    firstRead.setAttribute("RG", readGroupOne.getId());
    final GATKSAMRecord secondRead =
        ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 1, 10);
    secondRead.setAttribute("RG", readGroupTwo.getId());

    final Map<String, ReadBackedPileupImpl> pileupsBySample =
        new HashMap<String, ReadBackedPileupImpl>();
    pileupsBySample.put(
        firstSample, new ReadBackedPileupImpl(null, Collections.singletonList(firstRead), 0));
    pileupsBySample.put(
        secondSample, new ReadBackedPileupImpl(null, Collections.singletonList(secondRead), 0));

    final ReadBackedPileup pileup = new ReadBackedPileupImpl(null, pileupsBySample);

    // A known sample returns exactly its own read.
    final ReadBackedPileup secondSamplePileup = pileup.getPileupForSample(secondSample);
    Assert.assertEquals(
        secondSamplePileup.getNumberOfElements(),
        1,
        "Sample 2 pileup has wrong number of elements");
    Assert.assertEquals(
        secondSamplePileup.getReads().get(0), secondRead, "Sample 2 pileup has incorrect read");

    // Unknown sample names must produce null rather than an empty pileup.
    for (final String absentSample : new String[] {"missing", "not here"}) {
      Assert.assertNull(
          pileup.getPileupForSample(absentSample),
          "Pileup for sample '" + absentSample + "' should be null but isn't");
    }
  }
  /**
   * Assembles the pipeline: load reads, mark duplicates, join reads with reference and variant
   * context, compute the recalibration tables, apply them to the duplicate-marked reads, and
   * write the result to {@code output}.
   *
   * @param pipeline the pipeline to add the transforms to
   */
  @Override
  protected void setupPipeline(Pipeline pipeline) {
    // Load the reads.
    final ReadsDataflowSource readsSource = new ReadsDataflowSource(bam, pipeline);
    final SAMFileHeader readsHeader = readsSource.getHeader();

    // Use the requested intervals if any were given, otherwise every interval of the reference.
    final List<SimpleInterval> intervals;
    if (intervalArgumentCollection.intervalsSpecified()) {
      intervals = intervalArgumentCollection.getIntervals(readsHeader.getSequenceDictionary());
    } else {
      intervals = IntervalUtils.getAllIntervalsForReference(readsHeader.getSequenceDictionary());
    }

    final PCollectionView<SAMFileHeader> headerView =
        ReadsDataflowSource.getHeaderView(pipeline, readsHeader);
    final PCollection<GATKRead> rawReads = readsSource.getReadPCollection(intervals);

    // Apply MarkDuplicates to produce updated GATKReads.
    final PCollection<GATKRead> dedupedReads = rawReads.apply(new MarkDuplicates(headerView));

    // Load the Variants and the Reference and join them to reads.
    final VariantsDataflowSource variantsSource =
        new VariantsDataflowSource(baseRecalibrationKnownVariants, pipeline);
    final Map<String, String> referenceNameToIdTable =
        RefAPISource.buildReferenceNameToIdTable(pipeline.getOptions(), referenceName);
    final RefAPIMetadata referenceMetadata =
        new RefAPIMetadata(referenceName, referenceNameToIdTable);
    final PCollection<KV<GATKRead, ReadContextData>> readsWithContext =
        AddContextDataToRead.add(dedupedReads, referenceMetadata, variantsSource);

    // Apply BQSR: build the tables, expose them as a singleton view, then apply them to the
    // duplicate-marked reads.
    final PCollection<RecalibrationTables> recalibrationTables =
        readsWithContext.apply(new BaseRecalibratorStub(headerView));
    final PCollectionView<RecalibrationTables> recalibrationView =
        recalibrationTables.apply(View.<RecalibrationTables>asSingleton());
    final PCollection<GATKRead> recalibratedReads =
        dedupedReads.apply(new ApplyBQSRStub(headerView, recalibrationView));

    SmallBamWriter.writeToFile(pipeline, recalibratedReads, readsHeader, output);
  }
Exemplo n.º 15
0
  /**
   * Appends every entry of {@code COMMENT} to the header of {@code INPUT} and writes the
   * re-headered BAM to {@code OUTPUT}.
   *
   * @return always {@code null}
   * @throws UserException if the input path ends with ".sam" or a comment contains a newline
   */
  @Override
  protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);

    final boolean looksLikeSam = INPUT.getAbsolutePath().endsWith(".sam");
    if (looksLikeSam) {
      throw new UserException("SAM files are not supported");
    }

    final SAMFileHeader updatedHeader =
        SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).getFileHeader(INPUT);

    // Each comment is validated immediately before it is added to the header.
    for (final String headerComment : COMMENT) {
      if (headerComment.contains("\n")) {
        throw new UserException("Comments can not contain a new line");
      }
      updatedHeader.addComment(headerComment);
    }

    BamFileIoUtils.reheaderBamFile(updatedHeader, INPUT, OUTPUT, CREATE_MD5_FILE, CREATE_INDEX);
    return null;
  }
Exemplo n.º 16
0
  /**
   * Create a common SAMFileWriter from a factory for use with GATK tools. Assumes that if the
   * factory has been set to create an index, the header must be set to coordinate sorted.
   *
   * <p>The writer receives a clone of {@code header}, not the instance passed in.
   *
   * @param factory SAMFileWriterFactory factory to use
   * @param outputFile if this file has a .cram extension then a reference is required. Can not be
   *     null.
   * @param referenceFile the reference source to use. Can not be null if a output file has a .cram
   *     extension.
   * @param header header to be used for the output writer. Can not be null.
   * @param preSorted if true then records must already be sorted to match the header sort order
   * @return SAMFileWriter
   * @throws UserException if the output is a CRAM file and no reference file was given
   */
  public static SAMFileWriter createCommonSAMWriterFromFactory(
      final SAMFileWriterFactory factory,
      final File outputFile,
      final File referenceFile,
      final SAMFileHeader header,
      final boolean preSorted) {
    Utils.nonNull(outputFile);
    Utils.nonNull(header);

    // CRAM output cannot be written without a reference.
    final boolean cramRequested = outputFile.getName().endsWith(CramIO.CRAM_FILE_EXTENSION);
    if (cramRequested && referenceFile == null) {
      throw new UserException("A reference file is required for writing CRAM files");
    }

    return factory.makeWriter(header.clone(), preSorted, outputFile, referenceFile);
  }
  /**
   * Adds the reportable left and right events still held by {@code sum} to its intervals, then
   * writes all intervals to {@code out}: as an {@code IntervalList} when the file extension is
   * "interval_list", otherwise as one {@code GenomeLoc} string per line.
   *
   * @param sum the accumulated traversal result
   * @throws GATKException if the plain-text output cannot be written
   */
  public void onTraversalDone(EventPair sum) {
    if (sum.left != null && sum.left.isReportableEvent()) {
      sum.intervals.add(sum.left.getLoc());
    }
    if (sum.right != null && sum.right.isReportableEvent()) {
      sum.intervals.add(sum.right.getLoc());
    }

    final boolean asIntervalList =
        FilenameUtils.getExtension(out.getName()).equals("interval_list");

    if (!asIntervalList) {
      // Plain text: one location per line.
      try (BufferedWriter writer = IOUtil.openFileForBufferedWriting(out)) {
        for (GenomeLoc location : sum.intervals) {
          writer.write(location.toString());
          writer.newLine();
        }
      } catch (final IOException e) {
        throw new GATKException("Error writing out intervals to file: " + out.getAbsolutePath(), e);
      }
      return;
    }

    // The interval list needs a header, which carries the toolkit's master sequence dictionary.
    final SAMFileHeader dictionaryHeader = new SAMFileHeader();
    dictionaryHeader.setSequenceDictionary(getToolkit().getMasterSequenceDictionary());
    final IntervalList intervalList = new IntervalList(dictionaryHeader);
    for (GenomeLoc location : sum.intervals) {
      intervalList.add(
          new Interval(location.getContig(), location.getStart(), location.getStop()));
    }
    intervalList.write(out);
  }
Exemplo n.º 18
0
  /**
   * Two pulldowns are equal when the superclass considers them equal and their headers are
   * equal.
   *
   * @param o the object to compare against
   * @return {@code true} if {@code o} is a {@code Pulldown} equal to this one
   */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof Pulldown) || !super.equals(o)) {
      return false;
    }
    return header.equals(((Pulldown) o).header);
  }
  /////////////////////////////////////////////////////////////////////////////
  // Used to generate the Sam Record Sets with SamRecordSetBuilder.addPair().
  // testNumber 1: runGcBiasMultiLevelTest, generates records aligning to chrM and chrO
  // testNumber 2: runWindowsComparisonTest, generates records aligning to chrM,N,O.
  //
  // This method configures the read group, registers it on the header and the builder, and
  // adds NUM_READS pairs: all but the last on contig index 0, the last on contig index 1.
  // Start positions come from a fixed-seed Random, so they are the same on every run.
  /////////////////////////////////////////////////////////////////////////////
  public void setupTest1(
      final int ID,
      final String readGroupId,
      final SAMReadGroupRecord readGroupRecord,
      final String sample,
      final String library,
      final SAMFileHeader header,
      final SAMRecordSetBuilder setBuilder)
      throws IOException {

    // Describe the read group, then attach it to both the header and the builder.
    readGroupRecord.setSample(sample);
    readGroupRecord.setPlatform(platform);
    readGroupRecord.setLibrary(library);
    readGroupRecord.setPlatformUnit(readGroupId);
    header.addReadGroup(readGroupRecord);
    setBuilder.setReadGroup(readGroupRecord);
    setBuilder.setUseNmFlag(true);

    setBuilder.setHeader(header);

    final int firstContig = 0;
    final int secondContig = 1;
    final int startRange = 800;
    final int startFloor = 1;
    final Random startGenerator = new Random(5);
    final int lastReadIndex = NUM_READS - 1;

    // add records that align to chrM and O but not N
    for (int i = 0; i < NUM_READS; i++) {
      final int start = startGenerator.nextInt(startRange) + startFloor;
      final String pairName = READ_NAME + ":" + ID + ":" + i;
      final int contig = (i == lastReadIndex) ? secondContig : firstContig;
      setBuilder.addPair(pairName, contig, start + ID, start + ID + LENGTH);
    }
  }
Exemplo n.º 20
0
  /**
   * Create a common SAMFileWriter for use with GATK tools.
   *
   * <p>If an index is requested but the header's sort order is not coordinate, a warning is
   * logged and the writer is created without an index.
   *
   * @param outputFile - if this file has a .cram extension then a reference is required. Can not be
   *     null.
   * @param referenceFile - the reference source to use. Can not be null if a output file has a
   *     .cram extension.
   * @param header - header to be used for the output writer. Can not be null.
   * @param preSorted - if true then the records must already be sorted to match the header sort
   *     order
   * @param createOutputBamIndex - if true an index will be created for .BAM and .CRAM files
   * @param createMD5 - if true an MD5 file will be created
   * @return SAMFileWriter
   */
  public static SAMFileWriter createCommonSAMWriter(
      final File outputFile,
      final File referenceFile,
      final SAMFileHeader header,
      final boolean preSorted,
      boolean createOutputBamIndex,
      final boolean createMD5) {
    Utils.nonNull(outputFile);
    Utils.nonNull(header);

    // An index was asked for, but the header is not coordinate sorted.
    final boolean indexNotPossible =
        createOutputBamIndex && header.getSortOrder() != SAMFileHeader.SortOrder.coordinate;
    if (indexNotPossible) {
      logger.warn(
          "Skipping index file creation for: "
              + outputFile.getAbsolutePath()
              + ". Index file creation requires reads in coordinate sorted order.");
    }
    final boolean createIndex = createOutputBamIndex && !indexNotPossible;

    final SAMFileWriterFactory writerFactory = new SAMFileWriterFactory();
    writerFactory.setCreateIndex(createIndex).setCreateMd5File(createMD5);
    return ReadUtils.createCommonSAMWriterFromFactory(
        writerFactory, outputFile, referenceFile, header, preSorted);
  }
Exemplo n.º 21
0
 /**
  * Looks up the header's {@link SAMReadGroupRecord} for the read group named by the read.
  *
  * @param read read whose read group to retrieve
  * @param header SAM header containing read groups
  * @return {@code null} if the read has no read group; otherwise the result of looking that
  *     read group name up in the header
  */
 public static SAMReadGroupRecord getSAMReadGroupRecord(
     final GATKRead read, final SAMFileHeader header) {
   final String readGroupName = read.getReadGroup();
   if (readGroupName == null) {
     return null;
   }
   return header.getReadGroup(readGroupName);
 }
Exemplo n.º 22
0
 /**
  * Collects one ID per read group declared in the header.
  *
  * @param header SAM header whose read groups are listed
  * @return a list holding the result of {@code getID} for each read group, in the order the
  *     header returns them; empty if the header has no read groups
  */
 public static List<String> getReadGroupIDs(final SAMFileHeader header) {
   return header.getReadGroups().stream().map(rg -> getID(rg)).collect(Collectors.toList());
 }
Exemplo n.º 23
0
  /**
   * Combines multiple SAM/BAM files into one.
   *
   * <p>Opens every file in {@code INPUT}, merges their headers, and writes all records to {@code
   * OUTPUT} with the sort order given by {@code SORT_ORDER}. When {@code INTERVALS} is set, only
   * records from per-reader interval iterators are merged, and every input must be indexed.
   *
   * @return always 0
   * @throws PicardException if {@code INTERVALS} is set and an input file has no index
   */
  @Override
  protected int doWork() {
    // Stays true only while every input's sort order equals the requested output sort order.
    boolean matchedSortOrders = true;

    // read interval list if it is defined
    final List<Interval> intervalList =
        (INTERVALS == null ? null : IntervalList.fromFile(INTERVALS).uniqued().getIntervals());
    // map reader->iterator used if INTERVALS is defined
    final Map<SamReader, CloseableIterator<SAMRecord>> samReaderToIterator =
        new HashMap<SamReader, CloseableIterator<SAMRecord>>(INPUT.size());

    // Open the files for reading and writing
    final List<SamReader> readers = new ArrayList<SamReader>();
    final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();
    {
      SAMSequenceDictionary dict = null; // Used to try and reduce redundant SDs in memory

      for (final File inFile : INPUT) {
        IOUtil.assertFileIsReadable(inFile);
        final SamReader in =
            SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(inFile);
        if (INTERVALS != null) {
          // Interval-restricted iteration is only set up for indexed inputs.
          if (!in.hasIndex())
            throw new PicardException(
                "Merging with interval but Bam file is not indexed " + inFile);
          final CloseableIterator<SAMRecord> samIterator =
              new SamRecordIntervalIteratorFactory()
                  .makeSamRecordIntervalIterator(in, intervalList, true);
          samReaderToIterator.put(in, samIterator);
        }

        readers.add(in);
        headers.add(in.getFileHeader());

        // A slightly hackish attempt to keep memory consumption down when merging multiple files
        // with large sequence dictionaries (10,000s of sequences). If the dictionaries are
        // identical, then replace the duplicate copies with a single dictionary to reduce the
        // memory footprint.
        if (dict == null) {
          dict = in.getFileHeader().getSequenceDictionary();
        } else if (dict.equals(in.getFileHeader().getSequenceDictionary())) {
          in.getFileHeader().setSequenceDictionary(dict);
        }

        matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER;
      }
    }

    // If all the input sort orders match the output sort order then just merge them and
    // write on the fly, otherwise setup to merge and sort before writing out the final file
    IOUtil.assertFileIsWritable(OUTPUT);
    final boolean presorted;
    final SAMFileHeader.SortOrder headerMergerSortOrder;
    final boolean mergingSamRecordIteratorAssumeSorted;

    // The on-the-fly path is also taken for unsorted output, when ASSUME_SORTED is set, and
    // whenever INTERVALS is given.
    if (matchedSortOrders
        || SORT_ORDER == SAMFileHeader.SortOrder.unsorted
        || ASSUME_SORTED
        || INTERVALS != null) {
      log.info(
          "Input files are in same order as output so sorting to temp directory is not needed.");
      headerMergerSortOrder = SORT_ORDER;
      mergingSamRecordIteratorAssumeSorted = ASSUME_SORTED;
      presorted = true;
    } else {
      log.info("Sorting input files using temp directory " + TMP_DIR);
      headerMergerSortOrder = SAMFileHeader.SortOrder.unsorted;
      mergingSamRecordIteratorAssumeSorted = false;
      presorted = false;
    }
    final SamFileHeaderMerger headerMerger =
        new SamFileHeaderMerger(headerMergerSortOrder, headers, MERGE_SEQUENCE_DICTIONARIES);
    final MergingSamRecordIterator iterator;
    // no interval defined, get an iterator for the whole bam
    if (intervalList == null) {
      iterator =
          new MergingSamRecordIterator(headerMerger, readers, mergingSamRecordIteratorAssumeSorted);
    } else {
      // show warning related to https://github.com/broadinstitute/picard/pull/314/files
      // NOTE(review): the message says "Warning" but is logged at info level.
      log.info(
          "Warning: merged bams from different interval lists may contain the same read in both files");
      iterator = new MergingSamRecordIterator(headerMerger, samReaderToIterator, true);
    }
    // The merged header gets the user-supplied comments and the requested output sort order.
    final SAMFileHeader header = headerMerger.getMergedHeader();
    for (final String comment : COMMENT) {
      header.addComment(comment);
    }
    header.setSortOrder(SORT_ORDER);
    final SAMFileWriterFactory samFileWriterFactory = new SAMFileWriterFactory();
    if (USE_THREADING) {
      samFileWriterFactory.setUseAsyncIo(true);
    }
    final SAMFileWriter out = samFileWriterFactory.makeSAMOrBAMWriter(header, presorted, OUTPUT);

    // Lastly loop through and write out the records
    final ProgressLogger progress = new ProgressLogger(log, PROGRESS_INTERVAL);
    while (iterator.hasNext()) {
      final SAMRecord record = iterator.next();
      out.addAlignment(record);
      progress.record(record);
    }

    log.info("Finished reading inputs.");
    // NOTE(review): iterators, readers and the writer are closed only on this normal path; an
    // exception thrown earlier leaves them open, as there is no try/finally.
    for (final CloseableIterator<SAMRecord> iter : samReaderToIterator.values())
      CloserUtil.close(iter);
    CloserUtil.close(readers);
    out.close();
    return 0;
  }
Exemplo n.º 24
0
 /**
  * Fixture setup run under {@code @BeforeClass}: creates an artificial SAM header, builds a
  * {@link GenomeLocParser} from that header's sequence dictionary, and asks the parser for the
  * location {@code "chr1"} at position 1, storing all three in the test's fields.
  */
 @BeforeClass
 public void beforeClass() {
   // NOTE(review): the three ints are presumably (contig count, first contig index, contig
   // length) — confirm against ArtificialSAMUtils.createArtificialSamHeader.
   header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
   // The parser is built from the artificial header's dictionary, so it must be created after
   // the header and before any location is requested from it.
   genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
   // NOTE(review): assumes the artificial header names its contig "chr1" — confirm.
   loc = genomeLocParser.createGenomeLoc("chr1", 1);
 }
Exemplo n.º 25
0
 /**
  * Extends the superclass hash with this object's {@code header}: the inherited hash is scaled
  * by 31 and the header's hash is added.
  */
 @Override
 public int hashCode() {
   return 31 * super.hashCode() + header.hashCode();
 }
Exemplo n.º 26
0
 /**
  * Fixture setup run under {@code @BeforeClass}: creates an artificial SAM header and builds a
  * {@link GenomeLocParser} from that header's sequence dictionary, storing both in the test's
  * fields.
  */
 @BeforeClass
 public void init() {
   // NOTE(review): the three ints are presumably (contig count, first contig index, contig
   // length) — confirm against ArtificialReadUtils.createArtificialSamHeader.
   header = ArtificialReadUtils.createArtificialSamHeader(1, 1, 1000000);
   // The parser depends on the header's dictionary, so the header must be created first.
   genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
 }
 /**
  * Builds a new {@link SAMFileHeader} and adds one sequence record, created as
  * {@code new SAMSequenceRecord("chr20", 62435964)}, to the header's own sequence dictionary.
  *
  * @return the freshly created header
  */
 private SAMFileHeader makeHeader() {
   final SAMFileHeader result = new SAMFileHeader();
   // The record is added to the dictionary instance returned by the header itself.
   result.getSequenceDictionary().addSequence(new SAMSequenceRecord("chr20", 62435964));
   return result;
 }
Exemplo n.º 28
0
  /**
   * Test that PG header records are created & chained appropriately (or not created), and that the
   * PG record chains are as expected. MarkDuplicates is used both to merge and to mark dupes in
   * this case.
   *
   * <p>The output reader is closed in a {@code finally} block so that a failing assertion while
   * reading does not leave the output file open when the temporary directory is deleted.
   *
   * @param suppressPg If true, do not create PG header record.
   * @param expectedPnVnByReadName For each read, info about the expected chain of PG records.
   */
  @Test(dataProvider = "pgRecordChainingTest")
  public void pgRecordChainingTest(
      final boolean suppressPg, final Map<String, List<ExpectedPnAndVn>> expectedPnVnByReadName) {
    final File outputDir = IOUtil.createTempDir(TEST_BASE_NAME + ".", ".tmp");
    outputDir.deleteOnExit();
    try {
      // Run MarkDuplicates, merging the 3 input files, and either enabling or suppressing PG header
      // record creation according to suppressPg.
      final MarkDuplicates markDuplicates = new MarkDuplicates();
      final List<String> args = new ArrayList<String>();
      for (int i = 1; i <= 3; ++i) {
        args.add("INPUT=" + new File(TEST_DATA_DIR, "merge" + i + ".sam").getAbsolutePath());
      }
      final File outputSam = new File(outputDir, TEST_BASE_NAME + ".sam");
      args.add("OUTPUT=" + outputSam.getAbsolutePath());
      args.add(
          "METRICS_FILE="
              + new File(outputDir, TEST_BASE_NAME + ".duplicate_metrics").getAbsolutePath());
      if (suppressPg) {
        args.add("PROGRAM_RECORD_ID=null");
      }

      // I generally prefer to call doWork rather than invoking the argument parser, but it is
      // necessary in this case to initialize the command line.
      // Note that for the unit test, version won't come through because it is obtained through jar
      // manifest, and unit test doesn't run code from a jar.
      Assert.assertEquals(markDuplicates.instanceMain(args.toArray(new String[args.size()])), 0);

      // Read the MarkDuplicates output file, and get the PG ID for each read.  In this particular
      // test, the PG ID should be the same for both ends of a pair.
      final Map<String, String> pgIdForReadName = new HashMap<String, String>();
      final SAMFileHeader header;
      final SamReader reader = SamReaderFactory.makeDefault().open(outputSam);
      try {
        for (final SAMRecord rec : reader) {
          final String existingPgId = pgIdForReadName.get(rec.getReadName());
          final String thisPgId = rec.getStringAttribute(SAMTag.PG.name());
          if (existingPgId != null) {
            Assert.assertEquals(thisPgId, existingPgId);
          } else {
            pgIdForReadName.put(rec.getReadName(), thisPgId);
          }
        }
        header = reader.getFileHeader();
      } finally {
        // Close even when an assertion above fails, so the file is not held open during cleanup.
        CloserUtil.close(reader);
      }

      // Confirm that for each read name, the chain of PG records contains exactly the number that
      // is expected, and that values in the PG chain are as expected.
      for (final Map.Entry<String, List<ExpectedPnAndVn>> entry :
          expectedPnVnByReadName.entrySet()) {
        final String readName = entry.getKey();
        final List<ExpectedPnAndVn> expectedList = entry.getValue();
        String pgId = pgIdForReadName.get(readName);
        for (final ExpectedPnAndVn expected : expectedList) {
          final SAMProgramRecord programRecord = header.getProgramRecord(pgId);
          // A chain shorter than expected (or a dangling PG ID) should fail with a clear message
          // rather than a NullPointerException on the dereferences below.
          Assert.assertNotNull(
              programRecord, "No PG record with ID " + pgId + " in chain for read " + readName);
          if (expected.expectedPn != null) {
            Assert.assertEquals(programRecord.getProgramName(), expected.expectedPn);
          }
          if (expected.expectedVn != null) {
            Assert.assertEquals(programRecord.getProgramVersion(), expected.expectedVn);
          }
          pgId = programRecord.getPreviousProgramGroupId();
        }
        // The chain must end exactly where the expectations end.
        Assert.assertNull(pgId);
      }

    } finally {
      TestUtil.recursiveDelete(outputDir);
    }
  }
Exemplo n.º 29
0
  /**
   * Writes the records of one SAM/BAM input in a randomized order.
   *
   * <p>Each input record is paired with a random {@code int} and pushed through a
   * {@link SortingCollection}; the records are then written in the collection's iteration order.
   * Input comes from stdin when no positional argument follows the options, or from the single
   * file named after the options.
   *
   * <p>Options handled here (others are delegated to {@code handleOtherOptions}):
   *
   * <ul>
   *   <li>{@code -b}: write BAM instead of SAM when writing to stdout
   *   <li>{@code -N <int>}: maximum number of records the sorting collection keeps in RAM
   *   <li>{@code -n <long>}: maximum number of records to write; {@code -1} (default) means all
   *   <li>{@code -o <file>}: output file; stdout is used when absent
   *   <li>{@code -T <dir>}: additional temporary directory
   * </ul>
   *
   * @param args command-line arguments
   * @return 0 on success, -1 on bad arguments or on any exception while processing
   */
  @Override
  public int doWork(String[] args) {
    boolean compressed = false;
    int maxRecordsInRAM = 100000;
    long count = -1L;
    File fileout = null;
    com.github.lindenb.jvarkit.util.cli.GetOpt opt =
        new com.github.lindenb.jvarkit.util.cli.GetOpt();
    int c;
    while ((c = opt.getopt(args, getGetOptDefault() + "o:n:N:T:b")) != -1) {
      switch (c) {
        case 'b':
          compressed = true;
          break;
        case 'N':
          // Report a malformed number as a usage error instead of letting the exception escape.
          try {
            maxRecordsInRAM = Integer.parseInt(opt.getOptArg());
          } catch (NumberFormatException e) {
            error("Bad number of records in RAM:" + opt.getOptArg());
            return -1;
          }
          break;
        case 'n':
          try {
            count = Long.parseLong(opt.getOptArg());
          } catch (NumberFormatException e) {
            error("Bad count:" + opt.getOptArg());
            return -1;
          }
          break;
        case 'o':
          fileout = new File(opt.getOptArg());
          break;
        case 'T':
          this.addTmpDirectory(new File(opt.getOptArg()));
          break;
        default:
          {
            switch (handleOtherOptions(c, opt, null)) {
              case EXIT_FAILURE:
                return -1;
              case EXIT_SUCCESS:
                return 0;
              default:
                break;
            }
          }
      }
    }
    if (count < -1L) // -1 == infinite
    {
      error("Bad count:" + count);
      return -1;
    }
    SamReader samReader = null;
    SAMRecordIterator iter = null;
    SAMFileWriter samWriter = null;
    Random random = new Random();
    CloseableIterator<RandSamRecord> iter2 = null;
    // Declared outside the try so its temporary files can be removed in the finally block.
    SortingCollection<RandSamRecord> sorter = null;
    try {
      SamFileReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT);
      if (opt.getOptInd() == args.length) {
        info("Reading from stdin");
        samReader = SamFileReaderFactory.mewInstance().openStdin();
      } else if (opt.getOptInd() + 1 == args.length) {
        File filename = new File(args[opt.getOptInd()]);
        info("Reading from " + filename);
        samReader = SamFileReaderFactory.mewInstance().open(filename);
      } else {
        error("Illegal number of arguments.");
        return -1;
      }
      SAMFileHeader header = samReader.getFileHeader();

      // Work on a copy so the reader's own header is not modified.
      header = header.clone();
      header.setSortOrder(SortOrder.unsorted);
      header.addComment("Processed with " + getProgramName() + " : " + getProgramCommandLine());
      SAMFileWriterFactory sfw = new SAMFileWriterFactory();
      sfw.setCreateIndex(false);
      sfw.setCreateMd5File(false);
      // The writers are created with presorted=true so records are emitted in the order added.
      if (fileout == null) {
        if (compressed) {
          samWriter = sfw.makeBAMWriter(header, true, System.out);
        } else {
          samWriter = sfw.makeSAMWriter(header, true, System.out);
        }
      } else {
        samWriter = sfw.makeSAMOrBAMWriter(header, true, fileout);
        // NOTE(review): this registers the output *file* as a temporary directory; the parent
        // directory may have been intended — confirm against addTmpDirectory.
        this.addTmpDirectory(fileout);
      }
      iter = samReader.iterator();
      SAMSequenceDictionaryProgress progress =
          new SAMSequenceDictionaryProgress(samReader.getFileHeader().getSequenceDictionary());

      // NOTE(review): RandSamRecordComparator presumably orders by rand_index — confirm.
      sorter =
          SortingCollection.newInstance(
              RandSamRecord.class,
              new RandSamRecordCodec(header),
              new RandSamRecordComparator(),
              maxRecordsInRAM,
              getTmpDirectories());
      sorter.setDestructiveIteration(true);
      while (iter.hasNext()) {
        RandSamRecord r = new RandSamRecord();
        r.rand_index = random.nextInt();
        r.samRecord = progress.watch(iter.next());

        sorter.add(r);
      }
      iter.close();
      iter = null;

      sorter.doneAdding();
      iter2 = sorter.iterator();
      if (count == -1) {
        while (iter2.hasNext()) {
          samWriter.addAlignment(iter2.next().samRecord);
        }
      } else {
        // Stop after 'count' records even if more remain in the collection.
        while (iter2.hasNext() && count > 0) {
          samWriter.addAlignment(iter2.next().samRecord);
          count--;
        }
      }
      iter2.close();
      iter2 = null;
      progress.finish();
    } catch (Exception e) {
      error(e);
      return -1;
    } finally {
      CloserUtil.close(iter);
      CloserUtil.close(iter2);
      // Remove the collection's temporary files on both the success and the failure path.
      if (sorter != null) {
        sorter.cleanup();
      }
      CloserUtil.close(samReader);
      CloserUtil.close(samWriter);
    }
    return 0;
  }
Exemplo n.º 30
0
 /**
  * HACK: Produces a copy of the given header by delegating to {@link SAMFileHeader#clone()}.
  * Really, SAMFileHeader should provide a copy constructor or a factory method.
  *
  * @param header the header to copy; may be {@code null}
  * @return {@code null} when {@code header} is {@code null}, otherwise the result of
  *     {@code header.clone()}
  */
 public static SAMFileHeader cloneSAMFileHeader(final SAMFileHeader header) {
   return (header == null) ? null : header.clone();
 }