Example #1
  @Override
  protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);

    // Setup all the inputs
    final ProgressLogger progress = new ProgressLogger(log, 10000000, "Processed", "loci");
    final ReferenceSequenceFileWalker refWalker =
        new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final SamReader in =
        SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
    final SamLocusIterator iterator = getLocusIterator(in);

    final List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
    final CountingFilter dupeFilter = new CountingDuplicateFilter();
    final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY);
    final CountingPairedFilter pairFilter = new CountingPairedFilter();
    filters.add(mapqFilter);
    filters.add(dupeFilter);
    if (!COUNT_UNPAIRED) {
      filters.add(pairFilter);
    }
    // Not a counting filter because we never want to count reads twice
    filters.add(new SecondaryAlignmentFilter());
    iterator.setSamFilters(filters);
    iterator.setEmitUncoveredLoci(true);
    iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases
    iterator.setQualityScoreCutoff(0); // Handled separately because we want to count bases
    iterator.setIncludeNonPfReads(false);

    final int max = COVERAGE_CAP;
    final long[] HistogramArray = new long[max + 1];
    final long[] baseQHistogramArray = new long[Byte.MAX_VALUE];
    final boolean usingStopAfter = STOP_AFTER > 0;
    final long stopAfter = STOP_AFTER - 1;
    long counter = 0;

    long basesExcludedByBaseq = 0;
    long basesExcludedByOverlap = 0;
    long basesExcludedByCapping = 0;

    // Loop through all the loci
    while (iterator.hasNext()) {
      final SamLocusIterator.LocusInfo info = iterator.next();

      // Check that the reference is not N
      final ReferenceSequence ref = refWalker.get(info.getSequenceIndex());
      final byte base = ref.getBases()[info.getPosition() - 1];
      if (base == 'N') continue;

      // Figure out the coverage while not counting overlapping reads twice and excluding
      // low-quality bases
      final HashSet<String> readNames = new HashSet<String>(info.getRecordAndPositions().size());
      int pileupSize = 0;
      for (final SamLocusIterator.RecordAndOffset recs : info.getRecordAndPositions()) {

        if (recs.getBaseQuality() < MINIMUM_BASE_QUALITY) {
          ++basesExcludedByBaseq;
          continue;
        }
        if (!readNames.add(recs.getRecord().getReadName())) {
          ++basesExcludedByOverlap;
          continue;
        }
        pileupSize++;
        if (pileupSize <= max) {
          baseQHistogramArray[recs.getRecord().getBaseQualities()[recs.getOffset()]]++;
        }
      }

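      // Cap the reported depth at COVERAGE_CAP and count any overflow as bases excluded by capping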
      final int depth = Math.min(readNames.size(), max);
      if (depth < readNames.size()) basesExcludedByCapping += readNames.size() - max;
      HistogramArray[depth]++;

      // Record progress and perhaps stop
      progress.record(info.getSequenceName(), info.getPosition());
      if (usingStopAfter && ++counter > stopAfter) break;
    }

    // Construct and write the outputs
    final Histogram<Integer> histo = new Histogram<Integer>("coverage", "count");
    for (int i = 0; i < HistogramArray.length; ++i) {
      histo.increment(i, HistogramArray[i]);
    }

    // Construct the base quality histogram
    final Histogram<Integer> baseQHisto = new Histogram<Integer>("value", "baseq_count");
    for (int i = 0; i < baseQHistogramArray.length; ++i) {
      baseQHisto.increment(i, baseQHistogramArray[i]);
    }

    final WgsMetrics metrics = generateWgsMetrics();
    metrics.GENOME_TERRITORY = (long) histo.getSumOfValues();
    metrics.MEAN_COVERAGE = histo.getMean();
    metrics.SD_COVERAGE = histo.getStandardDeviation();
    metrics.MEDIAN_COVERAGE = histo.getMedian();
    metrics.MAD_COVERAGE = histo.getMedianAbsoluteDeviation();

    final long basesExcludedByDupes = getBasesExcludedBy(dupeFilter);
    final long basesExcludedByMapq = getBasesExcludedBy(mapqFilter);
    final long basesExcludedByPairing = getBasesExcludedBy(pairFilter);
    final double total = histo.getSum();
    final double totalWithExcludes =
        total
            + basesExcludedByDupes
            + basesExcludedByMapq
            + basesExcludedByPairing
            + basesExcludedByBaseq
            + basesExcludedByOverlap
            + basesExcludedByCapping;
    metrics.PCT_EXC_DUPE = basesExcludedByDupes / totalWithExcludes;
    metrics.PCT_EXC_MAPQ = basesExcludedByMapq / totalWithExcludes;
    metrics.PCT_EXC_UNPAIRED = basesExcludedByPairing / totalWithExcludes;
    metrics.PCT_EXC_BASEQ = basesExcludedByBaseq / totalWithExcludes;
    metrics.PCT_EXC_OVERLAP = basesExcludedByOverlap / totalWithExcludes;
    metrics.PCT_EXC_CAPPED = basesExcludedByCapping / totalWithExcludes;
    metrics.PCT_EXC_TOTAL = (totalWithExcludes - total) / totalWithExcludes;

    metrics.PCT_1X =
        MathUtil.sum(HistogramArray, 1, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_5X =
        MathUtil.sum(HistogramArray, 5, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_10X =
        MathUtil.sum(HistogramArray, 10, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_15X =
        MathUtil.sum(HistogramArray, 15, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_20X =
        MathUtil.sum(HistogramArray, 20, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_25X =
        MathUtil.sum(HistogramArray, 25, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_30X =
        MathUtil.sum(HistogramArray, 30, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_40X =
        MathUtil.sum(HistogramArray, 40, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_50X =
        MathUtil.sum(HistogramArray, 50, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_60X =
        MathUtil.sum(HistogramArray, 60, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_70X =
        MathUtil.sum(HistogramArray, 70, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_80X =
        MathUtil.sum(HistogramArray, 80, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_90X =
        MathUtil.sum(HistogramArray, 90, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_100X =
        MathUtil.sum(HistogramArray, 100, HistogramArray.length)
            / (double) metrics.GENOME_TERRITORY;

    final MetricsFile<WgsMetrics, Integer> out = getMetricsFile();
    out.addMetric(metrics);
    out.addHistogram(histo);
    if (INCLUDE_BQ_HISTOGRAM) {
      out.addHistogram(baseQHisto);
    }
    out.write(OUTPUT);

    return 0;
  }
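
The PCT_1X through PCT_100X block repeats one computation per depth threshold: the number of loci whose capped depth is at least N, divided by the total number of loci (GENOME_TERRITORY). Below is a minimal standalone sketch of that calculation on a plain long[] histogram, assuming MathUtil.sum adds the entries from the start index to the end of the array; the helper name is mine, not Picard's.

// Sketch only: fraction of loci covered at >= minDepth, where histogram[d]
// holds the number of loci observed at depth d (as in HistogramArray above).
// Mirrors the MathUtil.sum(HistogramArray, N, length) / GENOME_TERRITORY pattern.
static double fractionAtLeast(final long[] histogram, final int minDepth) {
  long total = 0;
  long atLeast = 0;
  for (int depth = 0; depth < histogram.length; depth++) {
    total += histogram[depth];
    if (depth >= minDepth) atLeast += histogram[depth];
  }
  return total == 0 ? 0.0 : atLeast / (double) total;
}

Example #2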
  @Override
  public void accumulate(final VariantContext ctx) {
    logger.record(ctx.getContig(), ctx.getStart());

    final String variantChrom = ctx.getContig();
    final int variantPos = ctx.getStart();

    // Skip anything a little too funky
    if (ctx.isFiltered()) return;
    if (!ctx.isVariant()) return;
    if (SKIP_CHROMS.contains(variantChrom)) return;

    for (final MendelianViolationMetrics trio : trios) {
      final Genotype momGt = ctx.getGenotype(trio.MOTHER);
      final Genotype dadGt = ctx.getGenotype(trio.FATHER);
      final Genotype kidGt = ctx.getGenotype(trio.OFFSPRING);

      // if any genotype:
      // - has a non-snp allele; or
      // - lacks a reference allele
      //
      // then ignore this trio
      if (CollectionUtil.makeList(momGt, dadGt, kidGt)
          .stream()
          .anyMatch(
              gt ->
                  gt.isHetNonRef()
                      || Stream.concat(Stream.of(ctx.getReference()), gt.getAlleles().stream())
                          .anyMatch(a -> a.length() != 1 || a.isSymbolic()))) {
        continue;
      }

      // if between the trio there are more than 2 alleles including the reference, continue
      if (Stream.concat(
                  Collections.singleton(ctx.getReference()).stream(),
                  CollectionUtil.makeList(momGt, dadGt, kidGt)
                      .stream()
                      .flatMap(gt -> gt.getAlleles().stream()))
              .collect(Collectors.toSet())
              .size()
          > 2) continue;

      // Test to make sure:
      //   1) That the site is in fact variant in the trio
      //   2) that the offspring doesn't have a really wacky het allele balance
      if (!isVariant(momGt, dadGt, kidGt)) continue;
      if (kidGt.isHet()) {
        final int[] ad = kidGt.getAD();
        if (ad == null) continue;

        final List<Integer> adOfAlleles =
            kidGt
                .getAlleles()
                .stream()
                .map(a -> ad[ctx.getAlleleIndex(a)])
                .collect(Collectors.toList());
        final double minAlleleFraction =
            Math.min(adOfAlleles.get(0), adOfAlleles.get(1))
                / (double) (adOfAlleles.get(0) + adOfAlleles.get(1));
        if (minAlleleFraction < MIN_HET_FRACTION) continue;
      }

      ///////////////////////////////////////////////////////////////
      // Determine whether the offspring should be haploid at this
      // locus and which is the parental donor of the haploid genotype
      ///////////////////////////////////////////////////////////////
      boolean haploid = false;
      Genotype haploidParentalGenotype = null;

      if (FEMALE_CHROMS.contains(variantChrom) && trio.OFFSPRING_SEX != Sex.Unknown) {
        if (trio.OFFSPRING_SEX == Sex.Female) {
          // female, so diploid
          haploid = false;
        } else if (isInPseudoAutosomalRegion(variantChrom, variantPos)) {
          // male but in PAR on X, so diploid
          haploid = false;
        } else {
          // male, out of PAR on X, haploid
          haploid = true;
          haploidParentalGenotype = momGt;
        }
      }

      // The PAR on the male chromosome should be masked so that reads align to the female
      // chromosomes instead, so there's no point in worrying about that here.

      if (MALE_CHROMS.contains(variantChrom)) {
        if (trio.OFFSPRING_SEX == Sex.Male) {
          haploid = true;
          haploidParentalGenotype = dadGt;
        } else {
          continue;
        }
      }

      // We only want to look at sites where we have high enough confidence that the genotypes we
      // are looking at are interesting. We want to ensure that parents are always GQ>=MIN_GQ, and
      // that the kid is either GQ>=MIN_GQ or, in the case where the kid is het, that the
      // phred-scaled likelihood of being reference is >=MIN_GQ.
      if (haploid
          && (haploidParentalGenotype.isNoCall() || haploidParentalGenotype.getGQ() < MIN_GQ))
        continue;
      if (!haploid
          && (momGt.isNoCall()
              || momGt.getGQ() < MIN_GQ
              || dadGt.isNoCall()
              || dadGt.getGQ() < MIN_GQ)) continue;
      if (kidGt.isNoCall()) continue;
      if (momGt.isHomRef() && dadGt.isHomRef() && !kidGt.isHomRef()) {
        if (kidGt.getPL()[0] < MIN_GQ) continue;
      } else if (kidGt.getGQ() < MIN_GQ) continue;

      // Also filter on the DP for each of the samples - it's possible to miss hets when DP is too
      // low
      if (haploid && (kidGt.getDP() < MIN_DP || haploidParentalGenotype.getDP() < MIN_DP)) continue;
      if (!haploid && (kidGt.getDP() < MIN_DP || momGt.getDP() < MIN_DP || dadGt.getDP() < MIN_DP))
        continue;

      trio.NUM_VARIANT_SITES++;

      ///////////////////////////////////////////////////////////////
      // First test for haploid violations
      ///////////////////////////////////////////////////////////////
      MendelianViolation type = null;
      if (haploid) {
        if (kidGt.isHet()) continue; // Should not see heterozygous calls at haploid regions

        if (!haploidParentalGenotype.getAlleles().contains(kidGt.getAllele(0))) {
          if (kidGt.isHomRef()) {
            type = MendelianViolation.Haploid_Other;
            trio.NUM_HAPLOID_OTHER++;
          } else {
            type = MendelianViolation.Haploid_Denovo;
            trio.NUM_HAPLOID_DENOVO++;
          }
        }
      }
      ///////////////////////////////////////////////////////////////
      // Then test for diploid mendelian violations
      ///////////////////////////////////////////////////////////////
      else if (isMendelianViolation(momGt, dadGt, kidGt)) {
        if (momGt.isHomRef() && dadGt.isHomRef() && !kidGt.isHomRef()) {
          trio.NUM_DIPLOID_DENOVO++;
          type = MendelianViolation.Diploid_Denovo;
        } else if (momGt.isHomVar() && dadGt.isHomVar() && kidGt.isHet()) {
          trio.NUM_HOMVAR_HOMVAR_HET++;
          type = MendelianViolation.HomVar_HomVar_Het;
        } else if (kidGt.isHom()
            && ((momGt.isHomRef() && dadGt.isHomVar()) || (momGt.isHomVar() && dadGt.isHomRef()))) {
          trio.NUM_HOMREF_HOMVAR_HOM++;
          type = MendelianViolation.HomRef_HomVar_Hom;
        } else if (kidGt.isHom()
            && ((momGt.isHom() && dadGt.isHet()) || (momGt.isHet() && dadGt.isHom()))) {
          trio.NUM_HOM_HET_HOM++;
          type = MendelianViolation.Hom_Het_Hom;
        } else {
          trio.NUM_OTHER++;
          type = MendelianViolation.Other;
        }
      }

      // Output a record into the family's violation VCF
      if (type != null) {
        // Create a new Context subsetted to the three samples
        final VariantContextBuilder builder = new VariantContextBuilder(ctx);
        builder.genotypes(
            ctx.getGenotypes()
                .subsetToSamples(CollectionUtil.makeSet(trio.MOTHER, trio.FATHER, trio.OFFSPRING)));
        builder.attribute(MENDELIAN_VIOLATION_KEY, type.name());

        // Copy over some useful attributes from the full context
        if (ctx.hasAttribute(VCFConstants.ALLELE_COUNT_KEY))
          builder.attribute(ORIGINAL_AC, ctx.getAttribute(VCFConstants.ALLELE_COUNT_KEY));
        if (ctx.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY))
          builder.attribute(ORIGINAL_AF, ctx.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY));
        if (ctx.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY))
          builder.attribute(ORIGINAL_AN, ctx.getAttribute(VCFConstants.ALLELE_NUMBER_KEY));

        // Write out the variant record
        familyToViolations.get(trio.FAMILY_ID).add(builder.make());
      }
    }
  }
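
For het offspring calls, the method above also rejects sites with a badly skewed allele balance: it reads the genotype's AD (allelic depth) entries for the two called alleles and requires the smaller fraction to be at least MIN_HET_FRACTION. A minimal sketch of that check on raw depth counts (the method name is hypothetical):

// Sketch only: mirrors the min-allele-fraction filter applied to het
// offspring genotypes in accumulate() above. depthA and depthB are the AD
// entries for the two called alleles; minHetFraction stands in for MIN_HET_FRACTION.
static boolean passesHetAlleleBalance(final int depthA, final int depthB, final double minHetFraction) {
  final int total = depthA + depthB;
  if (total == 0) return false; // no informative reads at this site
  return Math.min(depthA, depthB) / (double) total >= minHetFraction;
}
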
Example #3
  /** Combines multiple SAM/BAM files into one. */
  @Override
  protected int doWork() {
    boolean matchedSortOrders = true;

    // read interval list if it is defined
    final List<Interval> intervalList =
        (INTERVALS == null ? null : IntervalList.fromFile(INTERVALS).uniqued().getIntervals());
    // map reader->iterator used if INTERVALS is defined
    final Map<SamReader, CloseableIterator<SAMRecord>> samReaderToIterator =
        new HashMap<SamReader, CloseableIterator<SAMRecord>>(INPUT.size());

    // Open the files for reading and writing
    final List<SamReader> readers = new ArrayList<SamReader>();
    final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();
    {
      SAMSequenceDictionary dict = null; // Used to try and reduce redundant SDs in memory

      for (final File inFile : INPUT) {
        IOUtil.assertFileIsReadable(inFile);
        final SamReader in =
            SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(inFile);
        if (INTERVALS != null) {
          if (!in.hasIndex())
            throw new PicardException(
                "Merging with interval but Bam file is not indexed " + inFile);
          final CloseableIterator<SAMRecord> samIterator =
              new SamRecordIntervalIteratorFactory()
                  .makeSamRecordIntervalIterator(in, intervalList, true);
          samReaderToIterator.put(in, samIterator);
        }

        readers.add(in);
        headers.add(in.getFileHeader());

        // A slightly hackish attempt to keep memory consumption down when merging multiple files
        // with large sequence dictionaries (10,000s of sequences). If the dictionaries are
        // identical, then replace the duplicate copies with a single dictionary to reduce the
        // memory footprint.
        if (dict == null) {
          dict = in.getFileHeader().getSequenceDictionary();
        } else if (dict.equals(in.getFileHeader().getSequenceDictionary())) {
          in.getFileHeader().setSequenceDictionary(dict);
        }

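        // Track whether every input file is already in the requested output sort order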
        matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER;
      }
    }

    // If all the input sort orders match the output sort order then just merge them and
    // write on the fly; otherwise set up to merge and sort before writing out the final file
    IOUtil.assertFileIsWritable(OUTPUT);
    final boolean presorted;
    final SAMFileHeader.SortOrder headerMergerSortOrder;
    final boolean mergingSamRecordIteratorAssumeSorted;

    if (matchedSortOrders
        || SORT_ORDER == SAMFileHeader.SortOrder.unsorted
        || ASSUME_SORTED
        || INTERVALS != null) {
      log.info(
          "Input files are in same order as output so sorting to temp directory is not needed.");
      headerMergerSortOrder = SORT_ORDER;
      mergingSamRecordIteratorAssumeSorted = ASSUME_SORTED;
      presorted = true;
    } else {
      log.info("Sorting input files using temp directory " + TMP_DIR);
      headerMergerSortOrder = SAMFileHeader.SortOrder.unsorted;
      mergingSamRecordIteratorAssumeSorted = false;
      presorted = false;
    }
    final SamFileHeaderMerger headerMerger =
        new SamFileHeaderMerger(headerMergerSortOrder, headers, MERGE_SEQUENCE_DICTIONARIES);
    final MergingSamRecordIterator iterator;
    // no interval defined, get an iterator for the whole bam
    if (intervalList == null) {
      iterator =
          new MergingSamRecordIterator(headerMerger, readers, mergingSamRecordIteratorAssumeSorted);
    } else {
      // show warning related to https://github.com/broadinstitute/picard/pull/314/files
      log.info(
          "Warning: merged bams from different interval lists may contain the same read in both files");
      iterator = new MergingSamRecordIterator(headerMerger, samReaderToIterator, true);
    }
    final SAMFileHeader header = headerMerger.getMergedHeader();
    for (final String comment : COMMENT) {
      header.addComment(comment);
    }
    header.setSortOrder(SORT_ORDER);
    final SAMFileWriterFactory samFileWriterFactory = new SAMFileWriterFactory();
    if (USE_THREADING) {
      samFileWriterFactory.setUseAsyncIo(true);
    }
    final SAMFileWriter out = samFileWriterFactory.makeSAMOrBAMWriter(header, presorted, OUTPUT);

    // Lastly loop through and write out the records
    final ProgressLogger progress = new ProgressLogger(log, PROGRESS_INTERVAL);
    while (iterator.hasNext()) {
      final SAMRecord record = iterator.next();
      out.addAlignment(record);
      progress.record(record);
    }

    log.info("Finished reading inputs.");
    for (final CloseableIterator<SAMRecord> iter : samReaderToIterator.values())
      CloserUtil.close(iter);
    CloserUtil.close(readers);
    out.close();
    return 0;
  }
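
Stripped of the interval handling, sort-order bookkeeping, and progress logging, the merge reduces to the same handful of htsjdk calls used in the listing above. A minimal sketch, assuming the inputs are already coordinate-sorted; the file names are placeholders:

import htsjdk.samtools.*;
import htsjdk.samtools.util.CloserUtil;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Sketch only: merge already-coordinate-sorted inputs into one output file,
// using the same htsjdk classes as the doWork() listing above.
public final class MergeSketch {
  public static void main(final String[] args) {
    final List<File> inputs = Arrays.asList(new File("a.bam"), new File("b.bam")); // placeholders
    final List<SamReader> readers = new ArrayList<>();
    final List<SAMFileHeader> headers = new ArrayList<>();
    for (final File f : inputs) {
      final SamReader reader = SamReaderFactory.makeDefault().open(f);
      readers.add(reader);
      headers.add(reader.getFileHeader());
    }

    // Merge the headers; false = do not merge differing sequence dictionaries.
    final SamFileHeaderMerger headerMerger =
        new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, false);
    // true = trust that each input really is coordinate-sorted.
    final MergingSamRecordIterator iterator =
        new MergingSamRecordIterator(headerMerger, readers, true);

    final SAMFileHeader header = headerMerger.getMergedHeader();
    header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    final SAMFileWriter out =
        new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, new File("merged.bam"));

    while (iterator.hasNext()) {
      out.addAlignment(iterator.next());
    }
    out.close();
    CloserUtil.close(readers);
  }
}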