@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);

    // Setup all the inputs
    final ProgressLogger progress = new ProgressLogger(log, 10000000, "Processed", "loci");
    final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
    final SamLocusIterator iterator = getLocusIterator(in);

    final List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
    final CountingFilter dupeFilter = new CountingDuplicateFilter();
    final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY);
    final CountingPairedFilter pairFilter = new CountingPairedFilter();
    filters.add(mapqFilter);
    filters.add(dupeFilter);
    if (!COUNT_UNPAIRED) {
        filters.add(pairFilter);
    }
    filters.add(new SecondaryAlignmentFilter()); // Not a counting filter because we never want to count reads twice
    iterator.setSamFilters(filters);
    iterator.setEmitUncoveredLoci(true);
    iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases
    iterator.setQualityScoreCutoff(0);        // Handled separately because we want to count bases
    iterator.setIncludeNonPfReads(false);

    final int max = COVERAGE_CAP;
    final long[] HistogramArray = new long[max + 1];
    final long[] baseQHistogramArray = new long[Byte.MAX_VALUE];
    final boolean usingStopAfter = STOP_AFTER > 0;
    final long stopAfter = STOP_AFTER - 1;
    long counter = 0;

    long basesExcludedByBaseq = 0;
    long basesExcludedByOverlap = 0;
    long basesExcludedByCapping = 0;

    // Loop through all the loci
    while (iterator.hasNext()) {
        final SamLocusIterator.LocusInfo info = iterator.next();

        // Check that the reference is not N
        final ReferenceSequence ref = refWalker.get(info.getSequenceIndex());
        final byte base = ref.getBases()[info.getPosition() - 1];
        if (base == 'N') continue;

        // Figure out the coverage while not counting overlapping reads twice, and excluding various things
        final HashSet<String> readNames = new HashSet<String>(info.getRecordAndPositions().size());
        int pileupSize = 0;
        for (final SamLocusIterator.RecordAndOffset recs : info.getRecordAndPositions()) {
            if (recs.getBaseQuality() < MINIMUM_BASE_QUALITY) { ++basesExcludedByBaseq; continue; }
            if (!readNames.add(recs.getRecord().getReadName())) { ++basesExcludedByOverlap; continue; }
            pileupSize++;
            if (pileupSize <= max) {
                baseQHistogramArray[recs.getRecord().getBaseQualities()[recs.getOffset()]]++;
            }
        }

        final int depth = Math.min(readNames.size(), max);
        if (depth < readNames.size()) basesExcludedByCapping += readNames.size() - max;
        HistogramArray[depth]++;

        // Record progress and perhaps stop
        progress.record(info.getSequenceName(), info.getPosition());
        if (usingStopAfter && ++counter > stopAfter) break;
    }

    // Construct and write the outputs
    final Histogram<Integer> histo = new Histogram<Integer>("coverage", "count");
    for (int i = 0; i < HistogramArray.length; ++i) {
        histo.increment(i, HistogramArray[i]);
    }

    final Histogram<Integer> baseQHisto = new Histogram<Integer>("value", "baseq_count");
    for (int i = 0; i < baseQHistogramArray.length; ++i) {
        baseQHisto.increment(i, baseQHistogramArray[i]);
    }

    final WgsMetrics metrics = generateWgsMetrics();
    metrics.GENOME_TERRITORY = (long) histo.getSumOfValues();
    metrics.MEAN_COVERAGE = histo.getMean();
    metrics.SD_COVERAGE = histo.getStandardDeviation();
    metrics.MEDIAN_COVERAGE = histo.getMedian();
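    // Note: the PCT_EXC_* fractions assigned below all share one denominator
    // (counted bases plus every category of excluded bases), so the individual
    // fractions are directly comparable and PCT_EXC_TOTAL is their aggregate.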
    metrics.MAD_COVERAGE = histo.getMedianAbsoluteDeviation();

    final long basesExcludedByDupes = getBasesExcludedBy(dupeFilter);
    final long basesExcludedByMapq = getBasesExcludedBy(mapqFilter);
    final long basesExcludedByPairing = getBasesExcludedBy(pairFilter);
    final double total = histo.getSum();
    final double totalWithExcludes = total + basesExcludedByDupes + basesExcludedByMapq + basesExcludedByPairing
            + basesExcludedByBaseq + basesExcludedByOverlap + basesExcludedByCapping;

    metrics.PCT_EXC_DUPE = basesExcludedByDupes / totalWithExcludes;
    metrics.PCT_EXC_MAPQ = basesExcludedByMapq / totalWithExcludes;
    metrics.PCT_EXC_UNPAIRED = basesExcludedByPairing / totalWithExcludes;
    metrics.PCT_EXC_BASEQ = basesExcludedByBaseq / totalWithExcludes;
    metrics.PCT_EXC_OVERLAP = basesExcludedByOverlap / totalWithExcludes;
    metrics.PCT_EXC_CAPPED = basesExcludedByCapping / totalWithExcludes;
    metrics.PCT_EXC_TOTAL = (totalWithExcludes - total) / totalWithExcludes;

    metrics.PCT_1X   = MathUtil.sum(HistogramArray, 1, HistogramArray.length)   / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_5X   = MathUtil.sum(HistogramArray, 5, HistogramArray.length)   / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_10X  = MathUtil.sum(HistogramArray, 10, HistogramArray.length)  / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_15X  = MathUtil.sum(HistogramArray, 15, HistogramArray.length)  / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_20X  = MathUtil.sum(HistogramArray, 20, HistogramArray.length)  / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_25X  = MathUtil.sum(HistogramArray, 25, HistogramArray.length)  / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_30X  = MathUtil.sum(HistogramArray, 30, HistogramArray.length)  / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_40X  = MathUtil.sum(HistogramArray, 40, HistogramArray.length)  / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_50X  = MathUtil.sum(HistogramArray, 50, HistogramArray.length)  / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_60X  = MathUtil.sum(HistogramArray, 60, HistogramArray.length)  / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_70X  = MathUtil.sum(HistogramArray, 70, HistogramArray.length)  / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_80X  = MathUtil.sum(HistogramArray, 80, HistogramArray.length)  / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_90X  = MathUtil.sum(HistogramArray, 90, HistogramArray.length)  / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_100X = MathUtil.sum(HistogramArray, 100, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;

    final MetricsFile<WgsMetrics, Integer> out = getMetricsFile();
    out.addMetric(metrics);
    out.addHistogram(histo);
    if (INCLUDE_BQ_HISTOGRAM) {
        out.addHistogram(baseQHisto);
    }
    out.write(OUTPUT);

    return 0;
}
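/**
 * Examines a single variant site for every configured trio, tallies any Mendelian violation
 * observed, and buffers a subsetted variant record for that family's violations VCF.
 */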
@Override
public void accumulate(final VariantContext ctx) {
    logger.record(ctx.getContig(), ctx.getStart());

    final String variantChrom = ctx.getContig();
    final int variantPos = ctx.getStart();

    // Skip anything a little too funky
    if (ctx.isFiltered()) return;
    if (!ctx.isVariant()) return;
    if (SKIP_CHROMS.contains(variantChrom)) return;

    for (final MendelianViolationMetrics trio : trios) {
        final Genotype momGt = ctx.getGenotype(trio.MOTHER);
        final Genotype dadGt = ctx.getGenotype(trio.FATHER);
        final Genotype kidGt = ctx.getGenotype(trio.OFFSPRING);

        // If any genotype:
        //   - has a non-SNP allele; or
        //   - lacks a reference allele
        // then ignore this trio
        if (CollectionUtil.makeList(momGt, dadGt, kidGt).stream()
                .anyMatch(gt -> gt.isHetNonRef()
                        || Stream.concat(Stream.of(ctx.getReference()), gt.getAlleles().stream())
                                .anyMatch(a -> a.length() != 1 || a.isSymbolic()))) {
            continue;
        }

        // If, between the trio, there are more than 2 alleles including the reference, continue
        if (Stream.concat(
                        Collections.singleton(ctx.getReference()).stream(),
                        CollectionUtil.makeList(momGt, dadGt, kidGt).stream()
                                .flatMap(gt -> gt.getAlleles().stream()))
                .collect(Collectors.toSet())
                .size() > 2) continue;

        // Test to make sure:
        //   1) that the site is in fact variant in the trio
        //   2) that the offspring doesn't have a really wacky het allele balance
        if (!isVariant(momGt, dadGt, kidGt)) continue;
        if (kidGt.isHet()) {
            final int[] ad = kidGt.getAD();
            if (ad == null) continue;

            final List<Integer> adOfAlleles = kidGt.getAlleles().stream()
                    .map(a -> ad[ctx.getAlleleIndex(a)])
                    .collect(Collectors.toList());
            final double minAlleleFraction =
                    Math.min(adOfAlleles.get(0), adOfAlleles.get(1))
                            / (double) (adOfAlleles.get(0) + adOfAlleles.get(1));
            if (minAlleleFraction < MIN_HET_FRACTION) continue;
        }

        ///////////////////////////////////////////////////////////////
        // Determine whether the offspring should be haploid at this
        // locus and which parent is the donor of the haploid genotype
        ///////////////////////////////////////////////////////////////
        boolean haploid = false;
        Genotype haploidParentalGenotype = null;

        if (FEMALE_CHROMS.contains(variantChrom) && trio.OFFSPRING_SEX != Sex.Unknown) {
            if (trio.OFFSPRING_SEX == Sex.Female) {
                // female, so diploid
                haploid = false;
            } else if (isInPseudoAutosomalRegion(variantChrom, variantPos)) {
                // male but in PAR on X, so diploid
                haploid = false;
            } else {
                // male, out of PAR on X, haploid
                haploid = true;
                haploidParentalGenotype = momGt;
            }
        }

        // The PAR on the male chromosome should be masked so that reads
        // align to the female chromosomes instead, so there's no point
        // in worrying about that here.
        if (MALE_CHROMS.contains(variantChrom)) {
            if (trio.OFFSPRING_SEX == Sex.Male) {
                haploid = true;
                haploidParentalGenotype = dadGt;
            } else {
                continue;
            }
        }

        // We only want to look at sites where we have high enough confidence that the genotypes
        // we are looking at are interesting. We want to ensure that parents are always GQ>=MIN_GQ,
        // and that the kid is either GQ>=MIN_GQ or, in the case where the kid is het, that the
        // phred-scaled likelihood of being reference is >=MIN_GQ.
        if (haploid
                && (haploidParentalGenotype.isNoCall() || haploidParentalGenotype.getGQ() < MIN_GQ))
            continue;
        if (!haploid
                && (momGt.isNoCall() || momGt.getGQ() < MIN_GQ
                        || dadGt.isNoCall() || dadGt.getGQ() < MIN_GQ))
            continue;
        if (kidGt.isNoCall()) continue;
        if (momGt.isHomRef() && dadGt.isHomRef() && !kidGt.isHomRef()) {
            if (kidGt.getPL()[0] < MIN_GQ) continue;
        } else if (kidGt.getGQ() < MIN_GQ) continue;

        // Also filter on the DP for each of the samples - it's possible to miss hets when DP is too low
        if (haploid && (kidGt.getDP() < MIN_DP || haploidParentalGenotype.getDP() < MIN_DP)) continue;
        if (!haploid && (kidGt.getDP() < MIN_DP || momGt.getDP() < MIN_DP || dadGt.getDP() < MIN_DP)) continue;

        trio.NUM_VARIANT_SITES++;

        ///////////////////////////////////////////////////////////////
        // First test for haploid violations
        ///////////////////////////////////////////////////////////////
        MendelianViolation type = null;
        if (haploid) {
            if (kidGt.isHet()) continue; // Should not see heterozygous calls at haploid regions

            if (!haploidParentalGenotype.getAlleles().contains(kidGt.getAllele(0))) {
                if (kidGt.isHomRef()) {
                    type = MendelianViolation.Haploid_Other;
                    trio.NUM_HAPLOID_OTHER++;
                } else {
                    type = MendelianViolation.Haploid_Denovo;
                    trio.NUM_HAPLOID_DENOVO++;
                }
            }
        }
        ///////////////////////////////////////////////////////////////
        // Then test for diploid mendelian violations
        ///////////////////////////////////////////////////////////////
        else if (isMendelianViolation(momGt, dadGt, kidGt)) {
            if (momGt.isHomRef() && dadGt.isHomRef() && !kidGt.isHomRef()) {
                trio.NUM_DIPLOID_DENOVO++;
                type = MendelianViolation.Diploid_Denovo;
            } else if (momGt.isHomVar() && dadGt.isHomVar() && kidGt.isHet()) {
                trio.NUM_HOMVAR_HOMVAR_HET++;
                type = MendelianViolation.HomVar_HomVar_Het;
            } else if (kidGt.isHom()
                    && ((momGt.isHomRef() && dadGt.isHomVar()) || (momGt.isHomVar() && dadGt.isHomRef()))) {
                trio.NUM_HOMREF_HOMVAR_HOM++;
                type = MendelianViolation.HomRef_HomVar_Hom;
            } else if (kidGt.isHom()
                    && ((momGt.isHom() && dadGt.isHet()) || (momGt.isHet() && dadGt.isHom()))) {
                trio.NUM_HOM_HET_HOM++;
                type = MendelianViolation.Hom_Het_Hom;
            } else {
                trio.NUM_OTHER++;
                type = MendelianViolation.Other;
            }
        }

        // Output a record into the family's violation VCF
        if (type != null) {
            // Create a new Context subsetted to the three samples
            final VariantContextBuilder builder = new VariantContextBuilder(ctx);
            builder.genotypes(ctx.getGenotypes()
                    .subsetToSamples(CollectionUtil.makeSet(trio.MOTHER, trio.FATHER, trio.OFFSPRING)));
            builder.attribute(MENDELIAN_VIOLATION_KEY, type.name());

            // Copy over some useful attributes from the full context
            if (ctx.hasAttribute(VCFConstants.ALLELE_COUNT_KEY))
                builder.attribute(ORIGINAL_AC, ctx.getAttribute(VCFConstants.ALLELE_COUNT_KEY));
            if (ctx.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY))
                builder.attribute(ORIGINAL_AF, ctx.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY));
            if (ctx.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY))
                builder.attribute(ORIGINAL_AN, ctx.getAttribute(VCFConstants.ALLELE_NUMBER_KEY));

            // Write out the variant record
            familyToViolations.get(trio.FAMILY_ID).add(builder.make());
        }
    }
}
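// For reference, a minimal sketch of the diploid consistency rule applied above. This is NOT
// the isMendelianViolation helper this class actually calls (that lives elsewhere in Picard);
// the method name and its placement here are hypothetical, shown only to illustrate the rule:
// a diploid child genotype is consistent when one of its alleles can come from the mother and
// the other from the father, and is a violation otherwise. Assumes htsjdk's Genotype/Allele.
private static boolean isDiploidMendelianViolationSketch(
        final Genotype momGt, final Genotype dadGt, final Genotype kidGt) {
    final Allele a1 = kidGt.getAllele(0);
    final Allele a2 = kidGt.getAllele(1);
    // Consistent if (a1 from mom and a2 from dad) or (a2 from mom and a1 from dad)
    final boolean consistent =
            (momGt.getAlleles().contains(a1) && dadGt.getAlleles().contains(a2))
                    || (momGt.getAlleles().contains(a2) && dadGt.getAlleles().contains(a1));
    return !consistent;
}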
/** Combines multiple SAM/BAM files into one. */
@Override
protected int doWork() {
    boolean matchedSortOrders = true;

    // Read the interval list if it is defined
    final List<Interval> intervalList =
            (INTERVALS == null ? null : IntervalList.fromFile(INTERVALS).uniqued().getIntervals());
    // Map of reader -> iterator, used only if INTERVALS is defined
    final Map<SamReader, CloseableIterator<SAMRecord>> samReaderToIterator =
            new HashMap<SamReader, CloseableIterator<SAMRecord>>(INPUT.size());

    // Open the files for reading and writing
    final List<SamReader> readers = new ArrayList<SamReader>();
    final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();
    {
        SAMSequenceDictionary dict = null; // Used to try and reduce redundant SDs in memory

        for (final File inFile : INPUT) {
            IOUtil.assertFileIsReadable(inFile);
            final SamReader in =
                    SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(inFile);
            if (INTERVALS != null) {
                if (!in.hasIndex())
                    throw new PicardException(
                            "Merging with interval but Bam file is not indexed " + inFile);
                final CloseableIterator<SAMRecord> samIterator =
                        new SamRecordIntervalIteratorFactory()
                                .makeSamRecordIntervalIterator(in, intervalList, true);
                samReaderToIterator.put(in, samIterator);
            }

            readers.add(in);
            headers.add(in.getFileHeader());

            // A slightly hackish attempt to keep memory consumption down when merging multiple
            // files with large sequence dictionaries (10,000s of sequences). If the dictionaries
            // are identical, then replace the duplicate copies with a single dictionary to reduce
            // the memory footprint.
            if (dict == null) {
                dict = in.getFileHeader().getSequenceDictionary();
            } else if (dict.equals(in.getFileHeader().getSequenceDictionary())) {
                in.getFileHeader().setSequenceDictionary(dict);
            }

            matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER;
        }
    }

    // If all the input sort orders match the output sort order then just merge them and
    // write on the fly; otherwise set up to merge and sort before writing out the final file
    IOUtil.assertFileIsWritable(OUTPUT);
    final boolean presorted;
    final SAMFileHeader.SortOrder headerMergerSortOrder;
    final boolean mergingSamRecordIteratorAssumeSorted;

    if (matchedSortOrders
            || SORT_ORDER == SAMFileHeader.SortOrder.unsorted
            || ASSUME_SORTED
            || INTERVALS != null) {
        log.info("Input files are in same order as output so sorting to temp directory is not needed.");
        headerMergerSortOrder = SORT_ORDER;
        mergingSamRecordIteratorAssumeSorted = ASSUME_SORTED;
        presorted = true;
    } else {
        log.info("Sorting input files using temp directory " + TMP_DIR);
        headerMergerSortOrder = SAMFileHeader.SortOrder.unsorted;
        mergingSamRecordIteratorAssumeSorted = false;
        presorted = false;
    }

    final SamFileHeaderMerger headerMerger =
            new SamFileHeaderMerger(headerMergerSortOrder, headers, MERGE_SEQUENCE_DICTIONARIES);
    final MergingSamRecordIterator iterator;
    // No interval defined, get an iterator for the whole bam
    if (intervalList == null) {
        iterator = new MergingSamRecordIterator(headerMerger, readers, mergingSamRecordIteratorAssumeSorted);
    } else {
        // Show warning related to https://github.com/broadinstitute/picard/pull/314/files
        log.info("Warning: merged bams from different interval lists may contain the same read in both files");
        iterator = new MergingSamRecordIterator(headerMerger, samReaderToIterator, true);
    }
    final SAMFileHeader header = headerMerger.getMergedHeader();
    for (final String comment : COMMENT) {
        header.addComment(comment);
    }
    header.setSortOrder(SORT_ORDER);
    final SAMFileWriterFactory samFileWriterFactory = new SAMFileWriterFactory();
    if (USE_THREADING) {
        samFileWriterFactory.setUseAsyncIo(true);
    }
    final SAMFileWriter out = samFileWriterFactory.makeSAMOrBAMWriter(header, presorted, OUTPUT);

    // Lastly loop through and write out the records
    final ProgressLogger progress = new ProgressLogger(log, PROGRESS_INTERVAL);
    while (iterator.hasNext()) {
        final SAMRecord record = iterator.next();
        out.addAlignment(record);
        progress.record(record);
    }

    log.info("Finished reading inputs.");
    for (final CloseableIterator<SAMRecord> iter : samReaderToIterator.values()) CloserUtil.close(iter);
    CloserUtil.close(readers);
    out.close();
    return 0;
}
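// For context, a typical command-line invocation of this merge step might look like the
// following (a sketch; the exact option names accepted by a given Picard release should be
// confirmed against its own usage output):
//
//   java -jar picard.jar MergeSamFiles \
//       I=input1.bam \
//       I=input2.bam \
//       O=merged.bam \
//       SORT_ORDER=coordinate \
//       USE_THREADING=true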