/**
 * Provides the next record from the underlying iterator after applying filter strings generated
 * by the set of filters in use by the iterator.
 */
@Override
public VariantContext next() {
    final VariantContext ctx = this.iterator.next();
    final Set<String> filterStrings = new HashSet<>();

    // Collect variant level filters
    for (final VariantFilter filter : this.filters) {
        final String val = filter.filter(ctx);
        if (val != null) filterStrings.add(val);
    }

    // Collect genotype level filters in a Map of Sample -> List<filter string>
    final ListMap<String, String> gtFilterStrings = new ListMap<>();
    final Set<String> variantSamples = new HashSet<>();
    for (final Genotype gt : ctx.getGenotypes()) {
        if (gt.isCalled() && !gt.isHomRef()) variantSamples.add(gt.getSampleName());

        for (final GenotypeFilter filter : gtFilters) {
            final String filterString = filter.filter(ctx, gt);
            if (filterString != null) gtFilterStrings.add(gt.getSampleName(), filterString);
        }
    }

    // If every sample with a variant call was genotype-filtered, apply a site level filter
    if (gtFilterStrings.keySet().containsAll(variantSamples)) {
        filterStrings.add(ALL_GTS_FILTERED);
    }

    // Make a builder and set the site level filter appropriately
    final VariantContextBuilder builder = new VariantContextBuilder(ctx);
    if (filterStrings.isEmpty()) {
        builder.passFilters();
    } else {
        builder.filters(filterStrings);
    }

    // Apply filters to the necessary genotypes
    builder.noGenotypes();
    final List<Genotype> newGenotypes = new ArrayList<>(ctx.getNSamples());
    for (final Genotype gt : ctx.getGenotypes()) {
        final GenotypeBuilder gtBuilder = new GenotypeBuilder(gt);
        final List<String> filtersLocal = gtFilterStrings.get(gt.getSampleName());

        if (filtersLocal == null || filtersLocal.isEmpty()) {
            gtBuilder.filter(PASS_FILTER);
        } else {
            gtBuilder.filters(filtersLocal);
        }
        newGenotypes.add(gtBuilder.make());
    }
    builder.genotypes(newGenotypes);

    return builder.make();
}
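// Usage sketch (illustrative, not part of the original source): wiring this
// filter-applying iterator between an htsjdk reader and writer. The class name
// "FilterApplyingVariantIterator" and the filter instances are assumptions based
// on the method above; VCFFileReader, VariantContext, and VariantContextWriter
// are real htsjdk types.
static void applyFilters(
        final File input,
        final VariantContextWriter out,
        final List<VariantFilter> siteFilters,
        final List<GenotypeFilter> genotypeFilters) {
    try (final VCFFileReader reader = new VCFFileReader(input, false)) {
        final Iterator<VariantContext> it =
            new FilterApplyingVariantIterator(reader.iterator(), siteFilters, genotypeFilters);
        while (it.hasNext()) {
            out.add(it.next()); // site FILTER and per-sample FT fields are now populated
        }
    }
}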
@Override
protected AFCalculationResult computeLog10PNonRef(
        final VariantContext vc,
        final int defaultPloidy,
        final double[] log10AlleleFrequencyPriors,
        final StateTracker stateTracker) {
    Utils.nonNull(vc, "vc is null");
    Utils.nonNull(log10AlleleFrequencyPriors, "log10AlleleFrequencyPriors is null");
    Utils.nonNull(stateTracker, "stateTracker is null");

    final int numAlternateAlleles = vc.getNAlleles() - 1;

    final List<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), true);
    // the -1 accounts for the dummy entry that getGLs(..., true) prepends to the list
    final int numSamples = genotypeLikelihoods.size() - 1;
    final int numChr = 2 * numSamples;

    // queue of AC conformations to process
    final Deque<ExactACset> ACqueue = new LinkedList<>();

    // mapping of ExactACset indexes to the objects
    final Map<ExactACcounts, ExactACset> indexesToACset = new HashMap<>(numChr + 1);

    // add AC=0 to the queue
    final int[] zeroCounts = new int[numAlternateAlleles];
    final ExactACset zeroSet = new ExactACset(numSamples + 1, new ExactACcounts(zeroCounts));
    ACqueue.add(zeroSet);
    indexesToACset.put(zeroSet.getACcounts(), zeroSet);

    while (!ACqueue.isEmpty()) {
        // compute log10Likelihoods
        final ExactACset set = ACqueue.remove();

        calculateAlleleCountConformation(
                set,
                genotypeLikelihoods,
                numChr,
                ACqueue,
                indexesToACset,
                log10AlleleFrequencyPriors,
                stateTracker);

        // clean up memory
        indexesToACset.remove(set.getACcounts());
    }

    return getResultFromFinalState(vc, log10AlleleFrequencyPriors, stateTracker);
}
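// Side note (illustrative): the HashMap above is keyed by ExactACcounts rather than
// by the raw int[] of allele counts because Java arrays use identity equality and
// identity hash codes, so a raw int[] key could never be looked up again. A minimal
// equivalent wrapper (hypothetical, not the GATK class; needs java.util.Arrays):
final class ACKey {
    private final int[] counts;

    ACKey(final int[] counts) {
        this.counts = counts;
    }

    @Override
    public boolean equals(final Object o) {
        return o instanceof ACKey && Arrays.equals(this.counts, ((ACKey) o).counts);
    }

    @Override
    public int hashCode() {
        return Arrays.hashCode(this.counts);
    }
}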
@Override
public Map<String, Object> annotate(
        final RefMetaDataTracker tracker,
        final AnnotatorCompatible walker,
        final ReferenceContext ref,
        final Map<String, AlignmentContext> stratifiedContexts,
        final VariantContext vc,
        final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
    final GenotypesContext genotypes = vc.getGenotypes();
    if (genotypes == null || genotypes.size() < MIN_SAMPLES) {
        if (!warningLogged) {
            logger.warn("Too few genotypes to compute the Hardy-Weinberg annotation");
            warningLogged = true;
        }
        return null;
    }

    int refCount = 0;
    int hetCount = 0;
    int homCount = 0;
    for (final Genotype g : genotypes) {
        if (g.isNoCall()) continue;

        // TODO - fix me:
        // Right now we just ignore genotypes that are not confident, but this throws off
        // our HW ratios. More analysis is needed to determine the right thing to do when
        // the genotyper cannot decide whether a given sample is het or hom var.
        if (g.getLog10PError() > MIN_LOG10_PERROR) continue;

        if (g.isHomRef()) refCount++;
        else if (g.isHet()) hetCount++;
        else homCount++;
    }

    if (refCount + hetCount + homCount == 0) return null;

    final double pvalue = HardyWeinbergCalculation.hwCalculate(refCount, hetCount, homCount);

    final Map<String, Object> map = new HashMap<>();
    map.put(getKeyNames().get(0), String.format("%.1f", QualityUtils.phredScaleErrorRate(pvalue)));
    return map;
}
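// For intuition (hypothetical, and simpler than whatever hwCalculate implements): a
// plain 1-degree-of-freedom chi-square Hardy-Weinberg check built from the same three
// genotype counts. hwCalculate above returns a p-value directly; this sketch returns
// only the chi-square statistic, to be compared against a critical value (3.84 for
// p = 0.05).
static double hweChiSquareStatistic(final int refCount, final int hetCount, final int homCount) {
    final double n = refCount + hetCount + homCount;
    final double p = (2.0 * refCount + hetCount) / (2.0 * n); // reference allele frequency
    final double q = 1.0 - p;

    // Expected genotype counts under Hardy-Weinberg equilibrium: p^2, 2pq, q^2
    final double[] expected = {n * p * p, 2.0 * n * p * q, n * q * q};
    final double[] observed = {refCount, hetCount, homCount};

    double chiSquare = 0.0;
    for (int i = 0; i < expected.length; i++) {
        if (expected[i] > 0) {
            final double d = observed[i] - expected[i];
            chiSquare += d * d / expected[i];
        }
    }
    return chiSquare;
}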
/**
 * Returns a list of attribute values from a VCF file.
 *
 * @param vcfFile VCF file
 * @param attributeName attribute name
 * @throws IOException if the file does not exist or cannot be opened
 * @return list of attribute values
 */
private List<String> getAttributeValues(final File vcfFile, final String attributeName)
        throws IOException {
    final VCFCodec codec = new VCFCodec();
    final List<String> attributeValues = new ArrayList<>();

    // try-with-resources so the stream is closed even if decoding fails
    try (final FileInputStream s = new FileInputStream(vcfFile)) {
        final LineIterator lineIteratorVCF =
                codec.makeSourceFromStream(new PositionalBufferedStream(s));
        codec.readHeader(lineIteratorVCF);

        while (lineIteratorVCF.hasNext()) {
            final String line = lineIteratorVCF.next();
            Assert.assertNotNull(line);
            final VariantContext vc = codec.decode(line);
            for (final Genotype g : vc.getGenotypes()) {
                if (g.hasExtendedAttribute(attributeName)) {
                    attributeValues.add((String) g.getExtendedAttribute(attributeName));
                }
            }
        }
    }
    return attributeValues;
}
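// Hypothetical usage in a test (the file path and expected value are placeholders):
// confirm that every genotype in an output VCF carries the expected FT value.
@Test
public void testGenotypeFiltersWritten() throws IOException {
    final List<String> ftValues = getAttributeValues(new File("testdata/filtered.vcf"), "FT");
    Assert.assertFalse(ftValues.isEmpty());
    for (final String ft : ftValues) {
        Assert.assertEquals(ft, "PASS");
    }
}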
@Override
public void accumulate(final VariantContext ctx) {
    logger.record(ctx.getContig(), ctx.getStart());
    final String variantChrom = ctx.getContig();
    final int variantPos = ctx.getStart();

    // Skip anything a little too funky
    if (ctx.isFiltered()) return;
    if (!ctx.isVariant()) return;
    if (SKIP_CHROMS.contains(variantChrom)) return;

    for (final MendelianViolationMetrics trio : trios) {
        final Genotype momGt = ctx.getGenotype(trio.MOTHER);
        final Genotype dadGt = ctx.getGenotype(trio.FATHER);
        final Genotype kidGt = ctx.getGenotype(trio.OFFSPRING);

        // if any genotype:
        //   - is het with two non-reference alleles (het non-ref); or
        //   - has a non-SNP or symbolic allele (checking the site's reference allele too)
        // then ignore this trio
        if (CollectionUtil.makeList(momGt, dadGt, kidGt)
            .stream()
            .anyMatch(
                gt ->
                    gt.isHetNonRef()
                        || Stream.concat(Stream.of(ctx.getReference()), gt.getAlleles().stream())
                            .anyMatch(a -> a.length() != 1 || a.isSymbolic()))) {
            continue;
        }

        // if between the trio there are more than 2 alleles including the reference, continue
        if (Stream.concat(
                    Collections.singleton(ctx.getReference()).stream(),
                    CollectionUtil.makeList(momGt, dadGt, kidGt)
                        .stream()
                        .flatMap(gt -> gt.getAlleles().stream()))
                .collect(Collectors.toSet())
                .size()
            > 2) {
            continue;
        }

        // Test to make sure:
        //   1) That the site is in fact variant in the trio
        //   2) That the offspring doesn't have a really wacky het allele balance
        if (!isVariant(momGt, dadGt, kidGt)) continue;
        if (kidGt.isHet()) {
            final int[] ad = kidGt.getAD();
            if (ad == null) continue;

            final List<Integer> adOfAlleles =
                kidGt
                    .getAlleles()
                    .stream()
                    .map(a -> ad[ctx.getAlleleIndex(a)])
                    .collect(Collectors.toList());
            final double minAlleleFraction =
                Math.min(adOfAlleles.get(0), adOfAlleles.get(1))
                    / (double) (adOfAlleles.get(0) + adOfAlleles.get(1));
            if (minAlleleFraction < MIN_HET_FRACTION) continue;
        }

        ///////////////////////////////////////////////////////////////
        // Determine whether the offspring should be haploid at this
        // locus and which parent is the donor of the haploid genotype
        ///////////////////////////////////////////////////////////////
        boolean haploid = false;
        Genotype haploidParentalGenotype = null;

        if (FEMALE_CHROMS.contains(variantChrom) && trio.OFFSPRING_SEX != Sex.Unknown) {
            if (trio.OFFSPRING_SEX == Sex.Female) {
                // female, so diploid
                haploid = false;
            } else if (isInPseudoAutosomalRegion(variantChrom, variantPos)) {
                // male but in PAR on X, so diploid
                haploid = false;
            } else {
                // male, out of PAR on X, haploid
                haploid = true;
                haploidParentalGenotype = momGt;
            }
        }

        // The PAR on the male chromosome should be masked so that reads
        // align to the female chromosome instead; there's no point in
        // worrying about that here.
        if (MALE_CHROMS.contains(variantChrom)) {
            if (trio.OFFSPRING_SEX == Sex.Male) {
                haploid = true;
                haploidParentalGenotype = dadGt;
            } else {
                continue;
            }
        }

        // We only want to look at sites where we have high enough confidence that the
        // genotypes we are looking at are interesting. We want to ensure that parents
        // are always GQ >= MIN_GQ, and that the kid is either GQ >= MIN_GQ or, in the
        // putative de novo case (both parents hom-ref, kid non-hom-ref), that the kid's
        // phred-scaled likelihood of being hom-ref (PL[0]) is >= MIN_GQ.
        if (haploid
            && (haploidParentalGenotype.isNoCall() || haploidParentalGenotype.getGQ() < MIN_GQ)) {
            continue;
        }
        if (!haploid
            && (momGt.isNoCall()
                || momGt.getGQ() < MIN_GQ
                || dadGt.isNoCall()
                || dadGt.getGQ() < MIN_GQ)) {
            continue;
        }
        if (kidGt.isNoCall()) continue;
        if (momGt.isHomRef() && dadGt.isHomRef() && !kidGt.isHomRef()) {
            if (kidGt.getPL()[0] < MIN_GQ) continue;
        } else if (kidGt.getGQ() < MIN_GQ) continue;

        // Also filter on the DP for each of the samples - it's possible to miss hets
        // when DP is too low
        if (haploid && (kidGt.getDP() < MIN_DP || haploidParentalGenotype.getDP() < MIN_DP)) continue;
        if (!haploid
            && (kidGt.getDP() < MIN_DP || momGt.getDP() < MIN_DP || dadGt.getDP() < MIN_DP)) {
            continue;
        }

        trio.NUM_VARIANT_SITES++;

        ///////////////////////////////////////////////////////////////
        // First test for haploid violations
        ///////////////////////////////////////////////////////////////
        MendelianViolation type = null;
        if (haploid) {
            if (kidGt.isHet()) continue; // Should not see heterozygous calls at haploid regions

            if (!haploidParentalGenotype.getAlleles().contains(kidGt.getAllele(0))) {
                if (kidGt.isHomRef()) {
                    type = MendelianViolation.Haploid_Other;
                    trio.NUM_HAPLOID_OTHER++;
                } else {
                    type = MendelianViolation.Haploid_Denovo;
                    trio.NUM_HAPLOID_DENOVO++;
                }
            }
        }
        ///////////////////////////////////////////////////////////////
        // Then test for diploid mendelian violations
        ///////////////////////////////////////////////////////////////
        else if (isMendelianViolation(momGt, dadGt, kidGt)) {
            if (momGt.isHomRef() && dadGt.isHomRef() && !kidGt.isHomRef()) {
                trio.NUM_DIPLOID_DENOVO++;
                type = MendelianViolation.Diploid_Denovo;
            } else if (momGt.isHomVar() && dadGt.isHomVar() && kidGt.isHet()) {
                trio.NUM_HOMVAR_HOMVAR_HET++;
                type = MendelianViolation.HomVar_HomVar_Het;
            } else if (kidGt.isHom()
                && ((momGt.isHomRef() && dadGt.isHomVar())
                    || (momGt.isHomVar() && dadGt.isHomRef()))) {
                trio.NUM_HOMREF_HOMVAR_HOM++;
                type = MendelianViolation.HomRef_HomVar_Hom;
            } else if (kidGt.isHom()
                && ((momGt.isHom() && dadGt.isHet()) || (momGt.isHet() && dadGt.isHom()))) {
                trio.NUM_HOM_HET_HOM++;
                type = MendelianViolation.Hom_Het_Hom;
            } else {
                trio.NUM_OTHER++;
                type = MendelianViolation.Other;
            }
        }

        // Output a record into the family's violation VCF
        if (type != null) {
            // Create a new context subsetted to the three samples
            final VariantContextBuilder builder = new VariantContextBuilder(ctx);
            builder.genotypes(
                ctx.getGenotypes()
                    .subsetToSamples(
                        CollectionUtil.makeSet(trio.MOTHER, trio.FATHER, trio.OFFSPRING)));
            builder.attribute(MENDELIAN_VIOLATION_KEY, type.name());

            // Copy over some useful attributes from the full context
            if (ctx.hasAttribute(VCFConstants.ALLELE_COUNT_KEY))
                builder.attribute(ORIGINAL_AC, ctx.getAttribute(VCFConstants.ALLELE_COUNT_KEY));
            if (ctx.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY))
                builder.attribute(ORIGINAL_AF, ctx.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY));
            if (ctx.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY))
                builder.attribute(ORIGINAL_AN, ctx.getAttribute(VCFConstants.ALLELE_NUMBER_KEY));

            // Write out the variant record
            familyToViolations.get(trio.FAMILY_ID).add(builder.make());
        }
    }
}
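// Sketch of the diploid consistency test that isMendelianViolation(momGt, dadGt, kidGt)
// presumably performs (hypothetical; the real helper is not shown in this listing): a
// diploid child genotype is Mendelian-consistent iff its two alleles can be assigned
// one to each parent.
static boolean violatesMendel(final Genotype mom, final Genotype dad, final Genotype kid) {
    final Allele k0 = kid.getAllele(0);
    final Allele k1 = kid.getAllele(1);
    final boolean consistent =
        (mom.getAlleles().contains(k0) && dad.getAlleles().contains(k1))
            || (mom.getAlleles().contains(k1) && dad.getAlleles().contains(k0));
    return !consistent;
}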
@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    IOUtil.assertFileIsReadable(CHAIN);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsWritable(REJECT);

    ////////////////////////////////////////////////////////////////////////
    // Setup the inputs
    ////////////////////////////////////////////////////////////////////////
    final LiftOver liftOver = new LiftOver(CHAIN);
    final VCFFileReader in = new VCFFileReader(INPUT, false);

    logger.info("Loading up the target reference genome.");
    final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    // Capture the dictionary before closing the walker; it is needed for the output headers below
    final SAMSequenceDictionary targetDictionary = walker.getSequenceDictionary();
    final Map<String, byte[]> refSeqs = new HashMap<>();
    for (final SAMSequenceRecord rec : targetDictionary.getSequences()) {
        refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases());
    }
    CloserUtil.close(walker);

    ////////////////////////////////////////////////////////////////////////
    // Setup the outputs
    ////////////////////////////////////////////////////////////////////////
    final VCFHeader inHeader = in.getFileHeader();
    final VCFHeader outHeader = new VCFHeader(inHeader);
    outHeader.setSequenceDictionary(targetDictionary);
    final VariantContextWriter out =
        new VariantContextWriterBuilder()
            .setOption(Options.INDEX_ON_THE_FLY)
            .setOutputFile(OUTPUT)
            .setReferenceDictionary(targetDictionary)
            .build();
    out.writeHeader(outHeader);

    final VariantContextWriter rejects =
        new VariantContextWriterBuilder()
            .setOutputFile(REJECT)
            .unsetOption(Options.INDEX_ON_THE_FLY)
            .build();
    final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader());
    for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line);
    rejects.writeHeader(rejectHeader);

    ////////////////////////////////////////////////////////////////////////
    // Read the input VCF, lift the records over and write to the sorting
    // collection.
    ////////////////////////////////////////////////////////////////////////
    long failedLiftover = 0, failedAlleleCheck = 0, total = 0;
    logger.info("Lifting variants over and sorting.");

    final SortingCollection<VariantContext> sorter =
        SortingCollection.newInstance(
            VariantContext.class,
            new VCFRecordCodec(outHeader),
            outHeader.getVCFRecordComparator(),
            MAX_RECORDS_IN_RAM,
            TMP_DIR);

    ProgressLogger progress = new ProgressLogger(logger, 1000000, "read");

    for (final VariantContext ctx : in) {
        ++total;
        final Interval source =
            new Interval(
                ctx.getContig(),
                ctx.getStart(),
                ctx.getEnd(),
                false,
                ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd());
        final Interval target = liftOver.liftOver(source, 1.0);

        if (target == null) {
            rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER).make());
            failedLiftover++;
        } else {
            // Fix the alleles if we went from positive to negative strand
            final List<Allele> alleles = new ArrayList<>();
            for (final Allele oldAllele : ctx.getAlleles()) {
                if (target.isPositiveStrand() || oldAllele.isSymbolic()) {
                    alleles.add(oldAllele);
                } else {
                    alleles.add(
                        Allele.create(
                            SequenceUtil.reverseComplement(oldAllele.getBaseString()),
                            oldAllele.isReference()));
                }
            }

            // Build the new variant context
            final VariantContextBuilder builder =
                new VariantContextBuilder(
                    ctx.getSource(), target.getContig(), target.getStart(), target.getEnd(), alleles);
            builder.id(ctx.getID());
            builder.attributes(ctx.getAttributes());
            builder.genotypes(ctx.getGenotypes());
            builder.filters(ctx.getFilters());
            builder.log10PError(ctx.getLog10PError());

            // Check that the reference allele still agrees with the reference sequence
            boolean mismatchesReference = false;
            for (final Allele allele : builder.getAlleles()) {
                if (allele.isReference()) {
                    final byte[] ref = refSeqs.get(target.getContig());
                    final String refString =
                        StringUtil.bytesToString(ref, target.getStart() - 1, target.length());
                    if (!refString.equalsIgnoreCase(allele.getBaseString())) {
                        mismatchesReference = true;
                    }
                    break;
                }
            }

            if (mismatchesReference) {
                rejects.add(
                    new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make());
                failedAlleleCheck++;
            } else {
                sorter.add(builder.make());
            }
        }
        progress.record(ctx.getContig(), ctx.getStart());
    }

    final NumberFormat pfmt = new DecimalFormat("0.0000%");
    final String pct = pfmt.format((failedLiftover + failedAlleleCheck) / (double) total);
    logger.info("Processed ", total, " variants.");
    logger.info(Long.toString(failedLiftover), " variants failed to liftover.");
    logger.info(
        Long.toString(failedAlleleCheck),
        " variants lifted over but had mismatching reference alleles after lift over.");
    logger.info(pct, " of variants were not successfully lifted over and written to the output.");

    rejects.close();
    in.close();

    ////////////////////////////////////////////////////////////////////////
    // Write the sorted outputs to the final output file
    ////////////////////////////////////////////////////////////////////////
    sorter.doneAdding();
    progress = new ProgressLogger(logger, 1000000, "written");
    logger.info("Writing out sorted records to final VCF.");

    for (final VariantContext ctx : sorter) {
        out.add(ctx);
        progress.record(ctx.getContig(), ctx.getStart());
    }
    out.close();
    sorter.cleanup();

    return null;
}
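// Standalone sketch of the core liftover call used in doWork() above (htsjdk API as
// invoked there; the chain-file path and coordinates are placeholders): lift a single
// interval and inspect the target strand, which is what drives the allele
// reverse-complementing logic in the main loop.
public static void main(final String[] args) {
    final LiftOver liftOver = new LiftOver(new File("b37ToHg38.over.chain"));
    final Interval source = new Interval("1", 1000000, 1000000);
    final Interval target = liftOver.liftOver(source, 1.0); // 1.0 = require a full-length match

    if (target == null) {
        System.out.println("Interval could not be lifted over.");
    } else {
        System.out.printf(
            "%s:%d-%d strand=%s%n",
            target.getContig(),
            target.getStart(),
            target.getEnd(),
            target.isNegativeStrand() ? "-" : "+");
    }
}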