// protected basic manipulation routines

private static List<Allele> makeAlleles(Collection<Allele> alleles) {
    final List<Allele> alleleList = new ArrayList<Allele>(alleles.size());

    boolean sawRef = false;
    for (final Allele a : alleles) {
        for (final Allele b : alleleList) {
            if (a.equals(b, true))
                throw new IllegalArgumentException("Duplicate allele added to VariantContext: " + a);
        }

        // deal with the case where the first allele isn't the reference
        if (a.isReference()) {
            if (sawRef)
                throw new IllegalArgumentException(
                    "Alleles for a VariantContext must contain at most one reference allele: " + alleles);
            alleleList.add(0, a);
            sawRef = true;
        } else {
            alleleList.add(a);
        }
    }

    if (alleleList.isEmpty())
        throw new IllegalArgumentException("Cannot create a VariantContext with an empty allele list");

    if (alleleList.get(0).isNonReference())
        throw new IllegalArgumentException(
            "Alleles for a VariantContext must contain at least one reference allele: " + alleles);

    return alleleList;
}
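/*
 * Illustrative sketch (not part of the original source; method name is hypothetical and
 * assumes it lives in the same class as makeAlleles()): the routine moves the reference
 * allele to index 0 and rejects duplicates, comparing with ref-state ignored.
 */
private static void makeAllelesDemo() {
    final Allele ref = Allele.create("A", true);
    final Allele alt = Allele.create("T", false);

    // reference listed last on input, but it ends up first in the result
    final List<Allele> ordered = makeAlleles(Arrays.asList(alt, ref));
    assert ordered.get(0).isReference(); // A*, then T

    // adding T twice throws, even though the two Allele objects are distinct
    try {
        makeAlleles(Arrays.asList(ref, alt, Allele.create("T", false)));
    } catch (IllegalArgumentException expected) {
        // "Duplicate allele added to VariantContext: T"
    }
}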
private static Type typeOfBiallelicVariant(Allele ref, Allele allele) {
    if (ref.isSymbolic())
        throw new IllegalStateException(
            "Unexpected error: encountered a record with a symbolic reference allele");

    if (allele.isSymbolic())
        return Type.SYMBOLIC;

    if (ref.length() == allele.length()) {
        if (allele.length() == 1)
            return Type.SNP;
        else
            return Type.MNP;
    }

    // Important note: previously we were checking that one allele is the prefix of the other.
    // However, that's not an appropriate check as can be seen from the following example:
    // REF = CTTA and ALT = C,CT,CA
    // This should be assigned the INDEL type but was being marked as a MIXED type because of the
    // prefix check. In truth, it should be absolutely impossible to return a MIXED type from this
    // method because it simply performs a pairwise comparison of a single alternate allele against
    // the reference allele (whereas the MIXED type is reserved for cases of multiple alternate
    // alleles of different types). Therefore, if we've reached this point in the code (so we're
    // not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL.
    return Type.INDEL;

    // old incorrect logic:
    // if (oneIsPrefixOfOther(ref, allele))
    //     return Type.INDEL;
    // else
    //     return Type.MIXED;
}
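/*
 * Illustrative sketch (not part of the original source; method name is hypothetical and
 * assumes it lives in the same class as typeOfBiallelicVariant()): the REF = CTTA with
 * ALT = C,CT,CA case from the comment above. Every pairwise comparison against the
 * reference yields INDEL, never MIXED; equal-length pairs classify by length instead.
 */
private static void typeOfBiallelicVariantDemo() {
    final Allele ref = Allele.create("CTTA", true);
    for (final String altString : new String[] {"C", "CT", "CA"}) {
        assert typeOfBiallelicVariant(ref, Allele.create(altString, false)) == Type.INDEL;
    }

    assert typeOfBiallelicVariant(Allele.create("A", true), Allele.create("T")) == Type.SNP;
    assert typeOfBiallelicVariant(Allele.create("AT", true), Allele.create("CG")) == Type.MNP;
}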
protected void printVerboseData(
        String pos,
        VariantContext vc,
        double PofF,
        double phredScaledConfidence,
        final GenotypeLikelihoodsCalculationModel.Model model) {
    Allele refAllele = null, altAllele = null;
    for (Allele allele : vc.getAlleles()) {
        if (allele.isReference())
            refAllele = allele;
        else
            altAllele = allele;
    }

    for (int i = 0; i <= N; i++) {
        StringBuilder AFline = new StringBuilder("AFINFO\t");
        AFline.append(pos);
        AFline.append("\t");
        AFline.append(refAllele);
        AFline.append("\t");
        if (altAllele != null)
            AFline.append(altAllele);
        else
            AFline.append("N/A");
        AFline.append("\t");
        AFline.append(i + "/" + N + "\t");
        AFline.append(String.format("%.2f\t", ((float) i) / N));
        AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i]));
        verboseWriter.println(AFline.toString());
    }

    verboseWriter.println("P(f>0) = " + PofF);
    verboseWriter.println("Qscore = " + phredScaledConfidence);
    verboseWriter.println();
}
@Test
public void testFixReverseComplementedGenotypes() {
    final Allele refA = Allele.create("A", true);
    final Allele altC = Allele.create("C", false);
    final GenotypesContext originalGenotypes = GenotypesContext.create(3);
    originalGenotypes.add(new GenotypeBuilder("homref").alleles(Arrays.asList(refA, refA)).make());
    originalGenotypes.add(new GenotypeBuilder("het").alleles(Arrays.asList(refA, altC)).make());
    originalGenotypes.add(new GenotypeBuilder("homvar").alleles(Arrays.asList(altC, altC)).make());

    final Allele refT = Allele.create("T", true);
    final Allele altG = Allele.create("G", false);
    final GenotypesContext expectedGenotypes = GenotypesContext.create(3);
    expectedGenotypes.add(new GenotypeBuilder("homref").alleles(Arrays.asList(refT, refT)).make());
    expectedGenotypes.add(new GenotypeBuilder("het").alleles(Arrays.asList(refT, altG)).make());
    expectedGenotypes.add(new GenotypeBuilder("homvar").alleles(Arrays.asList(altG, altG)).make());

    final Map<Allele, Allele> reverseComplementAlleleMap = new HashMap<Allele, Allele>(2);
    reverseComplementAlleleMap.put(refA, refT);
    reverseComplementAlleleMap.put(altC, altG);

    final GenotypesContext actualGenotypes =
        LiftoverVcf.fixGenotypes(originalGenotypes, reverseComplementAlleleMap);
    for (final String sample : Arrays.asList("homref", "het", "homvar")) {
        final List<Allele> expected = expectedGenotypes.get(sample).getAlleles();
        final List<Allele> actual = actualGenotypes.get(sample).getAlleles();
        Assert.assertEquals(expected.get(0), actual.get(0));
        Assert.assertEquals(expected.get(1), actual.get(1));
    }
}
protected final void printCallInfo(
        final VariantContext vc,
        final double[] log10AlleleFrequencyPriors,
        final long runtimeNano,
        final AFCalcResult result) {
    printCallElement(vc, "type", "ignore", vc.getType());

    int allelei = 0;
    for (final Allele a : vc.getAlleles())
        printCallElement(vc, "allele", allelei++, a.getDisplayString());

    for (final Genotype g : vc.getGenotypes())
        printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString());

    for (int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++)
        printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]);

    printCallElement(vc, "runtime.nano", "ignore", runtimeNano);
    printCallElement(vc, "log10PosteriorOfAFEq0", "ignore", result.getLog10PosteriorOfAFEq0());
    printCallElement(vc, "log10PosteriorOfAFGt0", "ignore", result.getLog10PosteriorOfAFGT0());

    for (final Allele allele : result.getAllelesUsedInGenotyping()) {
        if (allele.isNonReference()) {
            printCallElement(vc, "MLE", allele, result.getAlleleCountAtMLE(allele));
            printCallElement(
                vc, "pNonRefByAllele", allele, result.getLog10PosteriorOfAFGt0ForAllele(allele));
        }
    }

    callReport.flush();
}
public boolean hasSymbolicAlleles() {
    for (final Allele a : getAlleles()) {
        if (a.isSymbolic()) {
            return true;
        }
    }
    return false;
}
/**
 * Returns the number of chromosomes carrying any allele in the genotypes (i.e., excluding
 * NO_CALLS)
 *
 * @return chromosome count
 */
public int getCalledChrCount() {
    int n = 0;

    for (final Genotype g : getGenotypes()) {
        for (final Allele a : g.getAlleles())
            n += a.isNoCall() ? 0 : 1;
    }

    return n;
}
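/*
 * Illustrative sketch (not part of the original source; method and sample names are
 * hypothetical): each called allele contributes one chromosome, and NO_CALLs are skipped,
 * so a diploid het (A/T) plus a half-call (./T) gives a count of 3.
 */
private static void calledChrCountDemo() {
    final Allele ref = Allele.create("A", true);
    final Allele alt = Allele.create("T", false);

    final GenotypesContext genotypes = GenotypesContext.create(2);
    genotypes.add(new GenotypeBuilder("het").alleles(Arrays.asList(ref, alt)).make());
    genotypes.add(new GenotypeBuilder("halfCall").alleles(Arrays.asList(Allele.NO_CALL, alt)).make());

    final VariantContext vc =
        new VariantContextBuilder("demo", "chr1", 1, 1, Arrays.asList(ref, alt))
            .genotypes(genotypes)
            .make();

    assert vc.getCalledChrCount() == 3; // A + T from "het", T from "halfCall"
}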
public int[] getGLIndecesOfAlternateAllele(Allele targetAllele) {
    // PL/GL indices are computed against the 1-based position of the target allele
    // among the alternate alleles (index 0 is reserved for the reference allele)
    int index = 1;
    for (Allele allele : getAlternateAlleles()) {
        if (allele.equals(targetAllele))
            break;
        index++;
    }

    return GenotypeLikelihoods.getPLIndecesOfAlleles(0, index);
}
public boolean hasAllele(Allele allele, boolean ignoreRefState) {
    if (allele == REF || allele == ALT) // optimization for cached cases
        return true;

    for (Allele a : getAlleles()) {
        if (a.equals(allele, ignoreRefState))
            return true;
    }

    return false;
}
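/*
 * Illustrative sketch (not part of the original source; method name is hypothetical):
 * with ignoreRefState set, an allele matches regardless of whether it was created as
 * REF or ALT.
 */
private static void hasAlleleDemo() {
    final Allele refA = Allele.create("A", true);
    final Allele altT = Allele.create("T", false);
    final VariantContext vc =
        new VariantContextBuilder("demo", "chr1", 1, 1, Arrays.asList(refA, altT)).make();

    assert vc.hasAllele(Allele.create("T"), false);  // exact match on the ALT
    assert !vc.hasAllele(Allele.create("A"), false); // A (non-ref) is not A* (ref)...
    assert vc.hasAllele(Allele.create("A"), true);   // ...unless ref-state is ignored
}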
private Collection<VariantContext> getVariantContexts(
        RefMetaDataTracker tracker, ReferenceContext ref) {
    List<Feature> features = tracker.getValues(variants, ref.getLocus());
    List<VariantContext> VCs = new ArrayList<VariantContext>(features.size());

    for (Feature record : features) {
        if (VariantContextAdaptors.canBeConvertedToVariantContext(record)) {
            // we need to special-case the HapMap format because indels aren't handled correctly
            if (record instanceof RawHapMapFeature) {

                // is it an indel?
                RawHapMapFeature hapmap = (RawHapMapFeature) record;
                if (hapmap.getAlleles()[0].equals(RawHapMapFeature.NULL_ALLELE_STRING)
                        || hapmap.getAlleles()[1].equals(RawHapMapFeature.NULL_ALLELE_STRING)) {
                    // get the dbSNP object corresponding to this record (needed to help us
                    // distinguish between insertions and deletions)
                    VariantContext dbsnpVC = getDbsnp(hapmap.getName());
                    if (dbsnpVC == null || dbsnpVC.isMixed())
                        continue;

                    Map<String, Allele> alleleMap = new HashMap<String, Allele>(2);
                    alleleMap.put(
                        RawHapMapFeature.DELETION,
                        Allele.create(ref.getBase(), dbsnpVC.isSimpleInsertion()));
                    alleleMap.put(
                        RawHapMapFeature.INSERTION,
                        Allele.create(
                            (char) ref.getBase() + ((RawHapMapFeature) record).getAlleles()[1],
                            !dbsnpVC.isSimpleInsertion()));
                    hapmap.setActualAlleles(alleleMap);

                    // also, use the correct positioning for insertions
                    hapmap.updatePosition(dbsnpVC.getStart());

                    if (hapmap.getStart() < ref.getWindow().getStart()) {
                        logger.warn(
                            "Hapmap record at " + ref.getLocus()
                                + " represents an indel too large to be converted; skipping...");
                        continue;
                    }
                }
            }

            // ok, we might actually be able to turn this record into a variant context
            VariantContext vc =
                VariantContextAdaptors.toVariantContext(variants.getName(), record, ref);

            if (vc != null) // sometimes the track has odd stuff in it that can't be converted
                VCs.add(vc);
        }
    }

    return VCs;
}
private void validateAlleles() {
    // check alleles
    boolean alreadySeenRef = false, alreadySeenNull = false;
    for (Allele allele : alleles) {
        // make sure there's only one reference allele
        if (allele.isReference()) {
            if (alreadySeenRef)
                throw new IllegalArgumentException(
                    "BUG: Received two reference tagged alleles in VariantContext " + alleles
                        + " this=" + this);
            alreadySeenRef = true;
        }

        if (allele.isNoCall()) {
            throw new IllegalArgumentException(
                "BUG: Cannot add a no call allele to a variant context " + alleles + " this=" + this);
        }

        // make sure there's only one null allele
        if (allele.isNull()) {
            if (alreadySeenNull)
                throw new IllegalArgumentException(
                    "BUG: Received two null alleles in VariantContext " + alleles + " this=" + this);
            alreadySeenNull = true;
        }
    }

    // make sure there's one reference allele
    if (!alreadySeenRef)
        throw new IllegalArgumentException("No reference allele found in VariantContext");

    // if ( getType() == Type.INDEL ) {
    //     if ( getReference().length() != (getLocation().size()-1) ) {
    long length = (stop - start) + 1;
    if ((getReference().isNull() && length != 1)
            || (getReference().isNonNull() && (length - getReference().length() > 1))) {
        throw new IllegalStateException(
            "BUG: GenomeLoc " + contig + ":" + start + "-" + stop + " has a size == " + length
                + " but the variation reference allele has length " + getReference().length()
                + " this = " + this);
    }
}
/**
 * Gets the sizes of the alternate alleles relative to the reference for insertion/deletion
 * events, i.e., alt.length() - ref.length() for each alternate allele
 *
 * @return a list of indel lengths (null if the variant is not of type INDEL or MIXED)
 */
public List<Integer> getIndelLengths() {
    if (getType() != Type.INDEL && getType() != Type.MIXED) {
        return null;
    }

    List<Integer> lengths = new ArrayList<Integer>();
    for (Allele a : getAlternateAlleles()) {
        lengths.add(a.length() - getReference().length());
    }

    return lengths;
}
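/*
 * Illustrative sketch (not part of the original source; method name is hypothetical):
 * for REF = ATC with ALT = A,ATCTC, the indel lengths are -2 (deletion) and +2 (insertion).
 */
private static void indelLengthsDemo() {
    final List<Allele> alleles = Arrays.asList(
        Allele.create("ATC", true),    // reference
        Allele.create("A", false),     // 2bp deletion
        Allele.create("ATCTC", false)  // 2bp insertion
    );
    final VariantContext vc =
        new VariantContextBuilder("demo", "chr1", 1, 3, alleles).make();

    // alt.length() - ref.length() for each alternate allele, in input order
    assert vc.getIndelLengths().equals(Arrays.asList(-2, 2));
}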
private void validateGenotypes() {
    if (this.genotypes == null)
        throw new IllegalStateException("Genotypes is null");

    for (final Genotype g : this.genotypes) {
        if (g.isAvailable()) {
            for (Allele gAllele : g.getAlleles()) {
                if (!hasAllele(gAllele) && gAllele.isCalled())
                    throw new IllegalStateException(
                        "Allele in genotype " + gAllele + " not in the variant context " + alleles);
            }
        }
    }
}
/**
 * Helper routine for subcontext
 *
 * @param genotypes genotypes
 * @return allele set
 */
private final Set<Allele> allelesOfGenotypes(Collection<Genotype> genotypes) {
    final Set<Allele> alleles = new HashSet<Allele>();

    boolean addedref = false;
    for (final Genotype g : genotypes) {
        for (final Allele a : g.getAlleles()) {
            addedref = addedref || a.isReference();
            if (a.isCalled())
                alleles.add(a);
        }
    }
    if (!addedref)
        alleles.add(getReference());

    return alleles;
}
private ReverseClippingPositionTestProvider(
        final int expectedClip, final String ref, final String... alleles) {
    super(ReverseClippingPositionTestProvider.class);
    this.ref = ref;
    for (final String allele : alleles)
        this.alleles.add(Allele.create(allele));
    this.expectedClip = expectedClip;
}
private VariantCallContext generateEmptyContext(
        RefMetaDataTracker tracker,
        ReferenceContext ref,
        Map<String, AlignmentContext> stratifiedContexts,
        AlignmentContext rawContext) {
    VariantContext vc;
    if (UAC.GenotypingMode
            == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
        VariantContext vcInput =
            UnifiedGenotyperEngine.getVCFromAllelesRod(
                tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles);
        if (vcInput == null)
            return null;
        vc =
            new VariantContextBuilder(
                    "UG_call",
                    ref.getLocus().getContig(),
                    vcInput.getStart(),
                    vcInput.getEnd(),
                    vcInput.getAlleles())
                .make();
    } else {
        // deal with bad/non-standard reference bases
        if (!Allele.acceptableAlleleBases(new byte[] {ref.getBase()}))
            return null;

        Set<Allele> alleles = new HashSet<Allele>();
        alleles.add(Allele.create(ref.getBase(), true));
        vc =
            new VariantContextBuilder(
                    "UG_call",
                    ref.getLocus().getContig(),
                    ref.getLocus().getStart(),
                    ref.getLocus().getStart(),
                    alleles)
                .make();
    }

    if (annotationEngine != null) {
        // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
        final ReadBackedPileup pileup = rawContext.getBasePileup();
        stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
        vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc);
    }

    return new VariantCallContext(vc, false);
}
private static final boolean hasPLIncompatibleAlleles(
        final Collection<Allele> alleleSet1, final Collection<Allele> alleleSet2) {
    final Iterator<Allele> it1 = alleleSet1.iterator();
    final Iterator<Allele> it2 = alleleSet2.iterator();

    while (it1.hasNext() && it2.hasNext()) {
        final Allele a1 = it1.next();
        final Allele a2 = it2.next();
        if (!a1.equals(a2))
            return true;
    }

    // by this point, at least one of the iterators is empty. All of the elements
    // we've compared are equal up until this point. But it's possible that the
    // sets aren't the same size, which is indicated by the test below. If they
    // are of the same size, though, the sets are compatible
    return it1.hasNext() || it2.hasNext();
}
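/*
 * Illustrative sketch (not part of the original source; method name is hypothetical and
 * assumes it lives in the same class as hasPLIncompatibleAlleles()): allele lists are
 * PL-compatible only when they match element-by-element and have the same length.
 */
private static void plCompatibilityDemo() {
    final Allele ref = Allele.create("A", true);
    final Allele alt1 = Allele.create("T", false);
    final Allele alt2 = Allele.create("C", false);

    // same alleles, same order => compatible
    assert !hasPLIncompatibleAlleles(Arrays.asList(ref, alt1), Arrays.asList(ref, alt1));
    // different alleles at the same position => incompatible
    assert hasPLIncompatibleAlleles(Arrays.asList(ref, alt1), Arrays.asList(ref, alt2));
    // one list is a strict prefix of the other => incompatible (sizes differ)
    assert hasPLIncompatibleAlleles(Arrays.asList(ref, alt1), Arrays.asList(ref, alt1, alt2));
}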
static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) {
    for (final Genotype gt1 : vc1.getGenotypes()) {
        Genotype gt2 = vc2.getGenotype(gt1.getSampleName());

        List<Allele> site1Alleles = gt1.getAlleles();
        List<Allele> site2Alleles = gt2.getAlleles();

        Iterator<Allele> all2It = site2Alleles.iterator();
        for (Allele all1 : site1Alleles) {
            Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable()

            if (all1.isNonReference() && all2.isNonReference()) // corresponding alleles are alternate
                return true;
        }
    }

    return false;
}
private RepeatDetectorTest(
        boolean isTrueRepeat, String ref, String refAlleleString, String... altAlleleStrings) {
    super(RepeatDetectorTest.class);
    this.ref = "N" + ref; // add a dummy base for the event here
    this.isTrueRepeat = isTrueRepeat;

    List<Allele> alleles = new LinkedList<Allele>();
    final Allele refAllele = Allele.create(refAlleleString, true);
    alleles.add(refAllele);
    for (final String altString : altAlleleStrings) {
        final Allele alt = Allele.create(altString, false);
        alleles.add(alt);
    }

    VariantContextBuilder builder =
        new VariantContextBuilder("test", "chr1", 1, 1 + refAllele.length(), alleles);
    this.vc = builder.make();
}
private Allele ensureMergedAllele(
        Allele all1, Allele all2, boolean creatingReferenceForFirstTime) {
    AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2);
    Allele mergedAllele = mergedAlleles.get(all12);

    if (mergedAllele == null) {
        // lay the merged bases out as: bases1 + intermediateBases + bases2
        byte[] bases1 = all1.getBases();
        byte[] bases2 = all2.getBases();

        byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length];
        System.arraycopy(bases1, 0, mergedBases, 0, bases1.length);
        if (intermediateBases != null)
            System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength);
        System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length);

        mergedAllele = Allele.create(mergedBases, creatingReferenceForFirstTime);
        mergedAlleles.put(all12, mergedAllele);
    }

    return mergedAllele;
}
/**
 * Returns a context identical to this with the REF and ALT alleles reverse complemented.
 *
 * @param vc variant context
 * @return new vc
 */
public static VariantContext reverseComplement(VariantContext vc) {
    // create a mapping from original allele to reverse complemented allele
    HashMap<Allele, Allele> alleleMap = new HashMap<Allele, Allele>(vc.getAlleles().size());
    for (Allele originalAllele : vc.getAlleles()) {
        Allele newAllele;
        if (originalAllele.isNoCall() || originalAllele.isNull())
            newAllele = originalAllele;
        else
            newAllele =
                Allele.create(
                    BaseUtils.simpleReverseComplement(originalAllele.getBases()),
                    originalAllele.isReference());
        alleleMap.put(originalAllele, newAllele);
    }

    // create new Genotype objects
    GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples());
    for (final Genotype genotype : vc.getGenotypes()) {
        List<Allele> newAlleles = new ArrayList<Allele>();
        for (Allele allele : genotype.getAlleles()) {
            Allele newAllele = alleleMap.get(allele);
            if (newAllele == null)
                newAllele = Allele.NO_CALL;
            newAlleles.add(newAllele);
        }
        newGenotypes.add(Genotype.modifyAlleles(genotype, newAlleles));
    }

    return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make();
}
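/*
 * Illustrative sketch (not part of the original source; method name is hypothetical and
 * assumes it lives in the same class as reverseComplement()): reverse complementing a SNP
 * swaps each allele for its complement while preserving ref/alt status.
 */
private static void reverseComplementDemo() {
    final Allele refA = Allele.create("A", true);
    final Allele altC = Allele.create("C", false);
    final VariantContext vc =
        new VariantContextBuilder("demo", "chr1", 1, 1, Arrays.asList(refA, altC)).make();

    final VariantContext rc = reverseComplement(vc);
    assert rc.getReference().equals(Allele.create("T", true));
    assert rc.getAlternateAllele(0).equals(Allele.create("G", false));
}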
public void validateReferenceBases(Allele reference, Byte paddedRefBase) {
    if (reference == null)
        return;

    // don't validate if we're a complex event
    if (!isComplexIndel() && !reference.isNull() && !reference.basesMatch(getReference())) {
        throw new TribbleException.InternalCodecException(
            String.format(
                "the REF allele is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s",
                getChr(), getStart(), reference.getBaseString(), getReference().getBaseString()));
    }

    // we also need to validate the padding base for simple indels
    if (hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase)) {
        throw new TribbleException.InternalCodecException(
            String.format(
                "the padded REF base is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s",
                getChr(), getStart(), (char) paddedRefBase.byteValue(),
                (char) getReferenceBaseForIndel().byteValue()));
    }
}
static boolean doubleAllelesSegregatePerfectlyAmongSamples(
        VariantContext vc1, VariantContext vc2) {
    // Check that the alleles at vc1 and at vc2 always segregate together in all samples
    // (including the reference):
    Map<Allele, Allele> allele1ToAllele2 = new HashMap<Allele, Allele>();
    Map<Allele, Allele> allele2ToAllele1 = new HashMap<Allele, Allele>();

    // Note the segregation of the alleles for the reference genome:
    allele1ToAllele2.put(vc1.getReference(), vc2.getReference());
    allele2ToAllele1.put(vc2.getReference(), vc1.getReference());

    // Note the segregation of the alleles for each sample (and check that it is consistent with
    // the reference and all previous samples).
    for (final Genotype gt1 : vc1.getGenotypes()) {
        Genotype gt2 = vc2.getGenotype(gt1.getSampleName());

        List<Allele> site1Alleles = gt1.getAlleles();
        List<Allele> site2Alleles = gt2.getAlleles();

        Iterator<Allele> all2It = site2Alleles.iterator();
        for (Allele all1 : site1Alleles) {
            Allele all2 = all2It.next();

            Allele all1To2 = allele1ToAllele2.get(all1);
            if (all1To2 == null)
                allele1ToAllele2.put(all1, all2);
            else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2
                return false;

            Allele all2To1 = allele2ToAllele1.get(all2);
            if (all2To1 == null)
                allele2ToAllele1.put(all2, all1);
            else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1
                return false;
        }
    }

    return true;
}
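/*
 * Illustrative sketch (not part of the original source; method and sample names are
 * hypothetical, and the demo assumes it lives in the same class as the method above):
 * alleles segregate perfectly when every sample pairs each site-1 allele with the same
 * site-2 allele, forming a consistent bijection in both directions.
 */
private static void segregationDemo() {
    final Allele ref1 = Allele.create("A", true), alt1 = Allele.create("T", false);
    final Allele ref2 = Allele.create("C", true), alt2 = Allele.create("G", false);

    final GenotypesContext g1 = GenotypesContext.create(1);
    g1.add(new GenotypeBuilder("s1").alleles(Arrays.asList(ref1, alt1)).make());
    final GenotypesContext g2 = GenotypesContext.create(1);
    g2.add(new GenotypeBuilder("s1").alleles(Arrays.asList(ref2, alt2)).make());

    final VariantContext vc1 =
        new VariantContextBuilder("demo", "chr1", 1, 1, Arrays.asList(ref1, alt1))
            .genotypes(g1)
            .make();
    final VariantContext vc2 =
        new VariantContextBuilder("demo", "chr1", 5, 5, Arrays.asList(ref2, alt2))
            .genotypes(g2)
            .make();

    // ref1<->ref2 and alt1<->alt2 pair consistently across the (single) sample
    assert doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc2);
}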
private static Allele determineReferenceAllele(List<VariantContext> VCs) {
    Allele ref = null;

    for (VariantContext vc : VCs) {
        Allele myRef = vc.getReference();
        if (ref == null || ref.length() < myRef.length())
            ref = myRef;
        else if (ref.length() == myRef.length() && !ref.equals(myRef))
            throw new UserException.BadInput(
                String.format(
                    "The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s",
                    vc.getChr(), vc.getStart(), ref, myRef));
    }

    return ref;
}
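/*
 * Illustrative sketch (not part of the original source; method name is hypothetical and
 * assumes it lives in the same class as determineReferenceAllele()): the longest reference
 * allele wins, and equal-length disagreements raise a UserException.BadInput.
 */
private static void determineReferenceAlleleDemo() {
    final VariantContext snp =
        new VariantContextBuilder(
                "a", "chr1", 10, 10,
                Arrays.asList(Allele.create("A", true), Allele.create("T")))
            .make();
    final VariantContext del =
        new VariantContextBuilder(
                "b", "chr1", 10, 12,
                Arrays.asList(Allele.create("ATC", true), Allele.create("A")))
            .make();

    // ATC is longer than A, so it becomes the shared reference
    assert determineReferenceAllele(Arrays.asList(snp, del)).basesMatch("ATC");
}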
@BeforeSuite
public void setup() {
    final File referenceFile = new File(b37KGReference);
    try {
        IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile);
        genomeLocParser = new GenomeLocParser(seq);
    } catch (FileNotFoundException ex) {
        throw new UserException.CouldNotReadInputFile(referenceFile, ex);
    }

    // alleles
    Aref = Allele.create("A", true);
    Cref = Allele.create("C", true);
    T = Allele.create("T");
    C = Allele.create("C");
    ATC = Allele.create("ATC");
    ATCATC = Allele.create("ATCATC");
}
/**
 * Read in a list of ExactCall objects from reader, keeping only those whose start positions are
 * in startsToKeep, or all calls if startsToKeep is empty
 *
 * @param reader a just-opened reader sitting at the start of the file
 * @param startsToKeep a list of start positions of the calls to keep, or empty if all calls
 *     should be kept
 * @param parser a genome loc parser to create genome locs
 * @return a list of ExactCall objects in reader
 * @throws IOException if reading from reader fails
 */
public static List<ExactCall> readExactLog(
        final BufferedReader reader, final List<Integer> startsToKeep, GenomeLocParser parser)
        throws IOException {
    if (reader == null) throw new IllegalArgumentException("reader cannot be null");
    if (startsToKeep == null) throw new IllegalArgumentException("startsToKeep cannot be null");
    if (parser == null) throw new IllegalArgumentException("GenomeLocParser cannot be null");

    List<ExactCall> calls = new LinkedList<ExactCall>();

    // skip the header line
    reader.readLine();

    // skip the first "type" line
    reader.readLine();

    while (true) {
        final VariantContextBuilder builder = new VariantContextBuilder();
        final List<Allele> alleles = new ArrayList<Allele>();
        final List<Genotype> genotypes = new ArrayList<Genotype>();
        final double[] posteriors = new double[2];
        final double[] priors = MathUtils.normalizeFromLog10(new double[] {0.5, 0.5}, true);
        final List<Integer> mle = new ArrayList<Integer>();
        final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>();
        long runtimeNano = -1;

        GenomeLoc currentLoc = null;
        while (true) {
            final String line = reader.readLine();
            if (line == null)
                return calls;

            final String[] parts = line.split("\t");
            final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]);
            final String variable = parts[1];
            final String key = parts[2];
            final String value = parts[3];

            if (currentLoc == null)
                currentLoc = lineLoc;

            if (variable.equals("type")) {
                if (startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart())) {
                    builder.alleles(alleles);
                    final int stop = currentLoc.getStart() + alleles.get(0).length() - 1;
                    builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop);
                    builder.genotypes(genotypes);
                    final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[] {}));
                    final AFCalcResult result =
                        new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele);
                    calls.add(new ExactCall(builder.make(), runtimeNano, result));
                }
                break;
            } else if (variable.equals("allele")) {
                final boolean isRef = key.equals("0");
                alleles.add(Allele.create(value, isRef));
            } else if (variable.equals("PL")) {
                final GenotypeBuilder gb = new GenotypeBuilder(key);
                gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs());
                genotypes.add(gb.make());
            } else if (variable.equals("log10PosteriorOfAFEq0")) {
                posteriors[0] = Double.valueOf(value);
            } else if (variable.equals("log10PosteriorOfAFGt0")) {
                posteriors[1] = Double.valueOf(value);
            } else if (variable.equals("MLE")) {
                mle.add(Integer.valueOf(value));
            } else if (variable.equals("pNonRefByAllele")) {
                final Allele a = Allele.create(key);
                log10pNonRefByAllele.put(a, Double.valueOf(value));
            } else {
                // nothing to do
            }
        }
    }
}
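/*
 * Illustrative sketch (not part of the original source; the method name and the log file
 * path are hypothetical): reading back an exact log while keeping only calls that start
 * at a selected position. Passing an empty startsToKeep list keeps every call.
 */
private static List<ExactCall> readExactLogDemo(final GenomeLocParser parser) throws IOException {
    final BufferedReader reader = new BufferedReader(new FileReader("exact.log")); // hypothetical path
    try {
        return readExactLog(reader, Arrays.asList(1000000), parser);
    } finally {
        reader.close();
    }
}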
public void writeBeagleOutput(
        VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
    GenomeLoc currentLoc =
        VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), preferredVC);
    StringBuffer beagleOut = new StringBuffer();

    String marker = String.format("%s:%d ", currentLoc.getContig(), currentLoc.getStart());
    beagleOut.append(marker);
    if (markers != null)
        markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t");

    for (Allele allele : preferredVC.getAlleles()) {
        String bglPrintString;
        if (allele.isNoCall() || allele.isNull())
            bglPrintString = "-";
        else
            bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele

        beagleOut.append(String.format("%s ", bglPrintString));
        if (markers != null)
            markers.append(bglPrintString).append("\t");
    }
    if (markers != null)
        markers.append("\n");

    GenotypesContext preferredGenotypes = preferredVC.getGenotypes();
    GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
    for (String sample : samples) {
        boolean isMaleOnChrX =
            CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE;

        Genotype genotype;
        boolean isValidation;
        // use sample as key into genotypes structure
        if (preferredGenotypes.containsSample(sample)) {
            genotype = preferredGenotypes.get(sample);
            isValidation = isValidationSite;
        } else if (otherGenotypes != null && otherGenotypes.containsSample(sample)) {
            genotype = otherGenotypes.get(sample);
            isValidation = !isValidationSite;
        } else {
            // there is magically no genotype for this sample.
            throw new StingException(
                "Sample " + sample
                    + " arose with no genotype in variant or validation VCF. This should never happen.");
        }

        /*
         * Use likelihoods if this is a validation site with a negative prior, or if the
         * genotype has likelihoods attached.
         */
        double[] log10Likelihoods = null;
        if ((isValidation && prior < 0.0) || genotype.hasLikelihoods()) {
            log10Likelihoods = genotype.getLikelihoods().getAsVector();

            // see if we need to randomly mask out the genotype at this position.
            if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() <= insertedNoCallRate) {
                // we are masking out this genotype
                log10Likelihoods =
                    isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
            }

            if (isMaleOnChrX) {
                log10Likelihoods[1] = -255; // todo -- warning this is dangerous for multi-allele case
            }
        }
        // otherwise, use the prior uniformly
        else if (!isValidation && genotype.isCalled() && !genotype.hasLikelihoods()) {
            // hack to deal with input VCFs with no genotype likelihoods. Just assume the called
            // genotype is confident. This is useful for HapMap and 1KG release VCFs.
            double AA = (1.0 - prior) / 2.0;
            double AB = (1.0 - prior) / 2.0;
            double BB = (1.0 - prior) / 2.0;

            if (genotype.isHomRef()) {
                AA = prior;
            } else if (genotype.isHet()) {
                AB = prior;
            } else if (genotype.isHomVar()) {
                BB = prior;
            }

            log10Likelihoods = MathUtils.toLog10(new double[] {AA, isMaleOnChrX ? 0.0 : AB, BB});
        } else {
            log10Likelihoods =
                isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
        }

        writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods);
    }

    beagleWriter.println(beagleOut.toString());
}
/**
 * Analyze coverage distribution and validate read mates per interval and per sample
 *
 * <p>This tool is useful for diagnosing regions with bad coverage, mapping, or read mate pairs. It
 * analyzes each sample independently and aggregates results over intervals of interest.
 * Low-coverage regions can be identified by using e.g. FindCoveredIntervals with the -uncovered
 * argument.
 *
 * <h3>Input</h3>
 *
 * <ul>
 *   <li>A reference file
 *   <li>One or more input BAMs
 *   <li>One or more intervals
 * </ul>
 *
 * <h3>Output</h3>
 *
 * <p>A modified VCF detailing each interval by sample and information for each interval according
 * to the thresholds used. Interval information includes GC content, average interval depth, and
 * callable status, among others. If you use the --missing option, you can get, as a second
 * output, an intervals file with the loci that have missing data. This file can then be used as
 * input to QualifyMissingIntervals for full qualification and interpretation of why the data is
 * missing.
 *
 * <h3>Usage example</h3>
 *
 * <pre>
 * java -jar GenomeAnalysisTK.jar \
 *   -T DiagnoseTargets \
 *   -R reference.fasta \
 *   -I sample1.bam \
 *   -I sample2.bam \
 *   -I sample3.bam \
 *   -L intervals.interval_list \
 *   -o output.vcf
 * </pre>
 *
 * @author Mauricio Carneiro, Roger Zurawicki
 * @since 5/8/12
 */
@DocumentedGATKFeature(
    groupName = HelpConstants.DOCS_CAT_QC,
    extraDocs = {CommandLineGATK.class})
@By(value = DataSource.READS)
@PartitionBy(PartitionType.INTERVAL)
@Downsample(by = DownsampleType.NONE)
public class DiagnoseTargets extends LocusWalker<Long, Long> {
    @Output(doc = "File to which interval statistics should be written")
    private VariantContextWriter vcfWriter = null;

    @ArgumentCollection private ThresHolder thresholds = new ThresHolder();

    // maps each interval => statistics
    private Map<GenomeLoc, IntervalStratification> intervalMap = null;

    // an iterator to go over all the intervals provided as we traverse the genome
    private PeekableIterator<GenomeLoc> intervalListIterator;

    // all the samples being processed
    private Set<String> samples = null;

    // avoid creating the symbolic allele multiple times
    private static final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false);

    // avoid creating the 'fake' ref allele for uncovered intervals multiple times
    private static final Allele UNCOVERED_ALLELE = Allele.create("A", true);

    // enough room for potential overlapping intervals plus recently finished intervals
    private static final int INITIAL_HASH_SIZE = 50;

    @Override
    public void initialize() {
        super.initialize();

        if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty())
            throw new UserException(
                "This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead.");

        intervalMap = new LinkedHashMap<>(INITIAL_HASH_SIZE);
        intervalListIterator = new PeekableIterator<>(getToolkit().getIntervals().iterator());

        // get all of the unique sample names for the VCF Header
        samples = ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
        vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples));

        // pre-load all the statistics classes because it is costly to operate on the JVM and we
        // only want to do it once.
        loadAllPlugins(thresholds);
    }

    @Override
    public Long map(
            final RefMetaDataTracker tracker,
            final ReferenceContext ref,
            final AlignmentContext context) {
        GenomeLoc refLocus = ref.getLocus();

        // add all new intervals that may overlap this reference locus, then process and remove
        // any intervals in the map that don't overlap the current locus anymore
        addNewOverlappingIntervals(refLocus);
        outputFinishedIntervals(refLocus, ref.getBase());

        // at this point, all intervals in intervalMap overlap with this locus, so update all of them
        for (IntervalStratification intervalStratification : intervalMap.values())
            intervalStratification.addLocus(context, ref);

        return 1L;
    }

    @Override
    public Long reduceInit() {
        return 0L;
    }

    /**
     * Not sure what we are going to do here
     *
     * @param value result of the map.
     * @param sum accumulator for the reduce.
     * @return a long
     */
    @Override
    public Long reduce(Long value, Long sum) {
        return sum + value;
    }

    /**
     * Process all remaining intervals
     *
     * @param result number of loci processed by the walker
     */
    @Override
    public void onTraversalDone(final Long result) {
        for (GenomeLoc interval : intervalMap.keySet())
            outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE);

        GenomeLoc interval = intervalListIterator.peek();
        while (interval != null) {
            outputStatsToVCF(createIntervalStatistic(interval), UNCOVERED_ALLELE);
            intervalListIterator.next();
            interval = intervalListIterator.peek();
        }

        if (thresholds.missingTargets != null) {
            thresholds.missingTargets.close();
        }
    }

    /**
     * Outputs all intervals that are behind the current reference locus
     *
     * @param refLocus the current reference locus
     * @param refBase the reference base
     */
    private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
        // output any intervals that were finished
        final List<GenomeLoc> toRemove = new LinkedList<>();
        for (GenomeLoc key : intervalMap.keySet()) {
            if (key.isBefore(refLocus)) {
                final IntervalStratification intervalStats = intervalMap.get(key);
                outputStatsToVCF(intervalStats, Allele.create(refBase, true));
                if (hasMissingLoci(intervalStats)) {
                    outputMissingInterval(intervalStats);
                }
                toRemove.add(key);
            }
        }
        for (GenomeLoc key : toRemove) {
            intervalMap.remove(key);
        }
    }

    /**
     * Adds all intervals that overlap the current reference locus to the intervalMap
     *
     * @param refLocus the current reference locus
     */
    private void addNewOverlappingIntervals(final GenomeLoc refLocus) {
        GenomeLoc interval = intervalListIterator.peek();
        while (interval != null && !interval.isPast(refLocus)) {
            intervalMap.put(interval, createIntervalStatistic(interval));
            intervalListIterator.next();
            interval = intervalListIterator.peek();
        }
    }

    /**
     * Takes the interval, finds it in the stash, prints it to the VCF
     *
     * @param stats the statistics of the interval
     * @param refAllele the reference allele
     */
    private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) {
        GenomeLoc interval = stats.getInterval();

        final List<Allele> alleles = new ArrayList<>();
        final Map<String, Object> attributes = new HashMap<>();
        final ArrayList<Genotype> genotypes = new ArrayList<>();

        for (String sample : samples) {
            final GenotypeBuilder gb = new GenotypeBuilder(sample);

            SampleStratification sampleStat = stats.getSampleStatistics(sample);
            gb.attribute(
                GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY,
                sampleStat.averageCoverage(interval.size()));
            gb.attribute(GATKVCFConstants.LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci());
            gb.attribute(GATKVCFConstants.ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci());
            gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false));

            genotypes.add(gb.make());
        }
        alleles.add(refAllele);
        alleles.add(SYMBOLIC_ALLELE);
        VariantContextBuilder vcb =
            new VariantContextBuilder(
                "DiagnoseTargets",
                interval.getContig(),
                interval.getStart(),
                interval.getStop(),
                alleles);

        vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR);
        vcb.filters(new LinkedHashSet<>(statusToStrings(stats.callableStatuses(), true)));

        attributes.put(VCFConstants.END_KEY, interval.getStop());
        attributes.put(GATKVCFConstants.AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size()));
        attributes.put(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY, stats.gcContent());

        vcb = vcb.attributes(attributes);
        vcb = vcb.genotypes(genotypes);

        vcfWriter.add(vcb.make());
    }

    private boolean hasMissingStatuses(AbstractStratification stats) {
        return !stats.callableStatuses().isEmpty();
    }

    private boolean hasMissingLoci(final IntervalStratification stats) {
        return thresholds.missingTargets != null && hasMissingStatuses(stats);
    }

    private void outputMissingInterval(final IntervalStratification stats) {
        final GenomeLoc interval = stats.getInterval();
        final boolean[] missing = new boolean[interval.size()];
        Arrays.fill(missing, true);
        for (AbstractStratification sample : stats.getElements()) {
            if (hasMissingStatuses(sample)) {
                int pos = 0;
                for (AbstractStratification locus : sample.getElements()) {
                    if (locus.callableStatuses().isEmpty()) {
                        missing[pos] = false;
                    }
                    pos++;
                }
            }
        }

        // walk the missing[] mask and emit one interval per contiguous run of missing loci
        int start = -1;
        boolean insideMissing = false;
        for (int i = 0; i < missing.length; i++) {
            if (missing[i] && !insideMissing) {
                start = interval.getStart() + i;
                insideMissing = true;
            } else if (!missing[i] && insideMissing) {
                final int stop = interval.getStart() + i - 1;
                outputMissingInterval(interval.getContig(), start, stop);
                insideMissing = false;
            }
        }
        if (insideMissing) {
            outputMissingInterval(interval.getContig(), start, interval.getStop());
        }
    }

    private void outputMissingInterval(final String contig, final int start, final int stop) {
        final PrintStream out = thresholds.missingTargets;
        out.println(String.format("%s:%d-%d", contig, start, stop));
    }

    /**
     * Converts a set of statuses into strings
     *
     * @param statuses the set of statuses to be converted
     * @param isInfoField if true, include PASS statuses (as appropriate for the INFO field)
     * @return a matching list of strings
     */
    private List<String> statusToStrings(
            Iterable<CallableStatus> statuses, final boolean isInfoField) {
        List<String> output = new LinkedList<>();

        for (CallableStatus status : statuses)
            if (isInfoField || status != CallableStatus.PASS)
                output.add(status.name());

        return output;
    }

    private IntervalStratification createIntervalStatistic(GenomeLoc interval) {
        return new IntervalStratification(samples, interval, thresholds);
    }

    protected static void loadAllPlugins(final ThresHolder thresholds) {
        for (Class<?> stat : new PluginManager<LocusMetric>(LocusMetric.class).getPlugins()) {
            try {
                final LocusMetric stats = (LocusMetric) stat.newInstance();
                stats.initialize(thresholds);
                thresholds.locusMetricList.add(stats);
            } catch (Exception e) {
                throw new DynamicClassResolutionException(stat, e);
            }
        }

        for (Class<?> stat : new PluginManager<SampleMetric>(SampleMetric.class).getPlugins()) {
            try {
                final SampleMetric stats = (SampleMetric) stat.newInstance();
                stats.initialize(thresholds);
                thresholds.sampleMetricList.add(stats);
            } catch (Exception e) {
                throw new DynamicClassResolutionException(stat, e);
            }
        }

        for (Class<?> stat : new PluginManager<IntervalMetric>(IntervalMetric.class).getPlugins()) {
            try {
                final IntervalMetric stats = (IntervalMetric) stat.newInstance();
                stats.initialize(thresholds);
                thresholds.intervalMetricList.add(stats);
            } catch (Exception e) {
                throw new DynamicClassResolutionException(stat, e);
            }
        }
    }

    /**
     * Gets the header lines for the VCF writer
     *
     * @return A set of VCF header lines
     */
    private static Set<VCFHeaderLine> getHeaderInfo() {
        Set<VCFHeaderLine> headerLines = new HashSet<>();

        // INFO fields for overall data
        headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
        headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AVG_INTERVAL_DP_KEY));
        headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY));
        headerLines.add(
            new VCFInfoHeaderLine(
                "Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));

        // FORMAT fields for each genotype
        headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
        headerLines.add(
            GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY));
        headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.LOW_COVERAGE_LOCI));
        headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.ZERO_COVERAGE_LOCI));

        // FILTER fields
        for (CallableStatus stat : CallableStatus.values())
            headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description));

        return headerLines;
    }
}
/**
 * @return the allele sharing the same bases as the given byte[], or null if no such allele is
 *     present.
 */
public Allele getAllele(byte[] allele) {
    return Allele.getMatchingAllele(getAlleles(), allele);
}
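/*
 * Illustrative sketch (not part of the original source; method name is hypothetical):
 * byte-level allele lookup matches on bases only, via Allele.getMatchingAllele().
 */
private static void getAlleleDemo() {
    final VariantContext vc =
        new VariantContextBuilder(
                "demo", "chr1", 1, 1,
                Arrays.asList(Allele.create("A", true), Allele.create("T", false)))
            .make();

    assert vc.getAllele("T".getBytes()) != null; // present
    assert vc.getAllele("G".getBytes()) == null; // absent
}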