private void purgeQueue() {
    final ReferenceContext refContext = queue.getFirst().ref;

    // divide them up by source
    while (!queue.isEmpty()) {
        VCcontext context = queue.removeFirst();
        for (final VariantContext vc : context.vcs) {
            if (vc.getSource().equals(source1)) sourceVCs1.add(vc);
            else sourceVCs2.add(vc);
        }
    }

    writeAndPurgeAllEqualVariants(sourceVCs1, sourceVCs2, SAME_STATUS);
    if (sourceVCs1.isEmpty()) {
        writeAll(sourceVCs2, source2, null);
    } else if (sourceVCs2.isEmpty()) {
        writeAll(sourceVCs1, source1, null);
    } else {
        resolveByHaplotype(refContext);
    }

    // allow for GC of the data
    sourceVCs1.clear();
    sourceVCs2.clear();
}
@Test
public void testVCFHeaderSampleRenamingSingleSampleVCF() throws Exception {
    final VCFCodec codec = new VCFCodec();
    codec.setRemappedSampleName("FOOSAMPLE");
    final AsciiLineReaderIterator vcfIterator =
        new AsciiLineReaderIterator(
            new AsciiLineReader(new FileInputStream(variantTestDataRoot + "HiSeq.10000.vcf")));
    final VCFHeader header = (VCFHeader) codec.readHeader(vcfIterator).getHeaderValue();

    Assert.assertEquals(
        header.getNGenotypeSamples(), 1, "Wrong number of samples in remapped header");
    Assert.assertEquals(
        header.getGenotypeSamples().get(0),
        "FOOSAMPLE",
        "Sample name in remapped header has incorrect value");

    int recordCount = 0;
    while (vcfIterator.hasNext() && recordCount < 10) {
        recordCount++;
        final VariantContext vcfRecord = codec.decode(vcfIterator.next());
        Assert.assertEquals(
            vcfRecord.getSampleNames().size(),
            1,
            "Wrong number of samples in vcf record after remapping");
        Assert.assertEquals(
            vcfRecord.getSampleNames().iterator().next(),
            "FOOSAMPLE",
            "Wrong sample in vcf record after remapping");
    }
}
private VariantContext getDbsnp(String rsID) {
    if (dbsnpIterator == null) {
        if (dbsnp == null)
            throw new UserException.BadInput(
                "No dbSNP rod was provided, but one is needed to decipher the correct indel alleles from the HapMap records");

        RMDTrackBuilder builder =
            new RMDTrackBuilder(
                getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),
                getToolkit().getGenomeLocParser(),
                getToolkit().getArguments().unsafe,
                getToolkit().getArguments().disableAutoIndexCreationAndLockingWhenReadingRods,
                null);
        dbsnpIterator =
            builder
                .createInstanceOfTrack(VCFCodec.class, new File(dbsnp.dbsnp.getSource()))
                .getIterator();
        // Note that we should really use some sort of seekable iterator here so that the search
        // doesn't take forever (but it's complicated because the hapmap location doesn't match
        // the dbsnp location, so we don't know where to seek to)
    }

    while (dbsnpIterator.hasNext()) {
        GATKFeature feature = dbsnpIterator.next();
        VariantContext vc = (VariantContext) feature.getUnderlyingObject();
        if (vc.getID().equals(rsID)) return vc;
    }

    return null;
}
/**
 * For each variant in the file, compute the somatic status probability (LOD score), annotate the
 * record accordingly, and write it to the output VCF.
 *
 * @param tracker the reference meta-data tracker
 * @param ref the reference context
 * @param context the alignment context
 * @return null
 */
@Override
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    if (tracker != null) {
        for (VariantContext vc :
            tracker.getValues(variantCollection.variants, context.getLocation())) {
            vc = vc.subContextFromSamples(samples);
            if (!vc.isPolymorphicInSamples()) continue;

            double log10pSomatic = calcLog10pSomatic(vc);

            // write in the somatic status probability
            Map<String, Object> attrs = new HashMap<String, Object>();
            if (!minimalVCF) attrs.putAll(vc.getAttributes());
            attrs.put(SOMATIC_LOD_TAG_NAME, log10pSomatic);
            if (log10pSomatic > somaticMinLOD) {
                attrs.put(VCFConstants.SOMATIC_KEY, true);
                attrs.put(SOMATIC_NONREF_TAG_NAME, calculateTumorNNR(vc));
                attrs.put(SOMATIC_AC_TAG_NAME, calculateTumorAC(vc));
            }

            final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(attrs);
            VariantContextUtils.calculateChromosomeCounts(builder, false);
            VariantContext newvc = builder.make();

            vcfWriter.add(newvc);
        }

        return null;
    }

    return null;
}
private boolean determineAndWriteOverlap(
    final VariantContext vc1, final VariantContext vc2, final String status) {
    final int allelesFrom1In2 = findOverlap(vc1, vc2);
    final int allelesFrom2In1 = findOverlap(vc2, vc1);
    final int totalAllelesIn1 = vc1.getAlternateAlleles().size();
    final int totalAllelesIn2 = vc2.getAlternateAlleles().size();

    final boolean allAllelesFrom1Overlap = allelesFrom1In2 == totalAllelesIn1;
    final boolean allAllelesFrom2Overlap = allelesFrom2In1 == totalAllelesIn2;

    boolean thereIsOverlap = true;
    if (allAllelesFrom1Overlap && allAllelesFrom2Overlap) {
        writeOne(vc1, INTERSECTION_SET, status);
    } else if (allAllelesFrom1Overlap) {
        writeOne(vc2, INTERSECTION_SET, source1 + "IsSubsetOf" + source2);
    } else if (allAllelesFrom2Overlap) {
        writeOne(vc1, INTERSECTION_SET, source2 + "IsSubsetOf" + source1);
    } else if (allelesFrom1In2 > 0) {
        writeOne(vc1, INTERSECTION_SET, SOME_ALLELES_MATCH_STATUS);
    } else if (totalAllelesIn1 > 1 || totalAllelesIn2 > 1) {
        // we don't handle multi-allelics in the haplotype-based reconstruction
        writeOne(vc1, INTERSECTION_SET, SAME_START_DIFFERENT_ALLELES_STATUS);
    } else {
        thereIsOverlap = false;
    }

    return thereIsOverlap;
}
private static int findOverlap(final VariantContext target, final VariantContext comparison) {
    int overlap = 0;
    for (final Allele allele : target.getAlternateAlleles()) {
        if (comparison.hasAlternateAllele(allele)) overlap++;
    }
    return overlap;
}
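// Hypothetical usage sketch for findOverlap (not part of the original tool): it counts how many
// alternate alleles of `target` are also alternate alleles of `comparison`. The positions and
// bases below are invented purely for illustration; the htsjdk calls used (Allele.create,
// VariantContextBuilder, getAlternateAlleles, hasAlternateAllele) are standard API.
private static void findOverlapExample() {
    final Allele ref = Allele.create("A", true);
    final Allele altC = Allele.create("C", false);
    final Allele altG = Allele.create("G", false);

    final VariantContext multiAllelic =
        new VariantContextBuilder("demo", "1", 100, 100, Arrays.asList(ref, altC, altG)).make();
    final VariantContext biAllelic =
        new VariantContextBuilder("demo", "1", 100, 100, Arrays.asList(ref, altC)).make();

    // only one of {C, G} is present in the bi-allelic record, so the overlap count is 1
    final int overlap = findOverlap(multiAllelic, biAllelic);
    assert overlap == 1;
}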
private static void testVCsAreEqual(
    final List<VariantContext> VCs1, final List<VariantContext> VCs2) {
    Assert.assertEquals(VCs1.size(), VCs2.size(), "number of Variant Contexts");
    for (int i = 0; i < VCs1.size(); i++) {
        final VariantContext vc1 = VCs1.get(i);
        final VariantContext vc2 = VCs2.get(i);
        Assert.assertEquals(vc1.toStringDecodeGenotypes(), vc2.toStringDecodeGenotypes());
    }
}
@Override
public VariantContext next() {
    try {
        final VariantContext vc = codec.decode(source);
        return vc == null ? null : vc.fullyDecode(header, false);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
private Collection<VariantContext> getVariantContexts(
    RefMetaDataTracker tracker, ReferenceContext ref) {
    List<Feature> features = tracker.getValues(variants, ref.getLocus());
    List<VariantContext> VCs = new ArrayList<VariantContext>(features.size());

    for (Feature record : features) {
        if (VariantContextAdaptors.canBeConvertedToVariantContext(record)) {
            // we need to special case the HapMap format because indels aren't handled correctly
            if (record instanceof RawHapMapFeature) {

                // is it an indel?
                RawHapMapFeature hapmap = (RawHapMapFeature) record;
                if (hapmap.getAlleles()[0].equals(RawHapMapFeature.NULL_ALLELE_STRING)
                    || hapmap.getAlleles()[1].equals(RawHapMapFeature.NULL_ALLELE_STRING)) {
                    // get the dbsnp object corresponding to this record (needed to help us
                    // distinguish between insertions and deletions)
                    VariantContext dbsnpVC = getDbsnp(hapmap.getName());
                    if (dbsnpVC == null || dbsnpVC.isMixed()) continue;

                    Map<String, Allele> alleleMap = new HashMap<String, Allele>(2);
                    alleleMap.put(
                        RawHapMapFeature.DELETION,
                        Allele.create(ref.getBase(), dbsnpVC.isSimpleInsertion()));
                    alleleMap.put(
                        RawHapMapFeature.INSERTION,
                        Allele.create(
                            (char) ref.getBase() + ((RawHapMapFeature) record).getAlleles()[1],
                            !dbsnpVC.isSimpleInsertion()));
                    hapmap.setActualAlleles(alleleMap);

                    // also, use the correct positioning for insertions
                    hapmap.updatePosition(dbsnpVC.getStart());

                    if (hapmap.getStart() < ref.getWindow().getStart()) {
                        logger.warn(
                            "Hapmap record at "
                                + ref.getLocus()
                                + " represents an indel too large to be converted; skipping...");
                        continue;
                    }
                }
            }

            // ok, we might actually be able to turn this record into a variant context
            VariantContext vc =
                VariantContextAdaptors.toVariantContext(variants.getName(), record, ref);

            if (vc != null) // sometimes the track has odd stuff in it that can't be converted
                VCs.add(vc);
        }
    }

    return VCs;
}
/**
 * Provides the next record from the underlying iterator after applying filter strings generated
 * by the set of filters in use by the iterator.
 */
@Override
public VariantContext next() {
    final VariantContext ctx = this.iterator.next();
    final Set<String> filterStrings = new HashSet<String>();

    // Collect variant level filters
    for (final VariantFilter filter : this.filters) {
        final String val = filter.filter(ctx);
        if (val != null) filterStrings.add(val);
    }

    // Collect genotype level filters in a Map of Sample -> List<filter string>
    final ListMap<String, String> gtFilterStrings = new ListMap<String, String>();
    final Set<String> variantSamples = new HashSet<String>();
    for (final Genotype gt : ctx.getGenotypes()) {
        if (gt.isCalled() && !gt.isHomRef()) variantSamples.add(gt.getSampleName());

        for (final GenotypeFilter filter : gtFilters) {
            final String filterString = filter.filter(ctx, gt);
            if (filterString != null) gtFilterStrings.add(gt.getSampleName(), filterString);
        }
    }

    // If all genotypes are filtered apply a site level filter
    if (gtFilterStrings.keySet().containsAll(variantSamples)) {
        filterStrings.add(ALL_GTS_FILTERED);
    }

    // Make a builder and set the site level filter appropriately
    final VariantContextBuilder builder = new VariantContextBuilder(ctx);
    if (filterStrings.isEmpty()) {
        builder.passFilters();
    } else {
        builder.filters(filterStrings);
    }

    // Apply filters to the necessary genotypes
    builder.noGenotypes();
    final List<Genotype> newGenotypes = new ArrayList<Genotype>(ctx.getNSamples());
    for (final Genotype gt : ctx.getGenotypes()) {
        final GenotypeBuilder gtBuilder = new GenotypeBuilder(gt);
        final List<String> filtersLocal = gtFilterStrings.get(gt.getSampleName());

        if (filtersLocal == null || filtersLocal.isEmpty()) {
            gtBuilder.filter(PASS_FILTER);
        } else {
            gtBuilder.filters(filtersLocal);
        }
        newGenotypes.add(gtBuilder.make());
    }
    builder.genotypes(newGenotypes);
    return builder.make();
}
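// Plain-collections sketch (illustrative only, sample names invented) of the "all genotypes
// filtered" rule applied above: a site-level filter is added only when every sample that actually
// carries a variant genotype has at least one genotype-level filter of its own.
private static void allGenotypesFilteredExample() {
    final Set<String> variantSamples = new HashSet<>(Arrays.asList("sampleA", "sampleB"));

    final Map<String, List<String>> gtFilters = new HashMap<>();
    gtFilters.put("sampleA", Arrays.asList("LowGQ"));
    gtFilters.put("sampleB", Arrays.asList("LowDP"));
    gtFilters.put("sampleC", Arrays.asList("LowDP")); // hom-ref sample; irrelevant to the rule

    // every variant-carrying sample is filtered, so the whole site would get ALL_GTS_FILTERED
    final boolean applySiteFilter = gtFilters.keySet().containsAll(variantSamples);
    assert applySiteFilter;
}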
private VariantContext getMatchingSnpEffRecord(
    List<VariantContext> snpEffRecords, VariantContext vc) {
    for (VariantContext snpEffRecord : snpEffRecords) {
        if (snpEffRecord.hasSameAlternateAllelesAs(vc)
            && snpEffRecord.getReference().equals(vc.getReference())) {
            return snpEffRecord;
        }
    }
    return null;
}
@Test
public void shouldPreserveSymbolicAlleleCase() {
    VCFFileReader reader =
        new VCFFileReader(new File(VariantBaseTest.variantTestDataRoot + "breakpoint.vcf"), false);
    VariantContext variant = reader.iterator().next();
    reader.close();

    // VCF v4.1 s1.4.5:
    // "Tools processing VCF files are not required to preserve case in the allele String, except
    // for IDs, which are case sensitive."
    Assert.assertTrue(variant.getAlternateAllele(0).getDisplayString().contains("chr12"));
}
// this method is intended to reconcile uniquified sample names
// it comes into play when calling this annotation from GenotypeGVCFs with --uniquifySamples
// because founderIds is derived from the sampleDB, which comes from the input sample names, but
// vc will have uniquified (i.e. different) sample names. Without this check, the founderIds
// won't be found in the vc and the annotation won't be calculated.
protected static Set<String> validateFounderIDs(
    final Set<String> founderIds, final VariantContext vc) {
    Set<String> vcSamples = new HashSet<>();
    Set<String> returnIDs = founderIds;
    vcSamples.addAll(vc.getSampleNames());
    if (!vcSamples.isEmpty()) {
        if (founderIds != null) {
            vcSamples.removeAll(founderIds);
            if (vcSamples.equals(vc.getSampleNames())) returnIDs = vc.getSampleNames();
        }
    }
    return returnIDs;
}
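// Illustrative sketch (not in the original source) of the reconciliation above, using plain
// java.util sets in place of a real VariantContext. The sample names are invented for the example.
private static void validateFounderIDsExample() {
    final Set<String> founderIds = new HashSet<>(Arrays.asList("NA12891", "NA12892"));

    // Case 1: the VC carries uniquified names, so no founder ID is found in it and the method
    // falls back to the VC's own sample names.
    final Set<String> uniquified = new HashSet<>(Arrays.asList("NA12891.variant", "NA12892.variant2"));
    Set<String> remaining = new HashSet<>(uniquified);
    remaining.removeAll(founderIds);
    assert remaining.equals(uniquified); // founders not found -> use the VC sample names

    // Case 2: the VC carries the original names, so the founder IDs are found and kept as-is.
    final Set<String> original = new HashSet<>(Arrays.asList("NA12891", "NA12892", "NA12878"));
    remaining = new HashSet<>(original);
    remaining.removeAll(founderIds);
    assert !remaining.equals(original); // founders found -> keep founderIds
}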
private List<SnpEffEffect> parseSnpEffRecord(VariantContext snpEffRecord) {
    List<SnpEffEffect> parsedEffects = new ArrayList<SnpEffEffect>();

    Object effectFieldValue = snpEffRecord.getAttribute(SNPEFF_INFO_FIELD_KEY);
    if (effectFieldValue == null) {
        return parsedEffects;
    }

    // The VCF codec stores multi-valued fields as a List<String>, and single-valued fields as a
    // String. We can have either in the case of SnpEff, since there may be one or more than one
    // effect in this record.
    List<String> individualEffects;
    if (effectFieldValue instanceof List) {
        individualEffects = (List<String>) effectFieldValue;
    } else {
        individualEffects = Arrays.asList((String) effectFieldValue);
    }

    for (String effectString : individualEffects) {
        String[] effectNameAndMetadata = effectString.split(SNPEFF_EFFECT_METADATA_DELIMITER);

        if (effectNameAndMetadata.length != 2) {
            logger.warn(
                String.format(
                    "Malformed SnpEff effect field at %s:%d, skipping: %s",
                    snpEffRecord.getChr(), snpEffRecord.getStart(), effectString));
            continue;
        }

        String effectName = effectNameAndMetadata[0];
        String[] effectMetadata =
            effectNameAndMetadata[1].split(SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER, -1);

        SnpEffEffect parsedEffect = new SnpEffEffect(effectName, effectMetadata);

        if (parsedEffect.isWellFormed()) {
            parsedEffects.add(parsedEffect);
        } else {
            logger.warn(
                String.format(
                    "Skipping malformed SnpEff effect field at %s:%d. Error was: \"%s\". Field was: \"%s\"",
                    snpEffRecord.getChr(),
                    snpEffRecord.getStart(),
                    parsedEffect.getParseError(),
                    effectString));
        }
    }

    return parsedEffects;
}
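// Illustrative sketch (not part of the original class): a classic SnpEff EFF entry has the shape
// EFFECT_NAME(field1|field2|...). The delimiter constants referenced above are not shown in this
// excerpt, so the regexes below are assumptions chosen only to demonstrate the two-stage split.
private static void parseSnpEffEntryExample() {
    final String effectString = "NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Cga/Tga|R123W|GENE1)";

    // split the effect name from its parenthesized metadata (assumed delimiter)
    final String[] nameAndMetadata = effectString.split("\\(", 2);
    final String effectName = nameAndMetadata[0]; // "NON_SYNONYMOUS_CODING"

    // split the metadata block on '|' (assumed sub-field delimiter), keeping trailing empties
    final String[] metadata = nameAndMetadata[1].replaceAll("\\)$", "").split("\\|", -1);
    assert effectName.equals("NON_SYNONYMOUS_CODING");
    assert metadata.length == 5;
}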
private void writeDifferences(
    final List<VariantContext> source1Alleles, final List<VariantContext> source2Alleles) {
    int currentIndex1 = 0, currentIndex2 = 0;
    final int size1 = source1Alleles.size(), size2 = source2Alleles.size();

    VariantContext current1 = source1Alleles.get(0);
    VariantContext current2 = source2Alleles.get(0);

    while (currentIndex1 < size1 || currentIndex2 < size2) {
        if (current1 == null) {
            writeOne(current2, source2, null);
            currentIndex2++;
            current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2) : null);
        } else if (current2 == null) {
            writeOne(current1, source1, null);
            currentIndex1++;
            current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1) : null);
        } else {
            final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1);
            final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2);

            if (loc1.getStart() == loc2.getStart() || loc1.overlapsP(loc2)) {
                String status;
                if (loc1.getStart() == loc2.getStart()) {
                    final String allele1 = current1.getAlternateAllele(0).getBaseString();
                    final String allele2 = current2.getAlternateAllele(0).getBaseString();
                    if (allele1.indexOf(allele2) != -1 || allele2.indexOf(allele1) != -1)
                        status = ONE_ALLELE_SUBSET_OF_OTHER_STATUS;
                    else status = SAME_START_DIFFERENT_ALLELES_STATUS;
                } else {
                    status = OVERLAPPING_EVENTS_STATUS;
                }

                writeOne(current1, INTERSECTION_SET, status);
                currentIndex1++;
                currentIndex2++;
                current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1) : null);
                current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2) : null);
            } else if (loc1.isBefore(loc2)) {
                writeOne(current1, source1, null);
                currentIndex1++;
                current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1) : null);
            } else {
                writeOne(current2, source2, null);
                currentIndex2++;
                current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2) : null);
            }
        }
    }
}
protected void noteCurrentRecord(VariantContext vc) {
    super.noteCurrentRecord(vc); // first, check for errors

    // then, update mostUpstreamWritableLoc:
    int mostUpstreamWritableIndex = vc.getStart() - maxCachingStartDistance;
    this.mostUpstreamWritableLoc = Math.max(BEFORE_MOST_UPSTREAM_LOC, mostUpstreamWritableIndex);
}
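// Worked example (illustrative only; the numbers are invented): with maxCachingStartDistance =
// 1000 and a record starting at 15_000, everything at or before position 14_000 can safely be
// written, because no later record may start more than 1000 bases upstream of the current one.
// BEFORE_MOST_UPSTREAM_LOC acts as a floor so the writable position never goes below the sentinel.
private static void mostUpstreamWritableLocExample() {
    final int BEFORE_MOST_UPSTREAM_LOC = 0; // assumed sentinel value for this sketch
    final int maxCachingStartDistance = 1000;
    final int recordStart = 15_000;
    final int mostUpstreamWritableLoc =
        Math.max(BEFORE_MOST_UPSTREAM_LOC, recordStart - maxCachingStartDistance);
    assert mostUpstreamWritableLoc == 14_000;
}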
@Override
protected AFCalculationResult computeLog10PNonRef(
    final VariantContext vc,
    final int defaultPloidy,
    final double[] log10AlleleFrequencyPriors,
    final StateTracker stateTracker) {
    Utils.nonNull(vc, "vc is null");
    Utils.nonNull(log10AlleleFrequencyPriors, "log10AlleleFrequencyPriors is null");
    Utils.nonNull(stateTracker, "stateTracker is null");
    final int numAlternateAlleles = vc.getNAlleles() - 1;

    final List<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), true);
    final int numSamples = genotypeLikelihoods.size() - 1;
    final int numChr = 2 * numSamples;

    // queue of AC conformations to process
    final Deque<ExactACset> ACqueue = new LinkedList<>();

    // mapping of ExactACset indexes to the objects
    final Map<ExactACcounts, ExactACset> indexesToACset = new HashMap<>(numChr + 1);

    // add AC=0 to the queue
    final int[] zeroCounts = new int[numAlternateAlleles];
    final ExactACset zeroSet = new ExactACset(numSamples + 1, new ExactACcounts(zeroCounts));
    ACqueue.add(zeroSet);
    indexesToACset.put(zeroSet.getACcounts(), zeroSet);

    while (!ACqueue.isEmpty()) {
        // compute log10Likelihoods
        final ExactACset set = ACqueue.remove();

        calculateAlleleCountConformation(
            set,
            genotypeLikelihoods,
            numChr,
            ACqueue,
            indexesToACset,
            log10AlleleFrequencyPriors,
            stateTracker);

        // clean up memory
        indexesToACset.remove(set.getACcounts());
    }

    return getResultFromFinalState(vc, log10AlleleFrequencyPriors, stateTracker);
}
private double log10PLFromSamples(
    final VariantContext vc, final String sample, boolean calcRefP) {
    Genotype g = vc.getGenotype(sample);
    double log10pSample = -1000;
    if (!g.isNoCall()) {
        final double[] gLikelihoods = MathUtils.normalizeFromLog10(g.getLikelihoods().getAsVector());
        log10pSample = Math.log10(calcRefP ? gLikelihoods[0] : 1 - gLikelihoods[0]);
        log10pSample = Double.isInfinite(log10pSample) ? -10000 : log10pSample;
    }
    return log10pSample;
}
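// Numerical sketch (illustrative only) of what the method above computes. The genotype
// likelihoods come in as log10 values with the hom-ref genotype first; after normalizing them to
// linear-space probabilities that sum to 1, index 0 is P(hom-ref) and 1 - P(hom-ref) is the
// probability of carrying a non-reference allele. The likelihood values below are made up.
private static void log10PLExample() {
    final double[] log10Likelihoods = {-0.1, -1.0, -4.0}; // hom-ref, het, hom-var (assumed order)

    // normalize from log10 space to probabilities summing to 1
    double sum = 0;
    final double[] probs = new double[log10Likelihoods.length];
    for (int i = 0; i < probs.length; i++) sum += (probs[i] = Math.pow(10, log10Likelihoods[i]));
    for (int i = 0; i < probs.length; i++) probs[i] /= sum;

    final double log10pRef = Math.log10(probs[0]);        // log10 P(sample is hom-ref)
    final double log10pNonRef = Math.log10(1 - probs[0]); // log10 P(sample carries an alt allele)
    assert log10pRef < 0 && log10pNonRef < 0;
}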
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    if (tracker == null || !BaseUtils.isRegularBase(ref.getBase())) return 0;

    Collection<VariantContext> contexts = getVariantContexts(tracker, ref);

    for (VariantContext vc : contexts) {
        VariantContextBuilder builder = new VariantContextBuilder(vc);

        // set the appropriate sample name if necessary
        if (sampleName != null && vc.hasGenotypes() && vc.hasGenotype(variants.getName())) {
            Genotype g =
                new GenotypeBuilder(vc.getGenotype(variants.getName())).name(sampleName).make();
            builder.genotypes(g);
        }

        final VariantContext withID = variantOverlapAnnotator.annotateRsID(tracker, builder.make());
        writeRecord(withID, tracker, ref.getLocus());
    }

    return 1;
}
private void writeAndPurgeAllEqualVariants(
    final List<VariantContext> sourceVCs1,
    final List<VariantContext> sourceVCs2,
    final String status) {
    int currentIndex1 = 0, currentIndex2 = 0;
    int size1 = sourceVCs1.size(), size2 = sourceVCs2.size();

    VariantContext current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1) : null);
    VariantContext current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2) : null);

    while (current1 != null && current2 != null) {
        final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1);
        final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2);

        if (loc1.equals(loc2)
            || (loc1.getStart() == loc2.getStart()
                && (current1.getAlternateAlleles().size() > 1
                    || current2.getAlternateAlleles().size() > 1))) {
            // test the alleles
            if (determineAndWriteOverlap(current1, current2, status)) {
                sourceVCs1.remove(currentIndex1);
                sourceVCs2.remove(currentIndex2);
                size1--;
                size2--;
            } else {
                currentIndex1++;
                currentIndex2++;
            }
            current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1) : null);
            current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2) : null);
        } else if (loc1.isBefore(loc2)) {
            currentIndex1++;
            current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1) : null);
        } else {
            currentIndex2++;
            current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2) : null);
        }
    }
}
private byte[] generateHaplotype(
    final List<VariantContext> sourceVCs, final ReferenceContext refContext) {
    final StringBuilder sb = new StringBuilder();

    final int startPos = refContext.getWindow().getStart();
    int currentPos = startPos;
    final byte[] reference = refContext.getBases();

    for (final VariantContext vc : sourceVCs) {
        // add any missing reference context
        int vcStart = vc.getStart();
        final int refAlleleLength = vc.getReference().length();
        if (refAlleleLength == vc.getEnd() - vc.getStart())
            // this is a deletion (whereas for other events the padding base isn't part of the
            // position)
            vcStart++;
        while (currentPos < vcStart) sb.append((char) reference[currentPos++ - startPos]);

        // add the alt allele
        sb.append(vc.getAlternateAllele(0).getBaseString());

        // skip the reference allele
        currentPos += refAlleleLength;
    }

    // add any missing reference context
    final int stopPos = refContext.getWindow().getStop();
    while (currentPos < stopPos) sb.append((char) reference[currentPos++ - startPos]);

    return sb.toString().getBytes();
}
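// Worked toy example (not from the original tool) of splicing an alternate allele into a
// reference window the way generateHaplotype does. The window, positions, and alleles are
// invented; only a SNP is shown, so the deletion padding-base adjustment handled above is
// deliberately left out of this sketch.
private static void generateHaplotypeToyExample() {
    final String reference = "ACGTACGT"; // window starting at position 101 (1-based)
    final int windowStart = 101;

    final StringBuilder haplotype = new StringBuilder();
    int currentPos = windowStart;

    // a single SNP: position 104, ref T -> alt G
    final int vcStart = 104;
    final String altAllele = "G";
    while (currentPos < vcStart) haplotype.append(reference.charAt(currentPos++ - windowStart));
    haplotype.append(altAllele);
    currentPos += 1; // skip the 1-base reference allele

    // pad with the remaining reference bases
    while (currentPos < windowStart + reference.length())
        haplotype.append(reference.charAt(currentPos++ - windowStart));

    assert haplotype.toString().equals("ACGGACGT");
}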
@Override
protected void doWork(String inputSource, VcfIterator r, VariantContextWriter w)
    throws IOException {
    VCFHeader header = r.getHeader();
    VCFHeader h2 = new VCFHeader(header.getMetaDataInInputOrder(), header.getSampleNamesInOrder());
    h2.addMetaDataLine(
        new VCFInfoHeaderLine(
            TAG,
            VCFHeaderLineCount.UNBOUNDED,
            VCFHeaderLineType.String,
            "metadata added from " + TABIX + " . Format was " + FORMAT));
    h2.addMetaDataLine(
        new VCFHeaderLine(
            getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
    h2.addMetaDataLine(
        new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
    h2.addMetaDataLine(
        new VCFHeaderLine(
            getClass().getSimpleName() + "HtsJdkVersion", HtsjdkVersion.getVersion()));
    h2.addMetaDataLine(
        new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkHome", HtsjdkVersion.getHome()));

    SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
    w.writeHeader(h2);
    while (r.hasNext()) {
        VariantContext ctx = progress.watch(r.next());
        Set<String> annotations = new HashSet<String>();

        CloseableIterator<BedLine> iter =
            this.bedReader.iterator(ctx.getContig(), ctx.getStart() - 1, ctx.getEnd() + 1);
        while (iter.hasNext()) {
            BedLine bedLine = iter.next();
            if (!ctx.getContig().equals(bedLine.getContig())) continue;
            if (ctx.getStart() - 1 >= bedLine.getEnd()) continue;
            if (ctx.getEnd() - 1 < bedLine.getStart()) continue;

            String newannot = this.parsedFormat.toString(bedLine);
            if (!newannot.isEmpty()) annotations.add(VCFUtils.escapeInfoField(newannot));
        }
        CloserUtil.close(iter);

        if (annotations.isEmpty()) {
            w.add(ctx);
            continue;
        }
        VariantContextBuilder vcb = new VariantContextBuilder(ctx);
        vcb.attribute(TAG, annotations.toArray());
        w.add(vcb.make());
        incrVariantCount();
        if (checkOutputError()) break;
    }
    progress.finish();
}
private int calculateTumorNNR(final VariantContext vc) {
    int nnr = 0;
    switch (vc.getGenotype(tumorSample).getType()) {
        case HET:
        case HOM_VAR:
            nnr += 1;
            break;
        case NO_CALL:
        case UNAVAILABLE:
        case HOM_REF:
            break;
    }
    return nnr;
}
/**
 * Returns a list of attribute values from a VCF file
 *
 * @param vcfFile VCF file
 * @param attributeName attribute name
 * @throws IOException if the file does not exist or can not be opened
 * @return list of attribute values
 */
private List<String> getAttributeValues(final File vcfFile, final String attributeName)
    throws IOException {
    final VCFCodec codec = new VCFCodec();
    final FileInputStream s = new FileInputStream(vcfFile);
    final LineIterator lineIteratorVCF =
        codec.makeSourceFromStream(new PositionalBufferedStream(s));
    codec.readHeader(lineIteratorVCF);

    List<String> attributeValues = new ArrayList<String>();
    while (lineIteratorVCF.hasNext()) {
        final String line = lineIteratorVCF.next();
        Assert.assertFalse(line == null);
        final VariantContext vc = codec.decode(line);

        for (final Genotype g : vc.getGenotypes()) {
            if (g.hasExtendedAttribute(attributeName)) {
                attributeValues.add((String) g.getExtendedAttribute(attributeName));
            }
        }
    }

    return attributeValues;
}
private int calculateTumorAC(final VariantContext vc) {
    int ac = 0;
    switch (vc.getGenotype(tumorSample).getType()) {
        case HET:
            ac += 1;
            break;
        case HOM_VAR:
            ac += 2;
            break;
        case NO_CALL:
        case UNAVAILABLE:
        case HOM_REF:
            break;
    }
    return ac;
}
@Override
public Map<String, Object> annotate(
    final RefMetaDataTracker tracker,
    final AnnotatorCompatible walker,
    final ReferenceContext ref,
    final Map<String, AlignmentContext> stratifiedContexts,
    final VariantContext vc,
    final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
    final GenotypesContext genotypes = vc.getGenotypes();
    if (genotypes == null || genotypes.size() < MIN_SAMPLES) {
        if (!warningLogged) {
            logger.warn("Too few genotypes");
            warningLogged = true;
        }
        return null;
    }

    int refCount = 0;
    int hetCount = 0;
    int homCount = 0;

    for (final Genotype g : genotypes) {
        if (g.isNoCall()) continue;

        // TODO - fix me:
        // Right now we just ignore genotypes that are not confident, but this throws off
        // our HW ratios. More analysis is needed to determine the right thing to do when
        // the genotyper cannot decide whether a given sample is het or hom var.
        if (g.getLog10PError() > MIN_LOG10_PERROR) continue;

        if (g.isHomRef()) refCount++;
        else if (g.isHet()) hetCount++;
        else homCount++;
    }

    if (refCount + hetCount + homCount == 0) return null;

    double pvalue = HardyWeinbergCalculation.hwCalculate(refCount, hetCount, homCount);
    // System.out.println(refCount + " " + hetCount + " " + homCount + " " + pvalue);
    Map<String, Object> map = new HashMap<>();
    map.put(getKeyNames().get(0), String.format("%.1f", QualityUtils.phredScaleErrorRate(pvalue)));
    return map;
}
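// Numerical sketch (illustrative only) of the final phred-scaling step: assuming
// QualityUtils.phredScaleErrorRate applies the standard -10 * log10(p) transform, a
// Hardy-Weinberg p-value of 0.001 becomes a score of 30.0, and p = 0.5 becomes roughly 3.0.
private static void phredScaleExample() {
    final double pValue = 0.001;
    final double phred = -10.0 * Math.log10(pValue); // assumed equivalent of phredScaleErrorRate
    assert Math.abs(phred - 30.0) < 1e-9;
}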
private void writeRecord(VariantContext vc, RefMetaDataTracker tracker, GenomeLoc loc) {
    if (!wroteHeader) {
        wroteHeader = true;

        // setup the header fields
        Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
        hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variants.getName())));
        hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY));

        allowedGenotypeFormatStrings.add(VCFConstants.GENOTYPE_KEY);
        for (VCFHeaderLine field : hInfo) {
            if (field instanceof VCFFormatHeaderLine) {
                allowedGenotypeFormatStrings.add(((VCFFormatHeaderLine) field).getID());
            }
        }

        samples = new LinkedHashSet<String>();
        if (sampleName != null) {
            samples.add(sampleName);
        } else {
            // try VCF first
            samples =
                SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(variants.getName()));

            if (samples.isEmpty()) {
                List<Feature> features = tracker.getValues(variants, loc);
                if (features.size() == 0)
                    throw new IllegalStateException(
                        "No rod data is present, but we just created a VariantContext");

                Feature f = features.get(0);
                if (f instanceof RawHapMapFeature)
                    samples.addAll(Arrays.asList(((RawHapMapFeature) f).getSampleIDs()));
                else samples.addAll(vc.getSampleNames());
            }
        }

        vcfwriter.writeHeader(new VCFHeader(hInfo, samples));
    }

    vc = GATKVariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings);
    vcfwriter.add(vc);
}
@Override
public Map<String, Object> annotate(
    final RefMetaDataTracker tracker,
    final AnnotatorCompatible walker,
    final ReferenceContext ref,
    final Map<String, AlignmentContext> stratifiedContexts,
    final VariantContext vc,
    final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
    // Can only call from UnifiedGenotyper
    if (!(walker instanceof UnifiedGenotyper)) {
        if (!walkerIdentityCheckWarningLogged) {
            if (walker != null)
                logger.warn(
                    "Annotation will not be calculated, must be called from UnifiedGenotyper, not "
                        + walker.getClass().getName());
            else
                logger.warn("Annotation will not be calculated, must be called from UnifiedGenotyper");
            walkerIdentityCheckWarningLogged = true;
        }
        return null;
    }

    if (stratifiedContexts.isEmpty()) return null;

    // not meaningful when we're at an indel location: deletions that start at location N are by
    // definition called at the position N-1, and at position N-1 there are no informative
    // deletions in the pileup
    if (!vc.isSNP()) return null;

    int deletions = 0;
    int depth = 0;
    for (Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet()) {
        for (final PileupElement p : sample.getValue().getBasePileup()) {
            depth++;
            if (p.isDeletion()) deletions++;
        }
    }

    Map<String, Object> map = new HashMap<>();
    map.put(
        getKeyNames().get(0),
        String.format("%.2f", depth == 0 ? 0.0 : (double) deletions / (double) depth));
    return map;
}
private void addVariant(final VariantContext ctx) {
    if (!ctx.getChr().equals(genes.get(0).getChromosome())) return;
    if (ctx.getStart() >= chromEnd) return;
    if (ctx.getStart() < chromStart) return;

    positions.add(ctx.getStart());
    for (String sample : ctx.getSampleNames()) {
        Genotype g = ctx.getGenotype(sample);
        if (!g.isAvailable()) continue;
        if (!g.isCalled()) continue;
        if (g.isNoCall()) continue;
        if (g.isNonInformative()) continue;
        Set<Integer> set = sample2positions.get(sample);
        if (set == null) {
            set = new HashSet<Integer>();
            sample2positions.put(sample, set);
        }
        set.add(ctx.getStart());
    }
}
@Override
public void accumulate(final VariantContext ctx) {
    logger.record(ctx.getContig(), ctx.getStart());

    final String variantChrom = ctx.getContig();
    final int variantPos = ctx.getStart();

    // Skip anything a little too funky
    if (ctx.isFiltered()) return;
    if (!ctx.isVariant()) return;
    if (SKIP_CHROMS.contains(variantChrom)) return;

    for (final MendelianViolationMetrics trio : trios) {
        final Genotype momGt = ctx.getGenotype(trio.MOTHER);
        final Genotype dadGt = ctx.getGenotype(trio.FATHER);
        final Genotype kidGt = ctx.getGenotype(trio.OFFSPRING);

        // if any genotype:
        //   - has a non-snp allele; or
        //   - lacks a reference allele
        // then ignore this trio
        if (CollectionUtil.makeList(momGt, dadGt, kidGt)
            .stream()
            .anyMatch(
                gt ->
                    gt.isHetNonRef()
                        || Stream.concat(Stream.of(ctx.getReference()), gt.getAlleles().stream())
                            .anyMatch(a -> a.length() != 1 || a.isSymbolic()))) {
            continue;
        }

        // if between the trio there are more than 2 alleles including the reference, continue
        if (Stream.concat(
                Collections.singleton(ctx.getReference()).stream(),
                CollectionUtil.makeList(momGt, dadGt, kidGt)
                    .stream()
                    .flatMap(gt -> gt.getAlleles().stream()))
            .collect(Collectors.toSet())
            .size() > 2) continue;

        // Test to make sure:
        //   1) That the site is in fact variant in the trio
        //   2) that the offspring doesn't have a really wacky het allele balance
        if (!isVariant(momGt, dadGt, kidGt)) continue;
        if (kidGt.isHet()) {
            final int[] ad = kidGt.getAD();
            if (ad == null) continue;

            final List<Integer> adOfAlleles =
                kidGt
                    .getAlleles()
                    .stream()
                    .map(a -> ad[ctx.getAlleleIndex(a)])
                    .collect(Collectors.toList());
            final double minAlleleFraction =
                Math.min(adOfAlleles.get(0), adOfAlleles.get(1))
                    / (double) (adOfAlleles.get(0) + adOfAlleles.get(1));
            if (minAlleleFraction < MIN_HET_FRACTION) continue;
        }

        ///////////////////////////////////////////////////////////////
        // Determine whether the offspring should be haploid at this
        // locus and which is the parental donor of the haploid genotype
        ///////////////////////////////////////////////////////////////
        boolean haploid = false;
        Genotype haploidParentalGenotype = null;

        if (FEMALE_CHROMS.contains(variantChrom) && trio.OFFSPRING_SEX != Sex.Unknown) {
            if (trio.OFFSPRING_SEX == Sex.Female) {
                // female
                haploid = false;
            } else if (isInPseudoAutosomalRegion(variantChrom, variantPos)) {
                // male but in PAR on X, so diploid
                haploid = false;
            } else {
                // male, out of PAR on X, haploid
                haploid = true;
                haploidParentalGenotype = momGt;
            }
        }

        // the PAR on the male chromosome should be masked so that reads
        // align to the female chromosomes instead, so there's no point
        // of worrying about that here.
        if (MALE_CHROMS.contains(variantChrom)) {
            if (trio.OFFSPRING_SEX == Sex.Male) {
                haploid = true;
                haploidParentalGenotype = dadGt;
            } else {
                continue;
            }
        }

        // We only want to look at sites where we have high enough confidence that the genotypes we
        // are looking at are interesting. We want to ensure that parents are always GQ>=MIN_GQ, and
        // that the kid is either GQ>=MIN_GQ or in the case where kid is het that the
        // phred-scaled-likelihood of being reference is >=MIN_GQ.
        if (haploid
            && (haploidParentalGenotype.isNoCall() || haploidParentalGenotype.getGQ() < MIN_GQ))
            continue;
        if (!haploid
            && (momGt.isNoCall()
                || momGt.getGQ() < MIN_GQ
                || dadGt.isNoCall()
                || dadGt.getGQ() < MIN_GQ)) continue;
        if (kidGt.isNoCall()) continue;
        if (momGt.isHomRef() && dadGt.isHomRef() && !kidGt.isHomRef()) {
            if (kidGt.getPL()[0] < MIN_GQ) continue;
        } else if (kidGt.getGQ() < MIN_GQ) continue;

        // Also filter on the DP for each of the samples - it's possible to miss hets when DP is
        // too low
        if (haploid && (kidGt.getDP() < MIN_DP || haploidParentalGenotype.getDP() < MIN_DP)) continue;
        if (!haploid && (kidGt.getDP() < MIN_DP || momGt.getDP() < MIN_DP || dadGt.getDP() < MIN_DP))
            continue;

        trio.NUM_VARIANT_SITES++;

        ///////////////////////////////////////////////////////////////
        // First test for haploid violations
        ///////////////////////////////////////////////////////////////
        MendelianViolation type = null;
        if (haploid) {
            if (kidGt.isHet()) continue; // Should not see heterozygous calls at haploid regions

            if (!haploidParentalGenotype.getAlleles().contains(kidGt.getAllele(0))) {
                if (kidGt.isHomRef()) {
                    type = MendelianViolation.Haploid_Other;
                    trio.NUM_HAPLOID_OTHER++;
                } else {
                    type = MendelianViolation.Haploid_Denovo;
                    trio.NUM_HAPLOID_DENOVO++;
                }
            }
        }

        ///////////////////////////////////////////////////////////////
        // Then test for diploid mendelian violations
        ///////////////////////////////////////////////////////////////
        else if (isMendelianViolation(momGt, dadGt, kidGt)) {
            if (momGt.isHomRef() && dadGt.isHomRef() && !kidGt.isHomRef()) {
                trio.NUM_DIPLOID_DENOVO++;
                type = MendelianViolation.Diploid_Denovo;
            } else if (momGt.isHomVar() && dadGt.isHomVar() && kidGt.isHet()) {
                trio.NUM_HOMVAR_HOMVAR_HET++;
                type = MendelianViolation.HomVar_HomVar_Het;
            } else if (kidGt.isHom()
                && ((momGt.isHomRef() && dadGt.isHomVar()) || (momGt.isHomVar() && dadGt.isHomRef()))) {
                trio.NUM_HOMREF_HOMVAR_HOM++;
                type = MendelianViolation.HomRef_HomVar_Hom;
            } else if (kidGt.isHom()
                && ((momGt.isHom() && dadGt.isHet()) || (momGt.isHet() && dadGt.isHom()))) {
                trio.NUM_HOM_HET_HOM++;
                type = MendelianViolation.Hom_Het_Hom;
            } else {
                trio.NUM_OTHER++;
                type = MendelianViolation.Other;
            }
        }

        // Output a record into the family's violation VCF
        if (type != null) {
            // Create a new Context subsetted to the three samples
            final VariantContextBuilder builder = new VariantContextBuilder(ctx);
            builder.genotypes(
                ctx.getGenotypes()
                    .subsetToSamples(CollectionUtil.makeSet(trio.MOTHER, trio.FATHER, trio.OFFSPRING)));
            builder.attribute(MENDELIAN_VIOLATION_KEY, type.name());

            // Copy over some useful attributes from the full context
            if (ctx.hasAttribute(VCFConstants.ALLELE_COUNT_KEY))
                builder.attribute(ORIGINAL_AC, ctx.getAttribute(VCFConstants.ALLELE_COUNT_KEY));
            if (ctx.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY))
                builder.attribute(ORIGINAL_AF, ctx.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY));
            if (ctx.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY))
                builder.attribute(ORIGINAL_AN, ctx.getAttribute(VCFConstants.ALLELE_NUMBER_KEY));

            // Write out the variant record
            familyToViolations.get(trio.FAMILY_ID).add(builder.make());
        }
    }
}