/**
 * Checks that {@code LiftoverVcf.fixGenotypes} replaces the alleles of every genotype according
 * to the supplied allele map, using hom-ref, het and hom-var samples whose alleles A/C are mapped
 * to T/G.
 */
@Test
public void testFixReverseComplementedGenotypes() {
  final List<String> sampleNames = Arrays.asList("homref", "het", "homvar");

  // Input genotypes, expressed with the original alleles.
  final Allele refA = Allele.create("A", true);
  final Allele altC = Allele.create("C", false);
  final GenotypesContext originalGenotypes = GenotypesContext.create(3);
  originalGenotypes.add(new GenotypeBuilder("homref").alleles(Arrays.asList(refA, refA)).make());
  originalGenotypes.add(new GenotypeBuilder("het").alleles(Arrays.asList(refA, altC)).make());
  originalGenotypes.add(new GenotypeBuilder("homvar").alleles(Arrays.asList(altC, altC)).make());

  // The same genotypes, expressed with the mapped alleles.
  final Allele refT = Allele.create("T", true);
  final Allele altG = Allele.create("G", false);
  final GenotypesContext expectedGenotypes = GenotypesContext.create(3);
  expectedGenotypes.add(new GenotypeBuilder("homref").alleles(Arrays.asList(refT, refT)).make());
  expectedGenotypes.add(new GenotypeBuilder("het").alleles(Arrays.asList(refT, altG)).make());
  expectedGenotypes.add(new GenotypeBuilder("homvar").alleles(Arrays.asList(altG, altG)).make());

  // Original allele -> allele it should be replaced with.
  final Map<Allele, Allele> reverseComplementAlleleMap = new HashMap<Allele, Allele>(2);
  reverseComplementAlleleMap.put(refA, refT);
  reverseComplementAlleleMap.put(altC, altG);

  final GenotypesContext actualGenotypes =
      LiftoverVcf.fixGenotypes(originalGenotypes, reverseComplementAlleleMap);

  // Every sample is diploid here, so compare both allele slots.
  for (final String sampleName : sampleNames) {
    final List<Allele> expected = expectedGenotypes.get(sampleName).getAlleles();
    final List<Allele> actual = actualGenotypes.get(sampleName).getAlleles();
    for (int slot = 0; slot < 2; slot++) {
      Assert.assertEquals(expected.get(slot), actual.get(slot));
    }
  }
}
private Collection<VariantContext> getVariantContexts( RefMetaDataTracker tracker, ReferenceContext ref) { List<Feature> features = tracker.getValues(variants, ref.getLocus()); List<VariantContext> VCs = new ArrayList<VariantContext>(features.size()); for (Feature record : features) { if (VariantContextAdaptors.canBeConvertedToVariantContext(record)) { // we need to special case the HapMap format because indels aren't handled correctly if (record instanceof RawHapMapFeature) { // is it an indel? RawHapMapFeature hapmap = (RawHapMapFeature) record; if (hapmap.getAlleles()[0].equals(RawHapMapFeature.NULL_ALLELE_STRING) || hapmap.getAlleles()[1].equals(RawHapMapFeature.NULL_ALLELE_STRING)) { // get the dbsnp object corresponding to this record (needed to help us distinguish // between insertions and deletions) VariantContext dbsnpVC = getDbsnp(hapmap.getName()); if (dbsnpVC == null || dbsnpVC.isMixed()) continue; Map<String, Allele> alleleMap = new HashMap<String, Allele>(2); alleleMap.put( RawHapMapFeature.DELETION, Allele.create(ref.getBase(), dbsnpVC.isSimpleInsertion())); alleleMap.put( RawHapMapFeature.INSERTION, Allele.create( (char) ref.getBase() + ((RawHapMapFeature) record).getAlleles()[1], !dbsnpVC.isSimpleInsertion())); hapmap.setActualAlleles(alleleMap); // also, use the correct positioning for insertions hapmap.updatePosition(dbsnpVC.getStart()); if (hapmap.getStart() < ref.getWindow().getStart()) { logger.warn( "Hapmap record at " + ref.getLocus() + " represents an indel too large to be converted; skipping..."); continue; } } } // ok, we might actually be able to turn this record in a variant context VariantContext vc = VariantContextAdaptors.toVariantContext(variants.getName(), record, ref); if (vc != null) // sometimes the track has odd stuff in it that can't be converted VCs.add(vc); } } return VCs; }
/**
 * Creates one reverse-clipping test case.
 *
 * @param expectedClip the clipping value the test expects
 * @param ref the reference string used by the test
 * @param alleles allele strings; each is converted with {@code Allele.create(String)} and
 *     appended to this provider's allele list
 */
private ReverseClippingPositionTestProvider(
    final int expectedClip, final String ref, final String... alleles) {
  super(ReverseClippingPositionTestProvider.class);
  this.expectedClip = expectedClip;
  this.ref = ref;
  for (final String alleleString : alleles) {
    this.alleles.add(Allele.create(alleleString));
  }
}
/** * Returns a context identical to this with the REF and ALT alleles reverse complemented. * * @param vc variant context * @return new vc */ public static VariantContext reverseComplement(VariantContext vc) { // create a mapping from original allele to reverse complemented allele HashMap<Allele, Allele> alleleMap = new HashMap<Allele, Allele>(vc.getAlleles().size()); for (Allele originalAllele : vc.getAlleles()) { Allele newAllele; if (originalAllele.isNoCall() || originalAllele.isNull()) newAllele = originalAllele; else newAllele = Allele.create( BaseUtils.simpleReverseComplement(originalAllele.getBases()), originalAllele.isReference()); alleleMap.put(originalAllele, newAllele); } // create new Genotype objects GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); for (final Genotype genotype : vc.getGenotypes()) { List<Allele> newAlleles = new ArrayList<Allele>(); for (Allele allele : genotype.getAlleles()) { Allele newAllele = alleleMap.get(allele); if (newAllele == null) newAllele = Allele.NO_CALL; newAlleles.add(newAllele); } newGenotypes.add(Genotype.modifyAlleles(genotype, newAlleles)); } return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); }
private RepeatDetectorTest( boolean isTrueRepeat, String ref, String refAlleleString, String... altAlleleStrings) { super(RepeatDetectorTest.class); this.ref = "N" + ref; // add a dummy base for the event here this.isTrueRepeat = isTrueRepeat; List<Allele> alleles = new LinkedList<Allele>(); final Allele refAllele = Allele.create(refAlleleString, true); alleles.add(refAllele); for (final String altString : altAlleleStrings) { final Allele alt = Allele.create(altString, false); alleles.add(alt); } VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, 1 + refAllele.length(), alleles); this.vc = builder.make(); }
/** * Outputs all intervals that are behind the current reference locus * * @param refLocus the current reference locus * @param refBase the reference allele */ private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) { // output any intervals that were finished final List<GenomeLoc> toRemove = new LinkedList<>(); for (GenomeLoc key : intervalMap.keySet()) { if (key.isBefore(refLocus)) { final IntervalStratification intervalStats = intervalMap.get(key); outputStatsToVCF(intervalStats, Allele.create(refBase, true)); if (hasMissingLoci(intervalStats)) { outputMissingInterval(intervalStats); } toRemove.add(key); } } for (GenomeLoc key : toRemove) { intervalMap.remove(key); } }
private VariantCallContext generateEmptyContext( RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, AlignmentContext rawContext) { VariantContext vc; if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod( tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles); if (vcInput == null) return null; vc = new VariantContextBuilder( "UG_call", ref.getLocus().getContig(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles()) .make(); } else { // deal with bad/non-standard reference bases if (!Allele.acceptableAlleleBases(new byte[] {ref.getBase()})) return null; Set<Allele> alleles = new HashSet<Allele>(); alleles.add(Allele.create(ref.getBase(), true)); vc = new VariantContextBuilder( "UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles) .make(); } if (annotationEngine != null) { // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations final ReadBackedPileup pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc); } return new VariantCallContext(vc, false); }
/**
 * Returns the merged allele for the pair ({@code all1}, {@code all2}), creating and caching it in
 * {@code mergedAlleles} the first time the pair is requested.
 *
 * <p>The merged bases are the bases of {@code all1}, followed by {@code intermediateLength}
 * positions (copied from {@code intermediateBases} when that array is non-null, otherwise left at
 * zero), followed by the bases of {@code all2}.
 *
 * @param all1 first allele of the pair
 * @param all2 second allele of the pair
 * @param creatingReferenceForFirstTime passed as the reference flag when a new allele is created
 * @return the cached or newly created merged allele
 */
private Allele ensureMergedAllele(
    Allele all1, Allele all2, boolean creatingReferenceForFirstTime) {
  final AlleleOneAndTwo pairKey = new AlleleOneAndTwo(all1, all2);

  final Allele cached = mergedAlleles.get(pairKey);
  if (cached != null) {
    return cached;
  }

  final byte[] firstBases = all1.getBases();
  final byte[] secondBases = all2.getBases();
  final byte[] mergedBases =
      new byte[firstBases.length + intermediateLength + secondBases.length];

  int offset = 0;
  System.arraycopy(firstBases, 0, mergedBases, offset, firstBases.length);
  offset += firstBases.length;
  if (intermediateBases != null) {
    System.arraycopy(intermediateBases, 0, mergedBases, offset, intermediateLength);
  }
  // the gap is reserved even when there are no intermediate bases to copy
  offset += intermediateLength;
  System.arraycopy(secondBases, 0, mergedBases, offset, secondBases.length);

  final Allele created = Allele.create(mergedBases, creatingReferenceForFirstTime);
  mergedAlleles.put(pairKey, created);
  return created;
}
@BeforeSuite public void setup() { final File referenceFile = new File(b37KGReference); try { IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile); genomeLocParser = new GenomeLocParser(seq); } catch (FileNotFoundException ex) { throw new UserException.CouldNotReadInputFile(referenceFile, ex); } // alleles Aref = Allele.create("A", true); Cref = Allele.create("C", true); T = Allele.create("T"); C = Allele.create("C"); ATC = Allele.create("ATC"); ATCATC = Allele.create("ATCATC"); }
/** * Read in a list of ExactCall objects from reader, keeping only those with starts in startsToKeep * or all sites (if this is empty) * * @param reader a just-opened reader sitting at the start of the file * @param startsToKeep a list of start position of the calls to keep, or empty if all calls should * be kept * @param parser a genome loc parser to create genome locs * @return a list of ExactCall objects in reader * @throws IOException */ public static List<ExactCall> readExactLog( final BufferedReader reader, final List<Integer> startsToKeep, GenomeLocParser parser) throws IOException { if (reader == null) throw new IllegalArgumentException("reader cannot be null"); if (startsToKeep == null) throw new IllegalArgumentException("startsToKeep cannot be null"); if (parser == null) throw new IllegalArgumentException("GenomeLocParser cannot be null"); List<ExactCall> calls = new LinkedList<ExactCall>(); // skip the header line reader.readLine(); // skip the first "type" line reader.readLine(); while (true) { final VariantContextBuilder builder = new VariantContextBuilder(); final List<Allele> alleles = new ArrayList<Allele>(); final List<Genotype> genotypes = new ArrayList<Genotype>(); final double[] posteriors = new double[2]; final double[] priors = MathUtils.normalizeFromLog10(new double[] {0.5, 0.5}, true); final List<Integer> mle = new ArrayList<Integer>(); final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(); long runtimeNano = -1; GenomeLoc currentLoc = null; while (true) { final String line = reader.readLine(); if (line == null) return calls; final String[] parts = line.split("\t"); final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]); final String variable = parts[1]; final String key = parts[2]; final String value = parts[3]; if (currentLoc == null) currentLoc = lineLoc; if (variable.equals("type")) { if (startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart())) { builder.alleles(alleles); final int stop = 
currentLoc.getStart() + alleles.get(0).length() - 1; builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop); builder.genotypes(genotypes); final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[] {})); final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele); calls.add(new ExactCall(builder.make(), runtimeNano, result)); } break; } else if (variable.equals("allele")) { final boolean isRef = key.equals("0"); alleles.add(Allele.create(value, isRef)); } else if (variable.equals("PL")) { final GenotypeBuilder gb = new GenotypeBuilder(key); gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs()); genotypes.add(gb.make()); } else if (variable.equals("log10PosteriorOfAFEq0")) { posteriors[0] = Double.valueOf(value); } else if (variable.equals("log10PosteriorOfAFGt0")) { posteriors[1] = Double.valueOf(value); } else if (variable.equals("MLE")) { mle.add(Integer.valueOf(value)); } else if (variable.equals("pNonRefByAllele")) { final Allele a = Allele.create(key); log10pNonRefByAllele.put(a, Double.valueOf(value)); } else if (variable.equals("runtime.nano")) { runtimeNano = Long.valueOf(value); } else { // nothing to do } } } }
/** * Analyze coverage distribution and validate read mates per interval and per sample * * <p>This tool is useful for diagnosing regions with bad coverage, mapping, or read mate pairs. It * analyzes each sample independently and aggregates results over intervals of interest. * Low-coverage regions can be identified by using e.g. FindCoveredIntervals with the -uncovered * argument. * * <h3>Input</h3> * * <ul> * <li>A reference file * <li>one or more input BAMs * <li>One or more intervals * </ul> * * <h3>Output</h3> * * <p>A modified VCF detailing each interval by sample and information for each interval according * to the thresholds used. Interval information includes GC Content, average interval depth, * callable status among others. If you use the --missing option, you can get as a second output a * intervals file with the loci that have missing data. This file can then be used as input to * QualifyMissingIntervals for full qualification and interpretation of why the data is missing. * * <h3>Usage example</h3> * * <pre> * java -jar GenomeAnalysisTK.jar * -T DiagnoseTargets \ * -R reference.fasta \ * -I sample1.bam \ * -I sample2.bam \ * -I sample3.bam \ * -L intervals.interval_list \ * -o output.vcf * </pre> * * @author Mauricio Carneiro, Roger Zurawicki * @since 5/8/12 */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class}) @By(value = DataSource.READS) @PartitionBy(PartitionType.INTERVAL) @Downsample(by = DownsampleType.NONE) public class DiagnoseTargets extends LocusWalker<Long, Long> { @Output(doc = "File to which interval statistics should be written") private VariantContextWriter vcfWriter = null; @ArgumentCollection private ThresHolder thresholds = new ThresHolder(); private Map<GenomeLoc, IntervalStratification> intervalMap = null; // maps each interval => statistics private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the // genome 
private Set<String> samples = null; // all the samples being processed private static final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times private static final Allele UNCOVERED_ALLELE = Allele.create( "A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times private static final int INITIAL_HASH_SIZE = 50; // enough room for potential overlapping intervals plus recently finished intervals @Override public void initialize() { super.initialize(); if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty()) throw new UserException( "This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead."); intervalMap = new LinkedHashMap<>(INITIAL_HASH_SIZE); intervalListIterator = new PeekableIterator<>(getToolkit().getIntervals().iterator()); // get all of the unique sample names for the VCF Header samples = ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // pre load all the statistics classes because it is costly to operate on the JVM and we only // want to do it once. 
loadAllPlugins(thresholds); } @Override public Long map( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { GenomeLoc refLocus = ref.getLocus(); // process and remove any intervals in the map that are don't overlap the current locus anymore // and add all new intervals that may overlap this reference locus addNewOverlappingIntervals(refLocus); outputFinishedIntervals(refLocus, ref.getBase()); // at this point, all intervals in intervalMap overlap with this locus, so update all of them for (IntervalStratification intervalStratification : intervalMap.values()) intervalStratification.addLocus(context, ref); return 1L; } @Override public Long reduceInit() { return 0L; } /** * Not sure what we are going to do here * * @param value result of the map. * @param sum accumulator for the reduce. * @return a long */ @Override public Long reduce(Long value, Long sum) { return sum + value; } /** * Process all remaining intervals * * @param result number of loci processed by the walker */ @Override public void onTraversalDone(final Long result) { for (GenomeLoc interval : intervalMap.keySet()) outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE); GenomeLoc interval = intervalListIterator.peek(); while (interval != null) { outputStatsToVCF(createIntervalStatistic(interval), UNCOVERED_ALLELE); intervalListIterator.next(); interval = intervalListIterator.peek(); } if (thresholds.missingTargets != null) { thresholds.missingTargets.close(); } } /** * Outputs all intervals that are behind the current reference locus * * @param refLocus the current reference locus * @param refBase the reference allele */ private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) { // output any intervals that were finished final List<GenomeLoc> toRemove = new LinkedList<>(); for (GenomeLoc key : intervalMap.keySet()) { if (key.isBefore(refLocus)) { final IntervalStratification intervalStats = intervalMap.get(key); 
outputStatsToVCF(intervalStats, Allele.create(refBase, true)); if (hasMissingLoci(intervalStats)) { outputMissingInterval(intervalStats); } toRemove.add(key); } } for (GenomeLoc key : toRemove) { intervalMap.remove(key); } } /** * Adds all intervals that overlap the current reference locus to the intervalMap * * @param refLocus the current reference locus */ private void addNewOverlappingIntervals(final GenomeLoc refLocus) { GenomeLoc interval = intervalListIterator.peek(); while (interval != null && !interval.isPast(refLocus)) { intervalMap.put(interval, createIntervalStatistic(interval)); intervalListIterator.next(); interval = intervalListIterator.peek(); } } /** * Takes the interval, finds it in the stash, prints it to the VCF * * @param stats The statistics of the interval * @param refAllele the reference allele */ private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) { GenomeLoc interval = stats.getInterval(); final List<Allele> alleles = new ArrayList<>(); final Map<String, Object> attributes = new HashMap<>(); final ArrayList<Genotype> genotypes = new ArrayList<>(); for (String sample : samples) { final GenotypeBuilder gb = new GenotypeBuilder(sample); SampleStratification sampleStat = stats.getSampleStatistics(sample); gb.attribute( GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY, sampleStat.averageCoverage(interval.size())); gb.attribute(GATKVCFConstants.LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci()); gb.attribute(GATKVCFConstants.ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci()); gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false)); genotypes.add(gb.make()); } alleles.add(refAllele); alleles.add(SYMBOLIC_ALLELE); VariantContextBuilder vcb = new VariantContextBuilder( "DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles); vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); vcb.filters(new 
LinkedHashSet<>(statusToStrings(stats.callableStatuses(), true))); attributes.put(VCFConstants.END_KEY, interval.getStop()); attributes.put(GATKVCFConstants.AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size())); attributes.put(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY, stats.gcContent()); vcb = vcb.attributes(attributes); vcb = vcb.genotypes(genotypes); vcfWriter.add(vcb.make()); } private boolean hasMissingStatuses(AbstractStratification stats) { return !stats.callableStatuses().isEmpty(); } private boolean hasMissingLoci(final IntervalStratification stats) { return thresholds.missingTargets != null && hasMissingStatuses(stats); } private void outputMissingInterval(final IntervalStratification stats) { final GenomeLoc interval = stats.getInterval(); final boolean missing[] = new boolean[interval.size()]; Arrays.fill(missing, true); for (AbstractStratification sample : stats.getElements()) { if (hasMissingStatuses(sample)) { int pos = 0; for (AbstractStratification locus : sample.getElements()) { if (locus.callableStatuses().isEmpty()) { missing[pos] = false; } pos++; } } } int start = -1; boolean insideMissing = false; for (int i = 0; i < missing.length; i++) { if (missing[i] && !insideMissing) { start = interval.getStart() + i; insideMissing = true; } else if (!missing[i] && insideMissing) { final int stop = interval.getStart() + i - 1; outputMissingInterval(interval.getContig(), start, stop); insideMissing = false; } } if (insideMissing) { outputMissingInterval(interval.getContig(), start, interval.getStop()); } } private void outputMissingInterval(final String contig, final int start, final int stop) { final PrintStream out = thresholds.missingTargets; out.println(String.format("%s:%d-%d", contig, start, stop)); } /** * Function that process a set of statuses into strings * * @param statuses the set of statuses to be converted * @return a matching set of strings */ private List<String> statusToStrings( Iterable<CallableStatus> statuses, final boolean 
isInfoField) { List<String> output = new LinkedList<>(); for (CallableStatus status : statuses) if (isInfoField || status != CallableStatus.PASS) output.add(status.name()); return output; } private IntervalStratification createIntervalStatistic(GenomeLoc interval) { return new IntervalStratification(samples, interval, thresholds); } protected static void loadAllPlugins(final ThresHolder thresholds) { for (Class<?> stat : new PluginManager<LocusMetric>(LocusMetric.class).getPlugins()) { try { final LocusMetric stats = (LocusMetric) stat.newInstance(); stats.initialize(thresholds); thresholds.locusMetricList.add(stats); } catch (Exception e) { throw new DynamicClassResolutionException(stat, e); } } for (Class<?> stat : new PluginManager<SampleMetric>(SampleMetric.class).getPlugins()) { try { final SampleMetric stats = (SampleMetric) stat.newInstance(); stats.initialize(thresholds); thresholds.sampleMetricList.add(stats); } catch (Exception e) { throw new DynamicClassResolutionException(stat, e); } } for (Class<?> stat : new PluginManager<IntervalMetric>(IntervalMetric.class).getPlugins()) { try { final IntervalMetric stats = (IntervalMetric) stat.newInstance(); stats.initialize(thresholds); thresholds.intervalMetricList.add(stats); } catch (Exception e) { throw new DynamicClassResolutionException(stat, e); } } } /** * Gets the header lines for the VCF writer * * @return A set of VCF header lines */ private static Set<VCFHeaderLine> getHeaderInfo() { Set<VCFHeaderLine> headerLines = new HashSet<>(); // INFO fields for overall data headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AVG_INTERVAL_DP_KEY)); headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY)); headerLines.add( new VCFInfoHeaderLine( "Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); // FORMAT fields for each genotype 
headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); headerLines.add( GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY)); headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.LOW_COVERAGE_LOCI)); headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.ZERO_COVERAGE_LOCI)); // FILTER fields for (CallableStatus stat : CallableStatus.values()) headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description)); return headerLines; } }
public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { // see if we need to trim common reference base from all alleles boolean trimVC; // We need to trim common reference base from all alleles in all genotypes if a ref base is // common to all alleles Allele refAllele = inputVC.getReference(); if (!inputVC.isVariant()) trimVC = false; else if (refAllele.isNull()) trimVC = false; else { trimVC = (AbstractVCFCodec.computeForwardClipping( new ArrayList<Allele>(inputVC.getAlternateAlleles()), inputVC.getReference().getDisplayString()) > 0); } // nothing to do if we don't need to trim bases if (trimVC) { List<Allele> alleles = new ArrayList<Allele>(); GenotypesContext genotypes = GenotypesContext.create(); // set the reference base for indels in the attributes Map<String, Object> attributes = new TreeMap<String, Object>(inputVC.getAttributes()); Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>(); for (Allele a : inputVC.getAlleles()) { if (a.isSymbolic()) { alleles.add(a); originalToTrimmedAlleleMap.put(a, a); } else { // get bases for current allele and create a new one with trimmed bases byte[] newBases = Arrays.copyOfRange(a.getBases(), 1, a.length()); Allele trimmedAllele = Allele.create(newBases, a.isReference()); alleles.add(trimmedAllele); originalToTrimmedAlleleMap.put(a, trimmedAllele); } } // detect case where we're trimming bases but resulting vc doesn't have any null allele. 
In // that case, we keep original representation // example: mixed records such as {TA*,TGA,TG} boolean hasNullAlleles = false; for (Allele a : originalToTrimmedAlleleMap.values()) { if (a.isNull()) hasNullAlleles = true; if (a.isReference()) refAllele = a; } if (!hasNullAlleles) return inputVC; // now we can recreate new genotypes with trimmed alleles for (final Genotype genotype : inputVC.getGenotypes()) { List<Allele> originalAlleles = genotype.getAlleles(); List<Allele> trimmedAlleles = new ArrayList<Allele>(); for (Allele a : originalAlleles) { if (a.isCalled()) trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); else trimmedAlleles.add(Allele.NO_CALL); } genotypes.add(Genotype.modifyAlleles(genotype, trimmedAlleles)); } final VariantContextBuilder builder = new VariantContextBuilder(inputVC); return builder .alleles(alleles) .genotypes(genotypes) .attributes(attributes) .referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])) .make(); } return inputVC; }
public static VariantContext createVariantContextWithPaddedAlleles( VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) { // see if we need to pad common reference base from all alleles boolean padVC; // We need to pad a VC with a common base if the length of the reference allele is less than the // length of the VariantContext. // This happens because the position of e.g. an indel is always one before the actual event (as // per VCF convention). long locLength = (inputVC.getEnd() - inputVC.getStart()) + 1; if (inputVC.hasSymbolicAlleles()) padVC = true; else if (inputVC.getReference().length() == locLength) padVC = false; else if (inputVC.getReference().length() == locLength - 1) padVC = true; else throw new IllegalArgumentException( "Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + " in contig " + inputVC.getChr() + ". Reference length must be at most one base shorter than location size"); // nothing to do if we don't need to pad bases if (padVC) { if (!inputVC.hasReferenceBaseForIndel()) throw new ReviewedStingException( "Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available."); Byte refByte = inputVC.getReferenceBaseForIndel(); List<Allele> alleles = new ArrayList<Allele>(); for (Allele a : inputVC.getAlleles()) { // get bases for current allele and create a new one with trimmed bases if (a.isSymbolic()) { alleles.add(a); } else { String newBases; if (refBaseShouldBeAppliedToEndOfAlleles) newBases = a.getBaseString() + new String(new byte[] {refByte}); else newBases = new String(new byte[] {refByte}) + a.getBaseString(); alleles.add(Allele.create(newBases, a.isReference())); } } // now we can recreate new genotypes with trimmed alleles GenotypesContext genotypes = GenotypesContext.create(inputVC.getNSamples()); for (final Genotype g : inputVC.getGenotypes()) { List<Allele> inAlleles = g.getAlleles(); List<Allele> newGenotypeAlleles = 
new ArrayList<Allele>(g.getAlleles().size()); for (Allele a : inAlleles) { if (a.isCalled()) { if (a.isSymbolic()) { newGenotypeAlleles.add(a); } else { String newBases; if (refBaseShouldBeAppliedToEndOfAlleles) newBases = a.getBaseString() + new String(new byte[] {refByte}); else newBases = new String(new byte[] {refByte}) + a.getBaseString(); newGenotypeAlleles.add(Allele.create(newBases, a.isReference())); } } else { // add no-call allele newGenotypeAlleles.add(Allele.NO_CALL); } } genotypes.add( new Genotype( g.getSampleName(), newGenotypeAlleles, g.getLog10PError(), g.getFilters(), g.getAttributes(), g.isPhased())); } return new VariantContextBuilder(inputVC).alleles(alleles).genotypes(genotypes).make(); } else return inputVC; }