@Test
public void testFixReverseComplementedGenotypes() {
  final Allele refA = Allele.create("A", true);
  final Allele altC = Allele.create("C", false);
  final GenotypesContext originalGenotypes = GenotypesContext.create(3);
  originalGenotypes.add(new GenotypeBuilder("homref").alleles(Arrays.asList(refA, refA)).make());
  originalGenotypes.add(new GenotypeBuilder("het").alleles(Arrays.asList(refA, altC)).make());
  originalGenotypes.add(new GenotypeBuilder("homvar").alleles(Arrays.asList(altC, altC)).make());

  final Allele refT = Allele.create("T", true);
  final Allele altG = Allele.create("G", false);
  final GenotypesContext expectedGenotypes = GenotypesContext.create(3);
  expectedGenotypes.add(new GenotypeBuilder("homref").alleles(Arrays.asList(refT, refT)).make());
  expectedGenotypes.add(new GenotypeBuilder("het").alleles(Arrays.asList(refT, altG)).make());
  expectedGenotypes.add(new GenotypeBuilder("homvar").alleles(Arrays.asList(altG, altG)).make());

  final Map<Allele, Allele> reverseComplementAlleleMap = new HashMap<Allele, Allele>(2);
  reverseComplementAlleleMap.put(refA, refT);
  reverseComplementAlleleMap.put(altC, altG);

  final GenotypesContext actualGenotypes =
      LiftoverVcf.fixGenotypes(originalGenotypes, reverseComplementAlleleMap);

  for (final String sample : Arrays.asList("homref", "het", "homvar")) {
    final List<Allele> expected = expectedGenotypes.get(sample).getAlleles();
    final List<Allele> actual = actualGenotypes.get(sample).getAlleles();
    Assert.assertEquals(expected.get(0), actual.get(0));
    Assert.assertEquals(expected.get(1), actual.get(1));
  }
}
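// For reference, a minimal sketch of the kind of allele swap the test above exercises. This is
// NOT LiftoverVcf's actual implementation; it only illustrates remapping each genotype's alleles
// through a reverse-complement map, leaving unmapped alleles (e.g. NO_CALL) untouched.
private static GenotypesContext remapGenotypeAlleles(
    final GenotypesContext genotypes, final Map<Allele, Allele> alleleMap) {
  final GenotypesContext fixed = GenotypesContext.create(genotypes.size());
  for (final Genotype genotype : genotypes) {
    final List<Allele> remapped = new ArrayList<>(genotype.getAlleles().size());
    for (final Allele allele : genotype.getAlleles()) {
      // fall back to the original allele when it has no mapping
      remapped.add(alleleMap.getOrDefault(allele, allele));
    }
    fixed.add(new GenotypeBuilder(genotype).alleles(remapped).make());
  }
  return fixed;
}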
/**
 * Tests the doesMaskCoverVariant() logic.
 *
 * @param contig chromosome or contig name
 * @param start variant context start
 * @param stop variant context stop
 * @param maskName mask or filter name
 * @param maskExtension number of bases beyond the mask
 * @param vcBeforeLoc if true, the variant context is before the genome location; if false, the
 *     converse is true
 * @param expectedValue the expected return value from doesMaskCoverVariant()
 */
@Test(dataProvider = "VariantMaskData")
public void TestDoesMaskCoverVariant(
    final String contig,
    final int start,
    final int stop,
    final String maskName,
    final int maskExtension,
    final boolean vcBeforeLoc,
    final boolean expectedValue) {
  // Build the VariantContext
  final byte[] allele1 = Utils.dupBytes((byte) 'A', 1);
  final byte[] allele2 = Utils.dupBytes((byte) 'T', 2);

  final List<Allele> alleles = new ArrayList<Allele>(2);
  final Allele ref = Allele.create(allele1, true);
  final Allele alt = Allele.create(allele2, false);
  alleles.add(ref);
  alleles.add(alt);

  // vcFilter and genomeLoc are assumed to be fields of the test class, set up elsewhere
  final VariantContext vc =
      new VariantContextBuilder("test", contig, start, stop, alleles).filter(vcFilter).make();

  boolean coversVariant =
      VariantFiltration.doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, vcBeforeLoc);
  Assert.assertEquals(coversVariant, expectedValue);
}
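// A hypothetical TestNG data provider matching the signature above. The values are placeholders
// for illustration only; the real "VariantMaskData" provider enumerates cases chosen to match the
// test class's genomeLoc and filter setup.
@DataProvider(name = "VariantMaskData")
public Object[][] variantMaskData() {
  return new Object[][] {
    // contig, start, stop, maskName, maskExtension, vcBeforeLoc, expectedValue
    {"chr1", 100, 100, "Mask", 0, true, true},
    {"chr1", 100, 100, "Mask", 0, false, true},
    {"chr2", 5000, 5001, "Mask", 10, true, false}
  };
}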
private Collection<VariantContext> getVariantContexts(
    RefMetaDataTracker tracker, ReferenceContext ref) {
  List<Feature> features = tracker.getValues(variants, ref.getLocus());
  List<VariantContext> VCs = new ArrayList<VariantContext>(features.size());

  for (Feature record : features) {
    if (VariantContextAdaptors.canBeConvertedToVariantContext(record)) {
      // we need to special-case the HapMap format because indels aren't handled correctly
      if (record instanceof RawHapMapFeature) {
        // is it an indel?
        RawHapMapFeature hapmap = (RawHapMapFeature) record;
        if (hapmap.getAlleles()[0].equals(RawHapMapFeature.NULL_ALLELE_STRING)
            || hapmap.getAlleles()[1].equals(RawHapMapFeature.NULL_ALLELE_STRING)) {
          // get the dbSNP record corresponding to this one (needed to help us distinguish
          // between insertions and deletions)
          VariantContext dbsnpVC = getDbsnp(hapmap.getName());
          if (dbsnpVC == null || dbsnpVC.isMixed()) continue;

          Map<String, Allele> alleleMap = new HashMap<String, Allele>(2);
          alleleMap.put(
              RawHapMapFeature.DELETION,
              Allele.create(ref.getBase(), dbsnpVC.isSimpleInsertion()));
          alleleMap.put(
              RawHapMapFeature.INSERTION,
              Allele.create(
                  (char) ref.getBase() + ((RawHapMapFeature) record).getAlleles()[1],
                  !dbsnpVC.isSimpleInsertion()));
          hapmap.setActualAlleles(alleleMap);

          // also, use the correct positioning for insertions
          hapmap.updatePosition(dbsnpVC.getStart());

          if (hapmap.getStart() < ref.getWindow().getStart()) {
            logger.warn(
                "Hapmap record at "
                    + ref.getLocus()
                    + " represents an indel too large to be converted; skipping...");
            continue;
          }
        }
      }

      // ok, we might actually be able to turn this record into a variant context
      VariantContext vc = VariantContextAdaptors.toVariantContext(variants.getName(), record, ref);

      if (vc != null) // sometimes the track has odd stuff in it that can't be converted
        VCs.add(vc);
    }
  }

  return VCs;
}
private VariantContext makeVC() {
  final GenotypesContext testGC = GenotypesContext.create(2);
  final Allele refAllele = Allele.create("A", true);
  final Allele altAllele = Allele.create("T");

  return (new VariantContextBuilder())
      .alleles(Arrays.asList(refAllele, altAllele))
      .chr("1")
      .start(15L)
      .stop(15L)
      .genotypes(testGC)
      .make();
}
/**
 * Loop over all of the reads in this likelihood map and realign each one to its most likely
 * haplotype.
 *
 * @param haplotypes the collection of haplotypes
 * @param paddedReferenceLoc the padded reference location of the active region
 */
public void realignReadsToMostLikelyHaplotype(
    final Collection<Haplotype> haplotypes, final GenomeLoc paddedReferenceLoc) {
  // we need to remap the Alleles back to the Haplotypes; inefficient, but unfortunately this is
  // currently a requirement
  final Map<Allele, Haplotype> alleleToHaplotypeMap = new HashMap<>(haplotypes.size());
  Haplotype refHaplotype = null;
  for (final Haplotype haplotype : haplotypes) {
    alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype);
    if (refHaplotype == null && haplotype.isReference()) refHaplotype = haplotype;
  }

  final Map<GATKSAMRecord, Map<Allele, Double>> newLikelihoodReadMap =
      new LinkedHashMap<>(likelihoodReadMap.size());
  for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : likelihoodReadMap.entrySet()) {
    final MostLikelyAllele bestAllele =
        PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue());
    final GATKSAMRecord alignedToRef =
        AlignmentUtils.createReadAlignedToRef(
            entry.getKey(),
            alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()),
            refHaplotype,
            paddedReferenceLoc.getStart(),
            bestAllele.isInformative());
    newLikelihoodReadMap.put(alignedToRef, entry.getValue());
  }

  likelihoodReadMap.clear();
  likelihoodReadMap.putAll(newLikelihoodReadMap);
}
static {
  final StringBuilder sb = new StringBuilder(51);
  testAlleles = new ArrayList<>(51);

  sb.append('A');
  for (int i = 0; i <= 50; i++) {
    testAlleles.add(Allele.create(sb.toString().getBytes(), i == 0));
    sb.append('A');
  }
}
/**
 * Generate testing alleles.
 *
 * <p>Basically all are random alleles given the maximum allele length.
 *
 * <p>So with a low max-allele-length and high allele-count you can force repeats.
 *
 * @param alleleCount number of alleles to generate.
 * @param maxAlleleLength the maximum length of the allele in bases.
 * @param skipIfRepeats throws a test-skip exception {@link SkipException} if the resulting
 *     allele-list has repeats, i.e. its size is less than {@code alleleCount}
 * @throws RuntimeException if {@code alleleCount} is negative or {@code maxAlleleLength} is less
 *     than 1.
 * @return never {@code null}.
 */
static AlleleList<Allele> alleleList(
    final int alleleCount, final int maxAlleleLength, final boolean skipIfRepeats) {
  final Allele[] alleles =
      AlleleListUnitTester.generateRandomAlleles(alleleCount, maxAlleleLength);
  if (alleleCount > 0) alleles[0] = Allele.create(alleles[0].getBases(), true);
  final AlleleList<Allele> alleleList = new IndexedAlleleList<>(alleles);
  if (skipIfRepeats && alleleList.alleleCount() != alleles.length)
    throw new SkipException("repeated alleles, should be infrequent");
  return alleleList;
}
/**
 * Generate testing alleles.
 *
 * <p>Basically all are random alleles given the maximum allele length.
 *
 * <p>So with a low max-allele-length and high allele-count you can force repeats.
 *
 * @param alleleCount number of alleles to generate.
 * @param maxAlleleLength the maximum length of the allele in bases.
 * @throws RuntimeException if {@code alleleCount} is negative or {@code maxAlleleLength} is less
 *     than 1.
 * @return never {@code null}.
 */
public static Allele[] generateRandomAlleles(final int alleleCount, final int maxAlleleLength) {
  if (maxAlleleLength < 1)
    throw new IllegalArgumentException("the max allele length cannot be less than 1");
  final Allele[] result = new Allele[alleleCount];
  for (int i = 0; i < alleleCount; i++) {
    final int alleleLength = rnd.nextInt(maxAlleleLength) + 1;
    result[i] = Allele.create(rndDNA.nextBases(alleleLength));
  }
  return result;
}
/** Debug method to dump the contents of this object into a string for display. */
public String toString() {
  final StringBuilder sb = new StringBuilder();

  sb.append("Alleles in map:");
  for (final Allele a : alleles) {
    sb.append(a.getDisplayString() + ",");
  }
  sb.append("\n");

  for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> el :
      getLikelihoodReadMap().entrySet()) {
    for (final Map.Entry<Allele, Double> eli : el.getValue().entrySet()) {
      sb.append(
          "Read "
              + el.getKey().getReadName()
              + ". Allele:"
              + eli.getKey().getDisplayString()
              + " has likelihood="
              + Double.toString(eli.getValue())
              + "\n");
    }
  }
  return sb.toString();
}
/**
 * Outputs all intervals that are behind the current reference locus.
 *
 * @param refLocus the current reference locus
 * @param refBase the reference base at that locus
 */
private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
  // output any intervals that were finished
  final List<GenomeLoc> toRemove = new LinkedList<>();
  for (GenomeLoc key : intervalMap.keySet()) {
    if (key.isBefore(refLocus)) {
      final IntervalStratification intervalStats = intervalMap.get(key);
      outputStatsToVCF(intervalStats, Allele.create(refBase, true));
      if (hasMissingLoci(intervalStats)) {
        outputMissingInterval(intervalStats);
      }
      toRemove.add(key);
    }
  }

  for (GenomeLoc key : toRemove) {
    intervalMap.remove(key);
  }
}
@Test
public void testPerReadAlleleLikelihoodMap() {
  final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap();

  final Allele alleleA = Allele.create("A");
  final double lik = -1.0; // ignored

  final int[] MQs = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, QualityUtils.MAPPING_QUALITY_UNAVAILABLE};
  final List<Integer> MQsList = Arrays.asList(ArrayUtils.toObject(MQs));

  // MQ 255 is excluded from the calculations; we test that here.
  final List<Integer> MQsListOK = new ArrayList<>(MQsList);
  // NOTE: if we just call remove(i), Java thinks i is an index.
  // A workaround for this overloading bogosity is to call removeAll and pass a collection
  // (casting i to (Object) would work too, but it's more error prone)
  MQsListOK.removeAll(Collections.singleton(QualityUtils.MAPPING_QUALITY_UNAVAILABLE));

  final int n1A = MQs.length;
  for (int i = 0; i < n1A; i++) {
    final GATKRead read = ArtificialReadUtils.createArtificialRead(TextCigarCodec.decode("10M"));
    read.setMappingQuality(MQs[i]);
    map.add(read, alleleA, lik);
  }

  final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap =
      Collections.singletonMap("sample1", map);
  final VariantContext vc = makeVC();
  final ReferenceContext referenceContext = null;
  final Map<String, Object> annotate =
      new RMSMappingQuality().annotate(referenceContext, vc, perReadAlleleLikelihoodMap);
  Assert.assertEquals(annotate.size(), 1, "size");
  Assert.assertEquals(
      annotate.keySet(), Collections.singleton(VCFConstants.RMS_MAPPING_QUALITY_KEY), "annots");

  final double rms = MathUtils.rms(MQsListOK); // RMS over the MQs that were not excluded
  Assert.assertEquals(
      annotate.get(VCFConstants.RMS_MAPPING_QUALITY_KEY), String.format("%.2f", rms));
}
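// For clarity, the root-mean-square the annotation is expected to report, spelled out for the
// mapping qualities used above (with MQ 255 excluded). A minimal sketch, not the annotator's
// actual code.
public static double rootMeanSquare(final List<Integer> mappingQualities) {
  double sumOfSquares = 0.0;
  for (final int mq : mappingQualities) {
    sumOfSquares += (double) mq * mq;
  }
  return Math.sqrt(sumOfSquares / mappingQualities.size());
}
// e.g. for MQs 1..10: sqrt((1 + 4 + ... + 100) / 10) = sqrt(385 / 10) ≈ 6.20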
@Override
protected Object doWork() {
  IOUtil.assertFileIsReadable(INPUT);
  IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
  IOUtil.assertFileIsReadable(CHAIN);
  IOUtil.assertFileIsWritable(OUTPUT);
  IOUtil.assertFileIsWritable(REJECT);

  ////////////////////////////////////////////////////////////////////////
  // Setup the inputs
  ////////////////////////////////////////////////////////////////////////
  final LiftOver liftOver = new LiftOver(CHAIN);
  final VCFFileReader in = new VCFFileReader(INPUT, false);

  logger.info("Loading up the target reference genome.");
  final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
  final Map<String, byte[]> refSeqs = new HashMap<>();
  for (final SAMSequenceRecord rec : walker.getSequenceDictionary().getSequences()) {
    refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases());
  }
  CloserUtil.close(walker);

  ////////////////////////////////////////////////////////////////////////
  // Setup the outputs
  ////////////////////////////////////////////////////////////////////////
  final VCFHeader inHeader = in.getFileHeader();
  final VCFHeader outHeader = new VCFHeader(inHeader);
  outHeader.setSequenceDictionary(walker.getSequenceDictionary());
  final VariantContextWriter out =
      new VariantContextWriterBuilder()
          .setOption(Options.INDEX_ON_THE_FLY)
          .setOutputFile(OUTPUT)
          .setReferenceDictionary(walker.getSequenceDictionary())
          .build();
  out.writeHeader(outHeader);

  final VariantContextWriter rejects =
      new VariantContextWriterBuilder()
          .setOutputFile(REJECT)
          .unsetOption(Options.INDEX_ON_THE_FLY)
          .build();
  final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader());
  for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line);
  rejects.writeHeader(rejectHeader);

  ////////////////////////////////////////////////////////////////////////
  // Read the input VCF, lift the records over and write to the sorting collection.
  ////////////////////////////////////////////////////////////////////////
  long failedLiftover = 0, failedAlleleCheck = 0, total = 0;
  logger.info("Lifting variants over and sorting.");

  final SortingCollection<VariantContext> sorter =
      SortingCollection.newInstance(
          VariantContext.class,
          new VCFRecordCodec(outHeader),
          outHeader.getVCFRecordComparator(),
          MAX_RECORDS_IN_RAM,
          TMP_DIR);

  ProgressLogger progress = new ProgressLogger(logger, 1000000, "read");

  for (final VariantContext ctx : in) {
    ++total;
    final Interval source =
        new Interval(
            ctx.getContig(),
            ctx.getStart(),
            ctx.getEnd(),
            false,
            ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd());
    final Interval target = liftOver.liftOver(source, 1.0);

    if (target == null) {
      rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER).make());
      failedLiftover++;
    } else {
      // Fix the alleles if we went from positive to negative strand
      final List<Allele> alleles = new ArrayList<>();
      for (final Allele oldAllele : ctx.getAlleles()) {
        if (target.isPositiveStrand() || oldAllele.isSymbolic()) {
          alleles.add(oldAllele);
        } else {
          alleles.add(
              Allele.create(
                  SequenceUtil.reverseComplement(oldAllele.getBaseString()),
                  oldAllele.isReference()));
        }
      }

      // Build the new variant context
      final VariantContextBuilder builder =
          new VariantContextBuilder(
              ctx.getSource(), target.getContig(), target.getStart(), target.getEnd(), alleles);
      builder.id(ctx.getID());
      builder.attributes(ctx.getAttributes());
      builder.genotypes(ctx.getGenotypes());
      builder.filters(ctx.getFilters());
      builder.log10PError(ctx.getLog10PError());

      // Check that the reference allele still agrees with the reference sequence
      boolean mismatchesReference = false;
      for (final Allele allele : builder.getAlleles()) {
        if (allele.isReference()) {
          final byte[] ref = refSeqs.get(target.getContig());
          final String refString =
              StringUtil.bytesToString(ref, target.getStart() - 1, target.length());
          if (!refString.equalsIgnoreCase(allele.getBaseString())) {
            mismatchesReference = true;
          }
          break;
        }
      }

      if (mismatchesReference) {
        rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make());
        failedAlleleCheck++;
      } else {
        sorter.add(builder.make());
      }
    }
    progress.record(ctx.getContig(), ctx.getStart());
  }

  final NumberFormat pfmt = new DecimalFormat("0.0000%");
  final String pct = pfmt.format((failedLiftover + failedAlleleCheck) / (double) total);
  logger.info("Processed ", total, " variants.");
  logger.info(Long.toString(failedLiftover), " variants failed to liftover.");
  logger.info(
      Long.toString(failedAlleleCheck),
      " variants lifted over but had mismatching reference alleles after lift over.");
  logger.info(pct, " of variants were not successfully lifted over and written to the output.");

  rejects.close();
  in.close();

  ////////////////////////////////////////////////////////////////////////
  // Write the sorted outputs to the final output file
  ////////////////////////////////////////////////////////////////////////
  sorter.doneAdding();
  progress = new ProgressLogger(logger, 1000000, "written");
  logger.info("Writing out sorted records to final VCF.");

  for (final VariantContext ctx : sorter) {
    out.add(ctx);
    progress.record(ctx.getContig(), ctx.getStart());
  }
  out.close();
  sorter.cleanup();

  return null;
}
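// Illustrative invocation only -- the argument names mirror the fields used above (INPUT, OUTPUT,
// CHAIN, REJECT, REFERENCE_SEQUENCE); consult the tool's own help output for the exact syntax of
// your version.
//
//   java -jar picard.jar LiftoverVcf \
//       I=input.hg19.vcf \
//       O=lifted.hg38.vcf \
//       CHAIN=hg19ToHg38.over.chain \
//       REJECT=rejected_variants.vcf \
//       R=hg38.fasta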
/**
 * Analyze coverage distribution and validate read mates per interval and per sample.
 *
 * <p>This tool is useful for diagnosing regions with bad coverage, mapping, or read mate pairs. It
 * analyzes each sample independently and aggregates results over intervals of interest.
 * Low-coverage regions can be identified by using e.g. FindCoveredIntervals with the -uncovered
 * argument.
 *
 * <h3>Input</h3>
 *
 * <ul>
 *   <li>A reference file
 *   <li>One or more input BAMs
 *   <li>One or more intervals
 * </ul>
 *
 * <h3>Output</h3>
 *
 * <p>A modified VCF detailing each interval by sample, with information for each interval
 * according to the thresholds used. Interval information includes GC content, average interval
 * depth, and callable status, among others. If you use the --missing option, you can get a second
 * output: an intervals file with the loci that have missing data. This file can then be used as
 * input to QualifyMissingIntervals for full qualification and interpretation of why the data is
 * missing.
 *
 * <h3>Usage example</h3>
 *
 * <pre>
 * java -jar GenomeAnalysisTK.jar \
 *   -T DiagnoseTargets \
 *   -R reference.fasta \
 *   -I sample1.bam \
 *   -I sample2.bam \
 *   -I sample3.bam \
 *   -L intervals.interval_list \
 *   -o output.vcf
 * </pre>
 *
 * @author Mauricio Carneiro, Roger Zurawicki
 * @since 5/8/12
 */
@DocumentedGATKFeature(
    groupName = HelpConstants.DOCS_CAT_QC,
    extraDocs = {CommandLineGATK.class})
@By(value = DataSource.READS)
@PartitionBy(PartitionType.INTERVAL)
@Downsample(by = DownsampleType.NONE)
public class DiagnoseTargets extends LocusWalker<Long, Long> {
  @Output(doc = "File to which interval statistics should be written")
  private VariantContextWriter vcfWriter = null;

  @ArgumentCollection private ThresHolder thresholds = new ThresHolder();

  // maps each interval => statistics
  private Map<GenomeLoc, IntervalStratification> intervalMap = null;

  // an iterator to go over all the intervals provided as we traverse the genome
  private PeekableIterator<GenomeLoc> intervalListIterator;

  // all the samples being processed
  private Set<String> samples = null;

  // avoid creating the symbolic allele multiple times
  private static final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false);

  // avoid creating the 'fake' ref allele for uncovered intervals multiple times
  private static final Allele UNCOVERED_ALLELE = Allele.create("A", true);

  // enough room for potential overlapping intervals plus recently finished intervals
  private static final int INITIAL_HASH_SIZE = 50;

  @Override
  public void initialize() {
    super.initialize();

    if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty())
      throw new UserException(
          "This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead.");

    intervalMap = new LinkedHashMap<>(INITIAL_HASH_SIZE);
    intervalListIterator = new PeekableIterator<>(getToolkit().getIntervals().iterator());

    // get all of the unique sample names for the VCF Header
    samples = ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
    vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples));

    // pre-load all the statistics classes because it is costly to operate on the JVM and we only
    // want to do it once.
    loadAllPlugins(thresholds);
  }

  @Override
  public Long map(
      final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
    GenomeLoc refLocus = ref.getLocus();

    // process and remove any intervals in the map that no longer overlap the current locus,
    // and add all new intervals that may overlap this reference locus
    addNewOverlappingIntervals(refLocus);
    outputFinishedIntervals(refLocus, ref.getBase());

    // at this point, all intervals in intervalMap overlap with this locus, so update all of them
    for (IntervalStratification intervalStratification : intervalMap.values())
      intervalStratification.addLocus(context, ref);

    return 1L;
  }

  @Override
  public Long reduceInit() {
    return 0L;
  }

  /**
   * Not sure what we are going to do here
   *
   * @param value result of the map.
   * @param sum accumulator for the reduce.
   * @return a long
   */
  @Override
  public Long reduce(Long value, Long sum) {
    return sum + value;
  }

  /**
   * Process all remaining intervals
   *
   * @param result number of loci processed by the walker
   */
  @Override
  public void onTraversalDone(final Long result) {
    for (GenomeLoc interval : intervalMap.keySet())
      outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE);

    GenomeLoc interval = intervalListIterator.peek();
    while (interval != null) {
      outputStatsToVCF(createIntervalStatistic(interval), UNCOVERED_ALLELE);
      intervalListIterator.next();
      interval = intervalListIterator.peek();
    }

    if (thresholds.missingTargets != null) {
      thresholds.missingTargets.close();
    }
  }

  /**
   * Outputs all intervals that are behind the current reference locus
   *
   * @param refLocus the current reference locus
   * @param refBase the reference base at that locus
   */
  private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
    // output any intervals that were finished
    final List<GenomeLoc> toRemove = new LinkedList<>();
    for (GenomeLoc key : intervalMap.keySet()) {
      if (key.isBefore(refLocus)) {
        final IntervalStratification intervalStats = intervalMap.get(key);
        outputStatsToVCF(intervalStats, Allele.create(refBase, true));
        if (hasMissingLoci(intervalStats)) {
          outputMissingInterval(intervalStats);
        }
        toRemove.add(key);
      }
    }

    for (GenomeLoc key : toRemove) {
      intervalMap.remove(key);
    }
  }

  /**
   * Adds all intervals that overlap the current reference locus to the intervalMap
   *
   * @param refLocus the current reference locus
   */
  private void addNewOverlappingIntervals(final GenomeLoc refLocus) {
    GenomeLoc interval = intervalListIterator.peek();
    while (interval != null && !interval.isPast(refLocus)) {
      intervalMap.put(interval, createIntervalStatistic(interval));
      intervalListIterator.next();
      interval = intervalListIterator.peek();
    }
  }

  /**
   * Takes the interval, finds it in the stash, prints it to the VCF
   *
   * @param stats The statistics of the interval
   * @param refAllele the reference allele
   */
  private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) {
    GenomeLoc interval = stats.getInterval();

    final List<Allele> alleles = new ArrayList<>();
    final Map<String, Object> attributes = new HashMap<>();
    final ArrayList<Genotype> genotypes = new ArrayList<>();

    for (String sample : samples) {
      final GenotypeBuilder gb = new GenotypeBuilder(sample);

      SampleStratification sampleStat = stats.getSampleStatistics(sample);
      gb.attribute(
          GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY,
          sampleStat.averageCoverage(interval.size()));
      gb.attribute(GATKVCFConstants.LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci());
      gb.attribute(GATKVCFConstants.ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci());
      gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false));

      genotypes.add(gb.make());
    }
    alleles.add(refAllele);
    alleles.add(SYMBOLIC_ALLELE);

    VariantContextBuilder vcb =
        new VariantContextBuilder(
            "DiagnoseTargets",
            interval.getContig(),
            interval.getStart(),
            interval.getStop(),
            alleles);

    vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR);
    vcb.filters(new LinkedHashSet<>(statusToStrings(stats.callableStatuses(), true)));

    attributes.put(VCFConstants.END_KEY, interval.getStop());
    attributes.put(GATKVCFConstants.AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size()));
    attributes.put(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY, stats.gcContent());

    vcb = vcb.attributes(attributes);
    vcb = vcb.genotypes(genotypes);

    vcfWriter.add(vcb.make());
  }

  private boolean hasMissingStatuses(AbstractStratification stats) {
    return !stats.callableStatuses().isEmpty();
  }

  private boolean hasMissingLoci(final IntervalStratification stats) {
    return thresholds.missingTargets != null && hasMissingStatuses(stats);
  }

  private void outputMissingInterval(final IntervalStratification stats) {
    final GenomeLoc interval = stats.getInterval();
    final boolean[] missing = new boolean[interval.size()];
    Arrays.fill(missing, true);

    for (AbstractStratification sample : stats.getElements()) {
      if (hasMissingStatuses(sample)) {
        int pos = 0;
        for (AbstractStratification locus : sample.getElements()) {
          if (locus.callableStatuses().isEmpty()) {
            missing[pos] = false;
          }
          pos++;
        }
      }
    }

    int start = -1;
    boolean insideMissing = false;
    for (int i = 0; i < missing.length; i++) {
      if (missing[i] && !insideMissing) {
        start = interval.getStart() + i;
        insideMissing = true;
      } else if (!missing[i] && insideMissing) {
        final int stop = interval.getStart() + i - 1;
        outputMissingInterval(interval.getContig(), start, stop);
        insideMissing = false;
      }
    }
    if (insideMissing) {
      outputMissingInterval(interval.getContig(), start, interval.getStop());
    }
  }

  private void outputMissingInterval(final String contig, final int start, final int stop) {
    final PrintStream out = thresholds.missingTargets;
    out.println(String.format("%s:%d-%d", contig, start, stop));
  }

  /**
   * Converts a set of statuses into strings
   *
   * @param statuses the set of statuses to be converted
   * @param isInfoField whether the strings are destined for the INFO field (keep PASS) or the
   *     genotype FILTER field (drop PASS)
   * @return a matching list of strings
   */
  private List<String> statusToStrings(
      Iterable<CallableStatus> statuses, final boolean isInfoField) {
    List<String> output = new LinkedList<>();

    for (CallableStatus status : statuses)
      if (isInfoField || status != CallableStatus.PASS) output.add(status.name());

    return output;
  }

  private IntervalStratification createIntervalStatistic(GenomeLoc interval) {
    return new IntervalStratification(samples, interval, thresholds);
  }

  protected static void loadAllPlugins(final ThresHolder thresholds) {
    for (Class<?> stat : new PluginManager<LocusMetric>(LocusMetric.class).getPlugins()) {
      try {
        final LocusMetric stats = (LocusMetric) stat.newInstance();
        stats.initialize(thresholds);
        thresholds.locusMetricList.add(stats);
      } catch (Exception e) {
        throw new DynamicClassResolutionException(stat, e);
      }
    }

    for (Class<?> stat : new PluginManager<SampleMetric>(SampleMetric.class).getPlugins()) {
      try {
        final SampleMetric stats = (SampleMetric) stat.newInstance();
        stats.initialize(thresholds);
        thresholds.sampleMetricList.add(stats);
      } catch (Exception e) {
        throw new DynamicClassResolutionException(stat, e);
      }
    }

    for (Class<?> stat : new PluginManager<IntervalMetric>(IntervalMetric.class).getPlugins()) {
      try {
        final IntervalMetric stats = (IntervalMetric) stat.newInstance();
        stats.initialize(thresholds);
        thresholds.intervalMetricList.add(stats);
      } catch (Exception e) {
        throw new DynamicClassResolutionException(stat, e);
      }
    }
  }

  /**
   * Gets the header lines for the VCF writer
   *
   * @return A set of VCF header lines
   */
  private static Set<VCFHeaderLine> getHeaderInfo() {
    Set<VCFHeaderLine> headerLines = new HashSet<>();

    // INFO fields for overall data
    headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
    headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AVG_INTERVAL_DP_KEY));
    headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY));
    headerLines.add(
        new VCFInfoHeaderLine(
            "Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));

    // FORMAT fields for each genotype
    headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
    headerLines.add(
        GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY));
    headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.LOW_COVERAGE_LOCI));
    headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.ZERO_COVERAGE_LOCI));

    // FILTER fields
    for (CallableStatus stat : CallableStatus.values())
      headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description));

    return headerLines;
  }
}
/**
 * Helper class for unit-test classes that exercise implementations of AlleleList.
 *
 * @author Valentin Ruano-Rubio <[email protected]>
 */
public class AlleleListUnitTester {

  private static final Random rnd = Utils.getRandomGenerator();
  private static final RandomDNA rndDNA = new RandomDNA(rnd);

  /**
   * Test that the contents of an allele-list are the ones expected.
   *
   * <p>This method performs various consistency checks involving all the {@link
   * org.broadinstitute.gatk.utils.genotyper.AlleleList} interface methods. Therefore calling this
   * method is equivalent to a thorough check of the {@link
   * org.broadinstitute.gatk.utils.genotyper.AlleleList} aspect of the {@code actual} argument.
   *
   * @param actual the allele-list to assess.
   * @param expected the expected allele-list.
   * @throws IllegalArgumentException if {@code expected} is {@code null} or contains {@code
   *     null}s, which is an indication of a bug in the testing code.
   * @throws RuntimeException if there is some testing assertion exception, which is an indication
   *     of an actual bug in the code being tested.
   */
  public static <A extends Allele> void assertAlleleList(
      final AlleleList<A> actual, final List<A> expected) {
    if (expected == null) throw new IllegalArgumentException("the expected list cannot be null");
    final Set<A> expectedAlleleSet = new HashSet<>(expected.size());
    Assert.assertNotNull(actual);
    Assert.assertEquals(actual.alleleCount(), expected.size());
    for (int i = 0; i < expected.size(); i++) {
      final A expectedAllele = expected.get(i);
      if (expectedAllele == null)
        throw new IllegalArgumentException("the expected allele cannot be null");
      if (expectedAllele.equals(NEVER_USE_ALLELE))
        throw new IllegalArgumentException("you cannot use the forbidden allele");
      if (expectedAlleleSet.contains(expected.get(i)))
        throw new IllegalArgumentException(
            "repeated allele in the expected list, this is a test bug");
      final A actualAllele = actual.alleleAt(i);
      Assert.assertNotNull(actualAllele, "allele cannot be null");
      Assert.assertFalse(
          expectedAlleleSet.contains(actualAllele), "repeated allele: " + actualAllele);
      Assert.assertEquals(actualAllele, expectedAllele, "wrong allele order; index = " + i);
      Assert.assertEquals(actual.alleleIndex(actualAllele), i, "allele index mismatch");
      expectedAlleleSet.add(actualAllele);
    }
    Assert.assertEquals(actual.alleleIndex((A) NEVER_USE_ALLELE), -1);
  }

  /** Safe to assume that this allele will never be used. */
  private static final Allele NEVER_USE_ALLELE =
      Allele.create(
          new String("ACTGACTGACTGACTGACTGACTGACTGACTGGTCAGTCAGTCAGTCAGTCAGTCA").getBytes(),
          false);

  /**
   * Generate testing alleles.
   *
   * <p>Basically all are random alleles given the maximum allele length.
   *
   * <p>So with a low max-allele-length and high allele-count you can force repeats.
   *
   * @param alleleCount number of alleles to generate.
   * @param maxAlleleLength the maximum length of the allele in bases.
   * @throws RuntimeException if {@code alleleCount} is negative or {@code maxAlleleLength} is less
   *     than 1.
   * @return never {@code null}.
   */
  public static Allele[] generateRandomAlleles(final int alleleCount, final int maxAlleleLength) {
    if (maxAlleleLength < 1)
      throw new IllegalArgumentException("the max allele length cannot be less than 1");
    final Allele[] result = new Allele[alleleCount];
    for (int i = 0; i < alleleCount; i++) {
      final int alleleLength = rnd.nextInt(maxAlleleLength) + 1;
      result[i] = Allele.create(rndDNA.nextBases(alleleLength));
    }
    return result;
  }

  /**
   * Generate testing alleles.
   *
   * <p>Basically all are random alleles given the maximum allele length.
   *
   * <p>So with a low max-allele-length and high allele-count you can force repeats.
   *
   * @param alleleCount number of alleles to generate.
   * @param maxAlleleLength the maximum length of the allele in bases.
   * @param skipIfRepeats throws a test-skip exception {@link SkipException} if the resulting
   *     allele-list has repeats, i.e. its size is less than {@code alleleCount}
   * @throws RuntimeException if {@code alleleCount} is negative or {@code maxAlleleLength} is less
   *     than 1.
   * @return never {@code null}.
   */
  static AlleleList<Allele> alleleList(
      final int alleleCount, final int maxAlleleLength, final boolean skipIfRepeats) {
    final Allele[] alleles =
        AlleleListUnitTester.generateRandomAlleles(alleleCount, maxAlleleLength);
    if (alleleCount > 0) alleles[0] = Allele.create(alleles[0].getBases(), true);
    final AlleleList<Allele> alleleList = new IndexedAlleleList<>(alleles);
    if (skipIfRepeats && alleleList.alleleCount() != alleles.length)
      throw new SkipException("repeated alleles, should be infrequent");
    return alleleList;
  }
}
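// A brief illustration of how a test might use the helpers above (hypothetical test method, not
// part of the original class): build a random allele-list and run the consistency checks over its
// own contents.
@Test
public void testRandomAlleleListRoundTrip() {
  // alleleList() promotes the first generated allele to the reference allele
  final AlleleList<Allele> alleleList = AlleleListUnitTester.alleleList(5, 10, true);

  final List<Allele> expected = new ArrayList<>(alleleList.alleleCount());
  for (int i = 0; i < alleleList.alleleCount(); i++) {
    expected.add(alleleList.alleleAt(i));
  }

  // the consistency checks in assertAlleleList() should pass against the list's own contents
  AlleleListUnitTester.assertAlleleList(alleleList, expected);
}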
@Override
protected void doWork(VcfIterator r, VariantContextWriter w) throws IOException {
  long nChanged = 0L;
  final String TAG = "INDELFIXED";
  VCFHeader header = r.getHeader();

  VCFHeader h2 = new VCFHeader(header.getMetaDataInInputOrder(), header.getSampleNamesInOrder());
  h2.addMetaDataLine(
      new VCFInfoHeaderLine(TAG, 1, VCFHeaderLineType.String, "Fix Indels for @SolenaLS."));
  w.writeHeader(h2);

  final Pattern dna = Pattern.compile("[ATGCatgc]+");
  while (r.hasNext()) {
    VariantContext ctx = r.next();
    VariantContextBuilder b = new VariantContextBuilder(ctx);
    List<Allele> alleles = ctx.getAlternateAlleles();
    if (alleles.size() != 1
        || !dna.matcher(ctx.getReference().getBaseString()).matches()
        || !dna.matcher(alleles.get(0).getBaseString()).matches()) {
      w.add(ctx);
      continue;
    }

    StringBuffer ref = new StringBuffer(ctx.getReference().getBaseString().toUpperCase());
    StringBuffer alt = new StringBuffer(alleles.get(0).getBaseString().toUpperCase());

    int start = ctx.getStart();
    int end = ctx.getEnd();

    boolean changed = false;

    /* trim on the right side */
    // REF=TGCTGCGGGGGCCGCTGCGGGGG ALT=TGCTGCGGGGG
    while (alt.length() > 1
        && alt.length() < ref.length()
        && ref.charAt(ref.length() - 1) == alt.charAt(alt.length() - 1)) {
      changed = true;
      ref.setLength(ref.length() - 1);
      alt.deleteCharAt(alt.length() - 1);
      end--;
    }

    // REF=TGCTGCGGGGG ALT=TGCTGCGGGGGCCGCTGCGGGGG
    while (ref.length() > 1
        && alt.length() > ref.length()
        && ref.charAt(ref.length() - 1) == alt.charAt(alt.length() - 1)) {
      changed = true;
      ref.setLength(ref.length() - 1);
      alt.deleteCharAt(alt.length() - 1);
      end--;
    }

    /* trim on the left side */
    // REF=TGCTGCGGGGGCCGCTGCGGGGG ALT=TGCTGCGGGGG
    while (alt.length() > 1 && alt.length() < ref.length() && ref.charAt(0) == alt.charAt(0)) {
      changed = true;
      ref.deleteCharAt(0);
      alt.deleteCharAt(0);
      start++;
    }

    // REF=TGCTGCGGGGG ALT=TGCTGCGGGGGCCGCTGCGGGGG
    while (ref.length() > 1 && alt.length() > ref.length() && ref.charAt(0) == alt.charAt(0)) {
      changed = true;
      ref.deleteCharAt(0);
      alt.deleteCharAt(0);
      start++;
    }

    if (!changed) {
      w.add(ctx);
      continue;
    }

    /*
    LOG.info(line);
    LOG.info("ctx.getStart() " + ctx.getStart());
    LOG.info("ctx.getEnd() " + ctx.getEnd());
    LOG.info("start " + start);
    LOG.info("end " + end);
    LOG.info("ref " + ref.toString());
    LOG.info("alt " + alt.toString());
    */

    Allele newRef = Allele.create(ref.toString(), true);
    Allele newAlt = Allele.create(alt.toString(), false);

    Allele[] newalleles = new Allele[] {newRef, newAlt};

    b.attribute(
        TAG,
        ctx.getReference().getBaseString()
            + "|"
            + alleles.get(0).getBaseString()
            + "|"
            + ctx.getStart());
    b.start(start);
    b.stop(end);
    b.alleles(Arrays.asList(newalleles));

    nChanged++;

    VariantContext ctx2 = b.make();
    try {
      w.add(ctx2);
    } catch (TribbleException err) {
      error(err, "Cannot convert new context:" + ctx2 + " old context:" + ctx);
      w.add(ctx);
    }
  }
  info("indels changed:" + nChanged);
}
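// Worked example of the trimming above on a hypothetical record (illustration only):
//   input : POS=100  REF=ACGTT  ALT=ACG
//   right : last bases differ (T vs G), so nothing is trimmed from the right
//   left  : the shared prefix "AC" is removed one base at a time, advancing POS each step
//   output: POS=102  REF=GTT    ALT=G      (a 2-bp deletion; END stays at 104)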