Example #1
  @Test
  public void testFixReverseComplementedGenotypes() {

    final Allele refA = Allele.create("A", true);
    final Allele altC = Allele.create("C", false);
    final GenotypesContext originalGenotypes = GenotypesContext.create(3);
    originalGenotypes.add(new GenotypeBuilder("homref").alleles(Arrays.asList(refA, refA)).make());
    originalGenotypes.add(new GenotypeBuilder("het").alleles(Arrays.asList(refA, altC)).make());
    originalGenotypes.add(new GenotypeBuilder("homvar").alleles(Arrays.asList(altC, altC)).make());

    final Allele refT = Allele.create("T", true);
    final Allele altG = Allele.create("G", false);
    final GenotypesContext expectedGenotypes = GenotypesContext.create(3);
    expectedGenotypes.add(new GenotypeBuilder("homref").alleles(Arrays.asList(refT, refT)).make());
    expectedGenotypes.add(new GenotypeBuilder("het").alleles(Arrays.asList(refT, altG)).make());
    expectedGenotypes.add(new GenotypeBuilder("homvar").alleles(Arrays.asList(altG, altG)).make());

    final Map<Allele, Allele> reverseComplementAlleleMap = new HashMap<Allele, Allele>(2);
    reverseComplementAlleleMap.put(refA, refT);
    reverseComplementAlleleMap.put(altC, altG);
    final GenotypesContext actualGenotypes =
        LiftoverVcf.fixGenotypes(originalGenotypes, reverseComplementAlleleMap);

    for (final String sample : Arrays.asList("homref", "het", "homvar")) {
      final List<Allele> expected = expectedGenotypes.get(sample).getAlleles();
      final List<Allele> actual = actualGenotypes.get(sample).getAlleles();
      Assert.assertEquals(expected.get(0), actual.get(0));
      Assert.assertEquals(expected.get(1), actual.get(1));
    }
  }
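The fixGenotypes method under test is not shown in this excerpt. A minimal sketch of the behavior the assertions imply (swap each genotype allele for its mapped reverse complement) could look like the following; the pass-through fallback for unmapped alleles is an assumption, not the confirmed Picard implementation:

  static GenotypesContext fixGenotypes(
      final GenotypesContext originals, final Map<Allele, Allele> reverseComplementAlleleMap) {
    final GenotypesContext fixedGenotypes = GenotypesContext.create(originals.size());
    for (final Genotype genotype : originals) {
      final List<Allele> fixedAlleles = new ArrayList<>();
      for (final Allele allele : genotype.getAlleles()) {
        // assumption: alleles absent from the map (e.g. NO_CALL) pass through unchanged
        fixedAlleles.add(reverseComplementAlleleMap.getOrDefault(allele, allele));
      }
      fixedGenotypes.add(new GenotypeBuilder(genotype).alleles(fixedAlleles).make());
    }
    return fixedGenotypes;
  }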
  /**
   * Test doesMaskCoverVariant() logic
   *
   * @param contig chromosome or contig name
   * @param start variant context start
   * @param stop variant context stop
   * @param maskName mask or filter name
   * @param maskExtension bases beyond the mask
   * @param vcBeforeLoc if true, the variant context lies before the genome location; if false,
   *     the genome location lies before the variant context
   * @param expectedValue the expected return value of doesMaskCoverVariant()
   */
  @Test(dataProvider = "VariantMaskData")
  public void testDoesMaskCoverVariant(
      final String contig,
      final int start,
      final int stop,
      final String maskName,
      final int maskExtension,
      final boolean vcBeforeLoc,
      final boolean expectedValue) {

    // Build VariantContext
    final byte[] allele1 = Utils.dupBytes((byte) 'A', 1);
    final byte[] allele2 = Utils.dupBytes((byte) 'T', 2);

    final List<Allele> alleles = new ArrayList<Allele>(2);
    final Allele ref = Allele.create(allele1, true);
    final Allele alt = Allele.create(allele2, false);
    alleles.add(ref);
    alleles.add(alt);

    final VariantContext vc =
        new VariantContextBuilder("test", contig, start, stop, alleles).filter(vcFilter).make();

    boolean coversVariant =
        VariantFiltration.doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, vcBeforeLoc);
    Assert.assertEquals(coversVariant, expectedValue);
  }
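The "VariantMaskData" provider, like the vcFilter and genomeLoc fields used above, is not part of this excerpt. A hypothetical TestNG provider sketching the parameter layout (the rows are invented for illustration):

  @DataProvider(name = "VariantMaskData")
  public Object[][] variantMaskData() {
    // contig, start, stop, maskName, maskExtension, vcBeforeLoc, expectedValue
    return new Object[][] {
      {"chr1", 100, 100, "Mask", 0, true, true},
      {"chr1", 100, 120, "Mask", 10, true, true},
      {"chr2", 500, 500, "Mask", 0, false, false},
    };
  }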
  private Collection<VariantContext> getVariantContexts(
      RefMetaDataTracker tracker, ReferenceContext ref) {

    List<Feature> features = tracker.getValues(variants, ref.getLocus());
    List<VariantContext> VCs = new ArrayList<VariantContext>(features.size());

    for (Feature record : features) {
      if (VariantContextAdaptors.canBeConvertedToVariantContext(record)) {
        // we need to special case the HapMap format because indels aren't handled correctly
        if (record instanceof RawHapMapFeature) {

          // is it an indel?
          RawHapMapFeature hapmap = (RawHapMapFeature) record;
          if (hapmap.getAlleles()[0].equals(RawHapMapFeature.NULL_ALLELE_STRING)
              || hapmap.getAlleles()[1].equals(RawHapMapFeature.NULL_ALLELE_STRING)) {
            // get the dbsnp object corresponding to this record (needed to help us distinguish
            // between insertions and deletions)
            VariantContext dbsnpVC = getDbsnp(hapmap.getName());
            if (dbsnpVC == null || dbsnpVC.isMixed()) continue;

            Map<String, Allele> alleleMap = new HashMap<String, Allele>(2);
            alleleMap.put(
                RawHapMapFeature.DELETION,
                Allele.create(ref.getBase(), dbsnpVC.isSimpleInsertion()));
            alleleMap.put(
                RawHapMapFeature.INSERTION,
                Allele.create(
                    (char) ref.getBase() + hapmap.getAlleles()[1],
                    !dbsnpVC.isSimpleInsertion()));
            hapmap.setActualAlleles(alleleMap);

            // also, use the correct positioning for insertions
            hapmap.updatePosition(dbsnpVC.getStart());

            if (hapmap.getStart() < ref.getWindow().getStart()) {
              logger.warn(
                  "Hapmap record at "
                      + ref.getLocus()
                      + " represents an indel too large to be converted; skipping...");
              continue;
            }
          }
        }

        // ok, we might actually be able to turn this record into a variant context
        VariantContext vc =
            VariantContextAdaptors.toVariantContext(variants.getName(), record, ref);

        // sometimes the track has odd stuff in it that can't be converted
        if (vc != null) {
          VCs.add(vc);
        }
      }
    }

    return VCs;
  }
  private VariantContext makeVC() {
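    // Builds a minimal chr1:15 A->T site with an empty genotypes context; the
    // RMSMappingQuality test below needs only this site-level context.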
    final GenotypesContext testGC = GenotypesContext.create(2);
    final Allele refAllele = Allele.create("A", true);
    final Allele altAllele = Allele.create("T");

    return (new VariantContextBuilder())
        .alleles(Arrays.asList(refAllele, altAllele))
        .chr("1")
        .start(15L)
        .stop(15L)
        .genotypes(testGC)
        .make();
  }
  /**
   * Loop over all of the reads in this likelihood map and realign each to its most likely haplotype
   *
   * @param haplotypes the collection of haplotypes
   * @param paddedReferenceLoc the active region
   */
  public void realignReadsToMostLikelyHaplotype(
      final Collection<Haplotype> haplotypes, final GenomeLoc paddedReferenceLoc) {

    // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a
    // requirement currently
    final Map<Allele, Haplotype> alleleToHaplotypeMap = new HashMap<>(haplotypes.size());
    Haplotype refHaplotype = null;
    for (final Haplotype haplotype : haplotypes) {
      alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype);
      if (refHaplotype == null && haplotype.isReference()) refHaplotype = haplotype;
    }

    final Map<GATKSAMRecord, Map<Allele, Double>> newLikelihoodReadMap =
        new LinkedHashMap<>(likelihoodReadMap.size());
    for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : likelihoodReadMap.entrySet()) {
      final MostLikelyAllele bestAllele =
          PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue());
      final GATKSAMRecord alignedToRef =
          AlignmentUtils.createReadAlignedToRef(
              entry.getKey(),
              alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()),
              refHaplotype,
              paddedReferenceLoc.getStart(),
              bestAllele.isInformative());
      newLikelihoodReadMap.put(alignedToRef, entry.getValue());
    }

    likelihoodReadMap.clear();
    likelihoodReadMap.putAll(newLikelihoodReadMap);
  }
 static {
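   // Build 51 poly-A test alleles of lengths 1 through 51; only the first
   // (single-base) allele is marked as the reference.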
   final StringBuilder sb = new StringBuilder(51);
   testAlleles = new ArrayList<>(51);
   sb.append('A');
   for (int i = 0; i <= 50; i++) {
     testAlleles.add(Allele.create(sb.toString().getBytes(), i == 0));
     sb.append('A');
   }
 }
 /**
  * Generate testing alleles.
  *
  * <p>All alleles are random, subject to the given maximum allele length.
  *
  * <p>With a low max-allele-length and a high allele-count you can force repeats.
  *
  * @param alleleCount number of alleles to generate.
  * @param maxAlleleLength the maximum length of the allele in bases.
  * @param skipIfRepeats throw a test-skip exception ({@link SkipException}) if the resulting
  *     allele-list has repeats, i.e., its size is less than {@code alleleCount}
  * @throws RuntimeException if {@code alleleCount} is negative or {@code maxAlleleLength} is less
  *     than 1.
  * @return never {@code null}.
  */
 static AlleleList<Allele> alleleList(
     final int alleleCount, final int maxAlleleLength, final boolean skipIfRepeats) {
   final Allele[] alleles =
       AlleleListUnitTester.generateRandomAlleles(alleleCount, maxAlleleLength);
   if (alleleCount > 0) alleles[0] = Allele.create(alleles[0].getBases(), true);
   final AlleleList<Allele> alleleList = new IndexedAlleleList<>(alleles);
   if (skipIfRepeats && alleleList.alleleCount() != alleles.length)
     throw new SkipException("repeated alleles, should be infrequent");
   return alleleList;
 }
 /**
  * Generate testing alleles.
  *
  * <p>All alleles are random, subject to the given maximum allele length.
  *
  * <p>With a low max-allele-length and a high allele-count you can force repeats.
  *
  * @param alleleCount number of alleles to generate.
  * @param maxAlleleLength the maximum length of the allele in bases.
  * @throws RuntimeException if {@code alleleCount} is negative or {@code maxAlleleLength} is less
  *     than 1.
  * @return never {@code null}.
  */
 public static Allele[] generateRandomAlleles(final int alleleCount, final int maxAlleleLength) {
   if (maxAlleleLength < 1)
     throw new IllegalArgumentException("the max allele length cannot be less than 1");
   final Allele[] result = new Allele[alleleCount];
   for (int i = 0; i < alleleCount; i++) {
     final int alleleLength = rnd.nextInt(maxAlleleLength) + 1;
     result[i] = Allele.create(rndDNA.nextBases(alleleLength));
   }
   return result;
 }
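A hedged usage sketch for this helper (the test name and assertions are illustrative; it assumes TestNG's @Test and Assert, as used elsewhere in these examples):

  @Test
  public void testGenerateRandomAlleles() {
    final Allele[] alleles = AlleleListUnitTester.generateRandomAlleles(5, 4);
    Assert.assertEquals(alleles.length, 5);
    for (final Allele allele : alleles) {
      // each allele is 1..maxAlleleLength bases long and non-reference
      Assert.assertTrue(allele.length() >= 1 && allele.length() <= 4);
      Assert.assertFalse(allele.isReference());
    }
  }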
  /** Debug method that dumps the contents of this object into a string for display. */
  public String toString() {
    final StringBuilder sb = new StringBuilder();

    sb.append("Alelles in map:");
    for (final Allele a : alleles) {
      sb.append(a.getDisplayString() + ",");
    }
    sb.append("\n");
    for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> el :
        getLikelihoodReadMap().entrySet()) {
      for (final Map.Entry<Allele, Double> eli : el.getValue().entrySet()) {
        sb.append(
            "Read "
                + el.getKey().getReadName()
                + ". Allele:"
                + eli.getKey().getDisplayString()
                + " has likelihood="
                + Double.toString(eli.getValue())
                + "\n");
      }
    }
    return sb.toString();
  }
 /**
  * Outputs all intervals that are behind the current reference locus
  *
  * @param refLocus the current reference locus
  * @param refBase the reference allele
  */
 private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
   // output any intervals that were finished
   final List<GenomeLoc> toRemove = new LinkedList<>();
   for (GenomeLoc key : intervalMap.keySet()) {
     if (key.isBefore(refLocus)) {
       final IntervalStratification intervalStats = intervalMap.get(key);
       outputStatsToVCF(intervalStats, Allele.create(refBase, true));
       if (hasMissingLoci(intervalStats)) {
         outputMissingInterval(intervalStats);
       }
       toRemove.add(key);
     }
   }
   for (GenomeLoc key : toRemove) {
     intervalMap.remove(key);
   }
 }
  @Test
  public void testPerReadAlleleLikelihoodMap() {
    final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap();

    final Allele alleleA = Allele.create("A");
    final double lik = -1.0; // ignored

    final int[] MQs = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, QualityUtils.MAPPING_QUALITY_UNAVAILABLE};
    final List<Integer> MQsList = Arrays.asList(ArrayUtils.toObject(MQs));

    // Reads with MQ 255 (unavailable) are excluded from the calculation; we test that here.
    final List<Integer> MQsListOK = new ArrayList<>(MQsList);
    // NOTE: if we just call remove(i), Java thinks i is an index.
    // A workaround for this overloading bogosity is to call removeAll and pass a collection
    // (casting i to (Object) would work too, but it is more error-prone)
    MQsListOK.removeAll(Collections.singleton(QualityUtils.MAPPING_QUALITY_UNAVAILABLE));

    final int n1A = MQs.length;
    for (int i = 0; i < n1A; i++) {
      final GATKRead read = ArtificialReadUtils.createArtificialRead(TextCigarCodec.decode("10M"));
      read.setMappingQuality(MQs[i]);
      map.add(read, alleleA, lik);
    }

    final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap =
        Collections.singletonMap("sample1", map);
    final VariantContext vc = makeVC();
    final ReferenceContext referenceContext = null;
    final Map<String, Object> annotate =
        new RMSMappingQuality().annotate(referenceContext, vc, perReadAlleleLikelihoodMap);
    Assert.assertEquals(annotate.size(), 1, "size");
    Assert.assertEquals(
        annotate.keySet(), Collections.singleton(VCFConstants.RMS_MAPPING_QUALITY_KEY), "annots");
    final double rms = MathUtils.rms(MQsListOK); // RMS over the reads whose MQ is available
    Assert.assertEquals(
        annotate.get(VCFConstants.RMS_MAPPING_QUALITY_KEY), String.format("%.2f", rms));
  }
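MathUtils.rms is not shown here; the assertion relies on the standard root-mean-square. A sketch of that computation (the real method's signature may differ):

  // rms(x) = sqrt((x_1^2 + ... + x_n^2) / n)
  static double rms(final List<Integer> values) {
    double sumOfSquares = 0.0;
    for (final int v : values) {
      sumOfSquares += (double) v * v;
    }
    return Math.sqrt(sumOfSquares / values.size());
  }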
Example #12
  @Override
  protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    IOUtil.assertFileIsReadable(CHAIN);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsWritable(REJECT);

    ////////////////////////////////////////////////////////////////////////
    // Setup the inputs
    ////////////////////////////////////////////////////////////////////////
    final LiftOver liftOver = new LiftOver(CHAIN);
    final VCFFileReader in = new VCFFileReader(INPUT, false);

    logger.info("Loading up the target reference genome.");
    final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final Map<String, byte[]> refSeqs = new HashMap<>();
    for (final SAMSequenceRecord rec : walker.getSequenceDictionary().getSequences()) {
      refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases());
    }
    CloserUtil.close(walker);

    ////////////////////////////////////////////////////////////////////////
    // Setup the outputs
    ////////////////////////////////////////////////////////////////////////
    final VCFHeader inHeader = in.getFileHeader();
    final VCFHeader outHeader = new VCFHeader(inHeader);
    outHeader.setSequenceDictionary(walker.getSequenceDictionary());
    final VariantContextWriter out =
        new VariantContextWriterBuilder()
            .setOption(Options.INDEX_ON_THE_FLY)
            .setOutputFile(OUTPUT)
            .setReferenceDictionary(walker.getSequenceDictionary())
            .build();
    out.writeHeader(outHeader);

    final VariantContextWriter rejects =
        new VariantContextWriterBuilder()
            .setOutputFile(REJECT)
            .unsetOption(Options.INDEX_ON_THE_FLY)
            .build();
    final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader());
    for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line);
    rejects.writeHeader(rejectHeader);

    ////////////////////////////////////////////////////////////////////////
    // Read the input VCF, lift the records over and write to the sorting
    // collection.
    ////////////////////////////////////////////////////////////////////////
    long failedLiftover = 0, failedAlleleCheck = 0, total = 0;
    logger.info("Lifting variants over and sorting.");

    final SortingCollection<VariantContext> sorter =
        SortingCollection.newInstance(
            VariantContext.class,
            new VCFRecordCodec(outHeader),
            outHeader.getVCFRecordComparator(),
            MAX_RECORDS_IN_RAM,
            TMP_DIR);

    ProgressLogger progress = new ProgressLogger(logger, 1000000, "read");

    for (final VariantContext ctx : in) {
      ++total;
      final Interval source =
          new Interval(
              ctx.getContig(),
              ctx.getStart(),
              ctx.getEnd(),
              false,
              ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd());
      final Interval target = liftOver.liftOver(source, 1.0);

      if (target == null) {
        rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER).make());
        failedLiftover++;
      } else {
        // Fix the alleles if we went from positive to negative strand
        final List<Allele> alleles = new ArrayList<>();
        for (final Allele oldAllele : ctx.getAlleles()) {
          if (target.isPositiveStrand() || oldAllele.isSymbolic()) {
            alleles.add(oldAllele);
          } else {
            alleles.add(
                Allele.create(
                    SequenceUtil.reverseComplement(oldAllele.getBaseString()),
                    oldAllele.isReference()));
          }
        }

        // Build the new variant context
        final VariantContextBuilder builder =
            new VariantContextBuilder(
                ctx.getSource(), target.getContig(), target.getStart(), target.getEnd(), alleles);

        builder.id(ctx.getID());
        builder.attributes(ctx.getAttributes());
        builder.genotypes(ctx.getGenotypes());
        builder.filters(ctx.getFilters());
        builder.log10PError(ctx.getLog10PError());

        // Check that the reference allele still agrees with the reference sequence
        boolean mismatchesReference = false;
        for (final Allele allele : builder.getAlleles()) {
          if (allele.isReference()) {
            final byte[] ref = refSeqs.get(target.getContig());
            final String refString =
                StringUtil.bytesToString(ref, target.getStart() - 1, target.length());

            if (!refString.equalsIgnoreCase(allele.getBaseString())) {
              mismatchesReference = true;
            }

            break;
          }
        }

        if (mismatchesReference) {
          rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make());
          failedAlleleCheck++;
        } else {
          sorter.add(builder.make());
        }
      }

      progress.record(ctx.getContig(), ctx.getStart());
    }

    final NumberFormat pfmt = new DecimalFormat("0.0000%");
    final String pct = pfmt.format((failedLiftover + failedAlleleCheck) / (double) total);
    logger.info("Processed ", total, " variants.");
    logger.info(Long.toString(failedLiftover), " variants failed to liftover.");
    logger.info(
        Long.toString(failedAlleleCheck),
        " variants lifted over but had mismatching reference alleles after lift over.");
    logger.info(pct, " of variants were not successfully lifted over and written to the output.");

    rejects.close();
    in.close();

    ////////////////////////////////////////////////////////////////////////
    // Write the sorted outputs to the final output file
    ////////////////////////////////////////////////////////////////////////
    sorter.doneAdding();
    progress = new ProgressLogger(logger, 1000000, "written");
    logger.info("Writing out sorted records to final VCF.");

    for (final VariantContext ctx : sorter) {
      out.add(ctx);
      progress.record(ctx.getContig(), ctx.getStart());
    }
    out.close();
    sorter.cleanup();

    return null;
  }
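For reference, a typical invocation of this tool passes the files asserted at the top of doWork; the tool and argument names below match those fields, but the exact command-line syntax depends on the Picard version, so treat this as illustrative:

    java -jar picard.jar LiftoverVcf \
        INPUT=input.vcf \
        OUTPUT=lifted.vcf \
        CHAIN=hg19ToHg38.over.chain \
        REJECT=rejected_variants.vcf \
        REFERENCE_SEQUENCE=hg38.fasta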
/**
 * Analyze coverage distribution and validate read mates per interval and per sample
 *
 * <p>This tool is useful for diagnosing regions with bad coverage, mapping, or read mate pairs. It
 * analyzes each sample independently and aggregates results over intervals of interest.
 * Low-coverage regions can be identified by using e.g. FindCoveredIntervals with the -uncovered
 * argument.
 *
 * <h3>Input</h3>
 *
 * <ul>
 *   <li>A reference file
 *   <li>One or more input BAM files
 *   <li>One or more intervals
 * </ul>
 *
 * <h3>Output</h3>
 *
 * <p>A VCF detailing each interval by sample, with information for each interval according to the
 * thresholds used. Interval information includes GC content, average interval depth, and callable
 * status, among others. If you use the --missing option, you can get, as a second output, an
 * intervals file with the loci that have missing data. This file can then be used as input to
 * QualifyMissingIntervals for full qualification and interpretation of why the data is missing.
 *
 * <h3>Usage example</h3>
 *
 * <pre>
 *    java -jar GenomeAnalysisTK.jar \
 *              -T DiagnoseTargets \
 *              -R reference.fasta \
 *              -I sample1.bam \
 *              -I sample2.bam \
 *              -I sample3.bam \
 *              -L intervals.interval_list \
 *              -o output.vcf
 *  </pre>
 *
 * @author Mauricio Carneiro, Roger Zurawicki
 * @since 5/8/12
 */
@DocumentedGATKFeature(
    groupName = HelpConstants.DOCS_CAT_QC,
    extraDocs = {CommandLineGATK.class})
@By(value = DataSource.READS)
@PartitionBy(PartitionType.INTERVAL)
@Downsample(by = DownsampleType.NONE)
public class DiagnoseTargets extends LocusWalker<Long, Long> {

  @Output(doc = "File to which interval statistics should be written")
  private VariantContextWriter vcfWriter = null;

  @ArgumentCollection private ThresHolder thresholds = new ThresHolder();

  private Map<GenomeLoc, IntervalStratification> intervalMap =
      null; // maps each interval => statistics
  private PeekableIterator<GenomeLoc>
      intervalListIterator; // an iterator to go over all the intervals provided as we traverse the
                            // genome
  private Set<String> samples = null; // all the samples being processed
  private static final Allele SYMBOLIC_ALLELE =
      Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
  private static final Allele UNCOVERED_ALLELE =
      Allele.create(
          "A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times
  private static final int INITIAL_HASH_SIZE =
      50; // enough room for potential overlapping intervals plus recently finished intervals

  @Override
  public void initialize() {
    super.initialize();

    if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty())
      throw new UserException(
          "This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead.");

    intervalMap = new LinkedHashMap<>(INITIAL_HASH_SIZE);
    intervalListIterator = new PeekableIterator<>(getToolkit().getIntervals().iterator());

    // get all of the unique sample names for the VCF Header
    samples = ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
    vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples));

    // preload all the statistics classes because reflective plugin loading on the JVM is costly
    // and we only want to do it once.
    loadAllPlugins(thresholds);
  }

  @Override
  public Long map(
      final RefMetaDataTracker tracker,
      final ReferenceContext ref,
      final AlignmentContext context) {
    GenomeLoc refLocus = ref.getLocus();

    // process and remove any intervals in the map that no longer overlap the current locus,
    // and add all new intervals that may overlap this reference locus
    addNewOverlappingIntervals(refLocus);
    outputFinishedIntervals(refLocus, ref.getBase());

    // at this point, all intervals in intervalMap overlap with this locus, so update all of them
    for (IntervalStratification intervalStratification : intervalMap.values())
      intervalStratification.addLocus(context, ref);

    return 1L;
  }

  @Override
  public Long reduceInit() {
    return 0L;
  }

  /**
   * Sums the per-locus map results into the running total
   *
   * @param value result of the map.
   * @param sum accumulator for the reduce.
   * @return the running count of processed loci
   */
  @Override
  public Long reduce(Long value, Long sum) {
    return sum + value;
  }

  /**
   * Process all remaining intervals
   *
   * @param result number of loci processed by the walker
   */
  @Override
  public void onTraversalDone(final Long result) {
    for (GenomeLoc interval : intervalMap.keySet())
      outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE);

    GenomeLoc interval = intervalListIterator.peek();
    while (interval != null) {
      outputStatsToVCF(createIntervalStatistic(interval), UNCOVERED_ALLELE);
      intervalListIterator.next();
      interval = intervalListIterator.peek();
    }

    if (thresholds.missingTargets != null) {
      thresholds.missingTargets.close();
    }
  }

  /**
   * Outputs all intervals that are behind the current reference locus
   *
   * @param refLocus the current reference locus
   * @param refBase the reference allele
   */
  private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
    // output any intervals that were finished
    final List<GenomeLoc> toRemove = new LinkedList<>();
    for (GenomeLoc key : intervalMap.keySet()) {
      if (key.isBefore(refLocus)) {
        final IntervalStratification intervalStats = intervalMap.get(key);
        outputStatsToVCF(intervalStats, Allele.create(refBase, true));
        if (hasMissingLoci(intervalStats)) {
          outputMissingInterval(intervalStats);
        }
        toRemove.add(key);
      }
    }
    for (GenomeLoc key : toRemove) {
      intervalMap.remove(key);
    }
  }

  /**
   * Adds all intervals that overlap the current reference locus to the intervalMap
   *
   * @param refLocus the current reference locus
   */
  private void addNewOverlappingIntervals(final GenomeLoc refLocus) {
    GenomeLoc interval = intervalListIterator.peek();
    while (interval != null && !interval.isPast(refLocus)) {
      intervalMap.put(interval, createIntervalStatistic(interval));
      intervalListIterator.next();
      interval = intervalListIterator.peek();
    }
  }

  /**
   * Prints the given interval's statistics to the VCF
   *
   * @param stats The statistics of the interval
   * @param refAllele the reference allele
   */
  private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) {
    GenomeLoc interval = stats.getInterval();

    final List<Allele> alleles = new ArrayList<>();
    final Map<String, Object> attributes = new HashMap<>();
    final ArrayList<Genotype> genotypes = new ArrayList<>();

    for (String sample : samples) {
      final GenotypeBuilder gb = new GenotypeBuilder(sample);

      SampleStratification sampleStat = stats.getSampleStatistics(sample);
      gb.attribute(
          GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY,
          sampleStat.averageCoverage(interval.size()));
      gb.attribute(GATKVCFConstants.LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci());
      gb.attribute(GATKVCFConstants.ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci());
      gb.filters(statusToStrings(sampleStat.callableStatuses(), false));

      genotypes.add(gb.make());
    }
    alleles.add(refAllele);
    alleles.add(SYMBOLIC_ALLELE);
    VariantContextBuilder vcb =
        new VariantContextBuilder(
            "DiagnoseTargets",
            interval.getContig(),
            interval.getStart(),
            interval.getStop(),
            alleles);

    vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR);
    vcb.filters(new LinkedHashSet<>(statusToStrings(stats.callableStatuses(), true)));

    attributes.put(VCFConstants.END_KEY, interval.getStop());
    attributes.put(GATKVCFConstants.AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size()));
    attributes.put(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY, stats.gcContent());

    vcb = vcb.attributes(attributes);
    vcb = vcb.genotypes(genotypes);

    vcfWriter.add(vcb.make());
  }

  private boolean hasMissingStatuses(AbstractStratification stats) {
    return !stats.callableStatuses().isEmpty();
  }

  private boolean hasMissingLoci(final IntervalStratification stats) {
    return thresholds.missingTargets != null && hasMissingStatuses(stats);
  }

  private void outputMissingInterval(final IntervalStratification stats) {
    final GenomeLoc interval = stats.getInterval();
    final boolean[] missing = new boolean[interval.size()];
    Arrays.fill(missing, true);
    for (AbstractStratification sample : stats.getElements()) {
      if (hasMissingStatuses(sample)) {
        int pos = 0;
        for (AbstractStratification locus : sample.getElements()) {
          if (locus.callableStatuses().isEmpty()) {
            missing[pos] = false;
          }
          pos++;
        }
      }
    }
    int start = -1;
    boolean insideMissing = false;
    for (int i = 0; i < missing.length; i++) {
      if (missing[i] && !insideMissing) {
        start = interval.getStart() + i;
        insideMissing = true;
      } else if (!missing[i] && insideMissing) {
        final int stop = interval.getStart() + i - 1;
        outputMissingInterval(interval.getContig(), start, stop);
        insideMissing = false;
      }
    }
    if (insideMissing) {
      outputMissingInterval(interval.getContig(), start, interval.getStop());
    }
  }

  private void outputMissingInterval(final String contig, final int start, final int stop) {
    final PrintStream out = thresholds.missingTargets;
    out.println(String.format("%s:%d-%d", contig, start, stop));
  }

  /**
   * Converts a set of statuses into strings
   *
   * @param statuses the set of statuses to be converted
   * @param isInfoField if true (INFO field), PASS statuses are included; otherwise they are dropped
   * @return a matching set of strings
   */
  private List<String> statusToStrings(
      Iterable<CallableStatus> statuses, final boolean isInfoField) {
    List<String> output = new LinkedList<>();

    for (CallableStatus status : statuses)
      if (isInfoField || status != CallableStatus.PASS) output.add(status.name());

    return output;
  }

  private IntervalStratification createIntervalStatistic(GenomeLoc interval) {
    return new IntervalStratification(samples, interval, thresholds);
  }

  protected static void loadAllPlugins(final ThresHolder thresholds) {
    for (Class<?> stat : new PluginManager<LocusMetric>(LocusMetric.class).getPlugins()) {
      try {
        final LocusMetric stats = (LocusMetric) stat.newInstance();
        stats.initialize(thresholds);
        thresholds.locusMetricList.add(stats);
      } catch (Exception e) {
        throw new DynamicClassResolutionException(stat, e);
      }
    }

    for (Class<?> stat : new PluginManager<SampleMetric>(SampleMetric.class).getPlugins()) {
      try {
        final SampleMetric stats = (SampleMetric) stat.newInstance();
        stats.initialize(thresholds);
        thresholds.sampleMetricList.add(stats);
      } catch (Exception e) {
        throw new DynamicClassResolutionException(stat, e);
      }
    }

    for (Class<?> stat : new PluginManager<IntervalMetric>(IntervalMetric.class).getPlugins()) {
      try {
        final IntervalMetric stats = (IntervalMetric) stat.newInstance();
        stats.initialize(thresholds);
        thresholds.intervalMetricList.add(stats);
      } catch (Exception e) {
        throw new DynamicClassResolutionException(stat, e);
      }
    }
  }

  /**
   * Gets the header lines for the VCF writer
   *
   * @return A set of VCF header lines
   */
  private static Set<VCFHeaderLine> getHeaderInfo() {
    Set<VCFHeaderLine> headerLines = new HashSet<>();

    // INFO fields for overall data
    headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
    headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AVG_INTERVAL_DP_KEY));
    headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY));
    headerLines.add(
        new VCFInfoHeaderLine(
            "Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));

    // FORMAT fields for each genotype
    headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
    headerLines.add(
        GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY));
    headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.LOW_COVERAGE_LOCI));
    headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.ZERO_COVERAGE_LOCI));

    // FILTER fields
    for (CallableStatus stat : CallableStatus.values())
      headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description));

    return headerLines;
  }
}
/**
 * Helper class for those unit-test classes that test on implementations of SampleList.
 *
 * @author Valentin Ruano-Rubio &lt;[email protected]&gt;
 */
public class AlleleListUnitTester {

  private static final Random rnd = Utils.getRandomGenerator();
  private static final RandomDNA rndDNA = new RandomDNA(rnd);

  /**
   * Test that the contents of an allele-list are the ones expected.
   *
   * <p>This method performs various consistency checks involving all the {@link
   * org.broadinstitute.gatk.utils.genotyper.AlleleList} interface methods. Therefore calling this
   * method is equivalent to a thorough check of the {@link
   * org.broadinstitute.gatk.utils.genotyper.AlleleList} aspect of the {@code actual} argument.
   *
   * @param actual the allele-list to assess.
   * @param expected the expected allele-list.
   * @throws IllegalArgumentException if {@code expected} is {@code null} or contains {@code null}s,
   *     which indicates a bug in the testing code.
   * @throws RuntimeException if a testing assertion fails, which indicates an actual bug in the
   *     code being tested.
   */
  public static <A extends Allele> void assertAlleleList(
      final AlleleList<A> actual, final List<A> expected) {
    if (expected == null) throw new IllegalArgumentException("the expected list cannot be null");
    final Set<A> expectedAlleleSet = new HashSet<>(expected.size());
    Assert.assertNotNull(actual);
    Assert.assertEquals(actual.alleleCount(), expected.size());
    for (int i = 0; i < expected.size(); i++) {
      final A expectedAllele = expected.get(i);
      if (expectedAllele == null)
        throw new IllegalArgumentException("the expected sample cannot be null");
      if (expectedAllele.equals(NEVER_USE_ALLELE))
        throw new IllegalArgumentException("you cannot use the forbidden sample name");
      if (expectedAlleleSet.contains(expected.get(i)))
        throw new IllegalArgumentException(
            "repeated allele in the expected list, this is a test bug");
      final A actualAllele = actual.alleleAt(i);
      Assert.assertNotNull(actualAllele, "allele cannot be null");
      Assert.assertFalse(
          expectedAlleleSet.contains(actualAllele), "repeated allele: " + actualAllele);
      Assert.assertEquals(actualAllele, expectedAllele, "wrong allele order; index = " + i);
      Assert.assertEquals(actual.alleleIndex(actualAllele), i, "allele index mismatch");
      expectedAlleleSet.add(actualAllele);
    }

    Assert.assertEquals(actual.alleleIndex((A) NEVER_USE_ALLELE), -1);
  }

  /** Safe to assume that this allele will never be used. */
  private static final Allele NEVER_USE_ALLELE =
      Allele.create("ACTGACTGACTGACTGACTGACTGACTGACTGGTCAGTCAGTCAGTCAGTCAGTCA".getBytes(), false);

  /**
   * Generate testing alleles.
   *
   * <p>All alleles are random, subject to the given maximum allele length.
   *
   * <p>With a low max-allele-length and a high allele-count you can force repeats.
   *
   * @param alleleCount number of alleles to generate.
   * @param maxAlleleLength the maximum length of the allele in bases.
   * @throws RuntimeException if {@code alleleCount} is negative or {@code maxAlleleLength} is less
   *     than 1.
   * @return never {@code null}.
   */
  public static Allele[] generateRandomAlleles(final int alleleCount, final int maxAlleleLength) {
    if (maxAlleleLength < 1)
      throw new IllegalArgumentException("the max allele length cannot be less than 1");
    final Allele[] result = new Allele[alleleCount];
    for (int i = 0; i < alleleCount; i++) {
      final int alleleLength = rnd.nextInt(maxAlleleLength) + 1;
      result[i] = Allele.create(rndDNA.nextBases(alleleLength));
    }
    return result;
  }

  /**
   * Generate testing alleles.
   *
   * <p>All alleles are random, subject to the given maximum allele length.
   *
   * <p>With a low max-allele-length and a high allele-count you can force repeats.
   *
   * @param alleleCount number of alleles to generate.
   * @param maxAlleleLength the maximum length of the allele in bases.
   * @param skipIfRepeats throw a test-skip exception ({@link SkipException}) if the resulting
   *     allele-list has repeats, i.e., its size is less than {@code alleleCount}
   * @throws RuntimeException if {@code alleleCount} is negative or {@code maxAlleleLength} is less
   *     than 1.
   * @return never {@code null}.
   */
  static AlleleList<Allele> alleleList(
      final int alleleCount, final int maxAlleleLength, final boolean skipIfRepeats) {
    final Allele[] alleles =
        AlleleListUnitTester.generateRandomAlleles(alleleCount, maxAlleleLength);
    if (alleleCount > 0) alleles[0] = Allele.create(alleles[0].getBases(), true);
    final AlleleList<Allele> alleleList = new IndexedAlleleList<>(alleles);
    if (skipIfRepeats && alleleList.alleleCount() != alleles.length)
      throw new SkipException("repeated alleles, should be infrequent");
    return alleleList;
  }
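
  // Hedged usage sketch, not part of the original class (assumes TestNG's @Test
  // and java.util.ArrayList are imported): generate a repeat-free random allele
  // list and verify it against its own contents with the assertion helper above.
  @Test
  public void exampleAlleleListRoundTrip() {
    final AlleleList<Allele> subject = alleleList(10, 5, true);
    final List<Allele> expected = new ArrayList<>(subject.alleleCount());
    for (int i = 0; i < subject.alleleCount(); i++) {
      expected.add(subject.alleleAt(i));
    }
    assertAlleleList(subject, expected);
  }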
}
Example #15
  @Override
  protected void doWork(VcfIterator r, VariantContextWriter w) throws IOException {
    long nChanged = 0L;
    final String TAG = "INDELFIXED";
    VCFHeader header = r.getHeader();

    VCFHeader h2 = new VCFHeader(header.getMetaDataInInputOrder(), header.getSampleNamesInOrder());
    h2.addMetaDataLine(
        new VCFInfoHeaderLine(TAG, 1, VCFHeaderLineType.String, "Fix Indels for @SolenaLS."));

    w.writeHeader(h2);

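    // Only rewrite records with exactly one ALT whose REF and ALT are plain DNA
    // strings; anything else falls through to w.add(ctx) below unchanged.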
    final Pattern dna = Pattern.compile("[ATGCatgc]+");
    while (r.hasNext()) {
      VariantContext ctx = r.next();
      VariantContextBuilder b = new VariantContextBuilder(ctx);
      List<Allele> alleles = ctx.getAlternateAlleles();
      if (alleles.size() != 1
          || !dna.matcher(ctx.getReference().getBaseString()).matches()
          || !dna.matcher(alleles.get(0).getBaseString()).matches()) {
        w.add(ctx);
        continue;
      }
      StringBuffer ref = new StringBuffer(ctx.getReference().getBaseString().toUpperCase());
      StringBuffer alt = new StringBuffer(alleles.get(0).getBaseString().toUpperCase());
      int start = ctx.getStart();
      int end = ctx.getEnd();

      boolean changed = false;

      /*** we trim on the right side ***/
      // REF=TGCTGCGGGGGCCGCTGCGGGGG 	ALT=TGCTGCGGGGG
      while (alt.length() > 1
          && alt.length() < ref.length()
          && ref.charAt(ref.length() - 1) == alt.charAt(alt.length() - 1)) {
        changed = true;
        ref.setLength(ref.length() - 1);
        alt.deleteCharAt(alt.length() - 1);
        end--;
      }

      // REF=TGCTGCGGGGG 	ALT= TGCTGCGGGGGCCGCTGCGGGGG
      while (ref.length() > 1
          && alt.length() > ref.length()
          && ref.charAt(ref.length() - 1) == alt.charAt(alt.length() - 1)) {
        changed = true;
        ref.setLength(ref.length() - 1);
        alt.deleteCharAt(alt.length() - 1);
        end--;
      }

      /*** we trim on the left side ***/

      // REF=TGCTGCGGGGGCCGCTGCGGGGG 	ALT=TGCTGCGGGGG
      while (alt.length() > 1 && alt.length() < ref.length() && ref.charAt(0) == alt.charAt(0)) {
        changed = true;
        ref.deleteCharAt(0);
        alt.deleteCharAt(0);
        start++;
      }

      // REF=TGCTGCGGGGG 	ALT= TGCTGCGGGGGCCGCTGCGGGGG
      while (ref.length() > 1 && alt.length() > ref.length() && ref.charAt(0) == alt.charAt(0)) {
        changed = true;
        ref.deleteCharAt(0);
        alt.deleteCharAt(0);
        start++;
      }
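
      // Worked example (illustrative): REF=ACCA, ALT=AA at start=100, end=103
      // share a trailing 'A'; the right trim fires once, giving REF=ACC, ALT=A,
      // end=102, and the remaining loops stop because ALT is a single base.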

      if (!changed) {
        w.add(ctx);
        continue;
      }

      /*
      LOG.info("ctx.getStart() " + ctx.getStart());
      LOG.info("ctx.getEnd() " + ctx.getEnd());
      LOG.info("start " + start);
      LOG.info("end " + end);
      LOG.info("ref " + ref.toString());
      LOG.info("alt " + alt.toString());
      */

      Allele newRef = Allele.create(ref.toString(), true);
      Allele newAlt = Allele.create(alt.toString(), false);

      final Allele[] newAlleles = new Allele[] {newRef, newAlt};

      b.attribute(
          TAG,
          ctx.getReference().getBaseString()
              + "|"
              + alleles.get(0).getBaseString()
              + "|"
              + ctx.getStart());
      b.start(start);
      b.stop(end);
      b.alleles(Arrays.asList(newAlleles));

      nChanged++;

      VariantContext ctx2 = b.make();
      try {
        w.add(ctx2);
      } catch (TribbleException err) {
        error(err, "Cannot convert new context:" + ctx2 + " old context:" + ctx);
        w.add(ctx);
      }
    }

    info("indels changed:" + nChanged);
  }