Example #1
  public Map<String, Object> annotate(
      RefMetaDataTracker tracker,
      AnnotatorCompatibleWalker walker,
      ReferenceContext ref,
      Map<String, AlignmentContext> stratifiedContexts,
      VariantContext vc) {
    if (stratifiedContexts.size() == 0) return null;

    double mq0 = 0;
    double mq10 = 0;
    double total = 0;
    for (Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet()) {
      if (!sample.getValue().hasBasePileup()) continue;

      for (PileupElement p : sample.getValue().getBasePileup()) {
        if (p.getMappingQual() == 0) {
          mq0 += 1;
        }
        if (p.getMappingQual() <= 10) {
          mq10 += 1;
        }
        total += 1;
      }
    }
    Map<String, Object> map = new HashMap<String, Object>();
    map.put(
        getKeyNames().get(0), String.format("%.04f,%.04f,%.00f", mq0 / total, mq10 / total, total));
    return map;
  }
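The value written above is a single comma-separated string of the MQ0 fraction, the MQ<=10 fraction, and the total depth. A small, self-contained sketch of that formatting with hypothetical counts (3 MQ0 reads and 10 reads at MQ <= 10 out of 100 total); only the format string comes from the method above, the counts are made up:

public class Mq0FractionFormatSketch {
  public static void main(String[] args) {
    // hypothetical tallies standing in for the pileup counts computed above
    double mq0 = 3, mq10 = 10, total = 100;
    String value = String.format("%.04f,%.04f,%.00f", mq0 / total, mq10 / total, total);
    System.out.println(value); // prints "0.0300,0.1000,100"
  }
}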
  // Overrides the version in GenotypeLikelihoodsCalculationModel so that, in the indel case, a
  // deletion is counted as part of the pileup and the per-sample DP therefore includes deletions
  // covering the event.
  protected int getFilteredDepth(ReadBackedPileup pileup) {
    int count = 0;
    for (PileupElement p : pileup) {
      if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase())) count++;
    }

    return count;
  }
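A minimal sketch of the counting rule above on plain data, with no GATK classes: deletions and regular bases both count toward the filtered depth, while other bases (such as N) do not. The bases and the '-' deletion marker are hypothetical stand-ins for PileupElement state:

public class FilteredDepthSketch {
  public static void main(String[] args) {
    // hypothetical per-element view of a pileup: a base, or '-' marking a deletion
    char[] elements = {'A', 'C', '-', 'G', 'N', 'T', '-'};
    int count = 0;
    for (char b : elements) {
      boolean isDeletion = (b == '-');
      boolean isRegularBase = (b == 'A' || b == 'C' || b == 'G' || b == 'T'); // mirrors BaseUtils.isRegularBase for plain chars
      if (isDeletion || isRegularBase) count++;
    }
    System.out.println(count); // 6: the 'N' is excluded, the two deletions are included
  }
}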
  @Test(dataProvider = "LIBSTest")
  public void testLIBS(LIBSTest params) {
    final int locus = 44367788;

    SAMRecord read =
        ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, params.readLength);
    read.setReadBases(Utils.dupBytes((byte) 'A', params.readLength));
    read.setBaseQualities(Utils.dupBytes((byte) '@', params.readLength));
    read.setCigarString(params.cigar);

    // create the iterator by state with the fake reads and fake records
    li = makeLTBS(Arrays.asList(read), createTestReadProperties());
    final LIBS_position tester = new LIBS_position(read);

    while (li.hasNext()) {
      AlignmentContext alignmentContext = li.next();
      ReadBackedPileup p = alignmentContext.getBasePileup();
      Assert.assertTrue(p.getNumberOfElements() == 1);
      PileupElement pe = p.iterator().next();

      tester.stepForwardOnGenome();

      Assert.assertEquals(pe.isBeforeDeletedBase(), tester.isBeforeDeletedBase);
      Assert.assertEquals(pe.isBeforeDeletionStart(), tester.isBeforeDeletionStart);
      Assert.assertEquals(pe.isAfterDeletedBase(), tester.isAfterDeletedBase);
      Assert.assertEquals(pe.isAfterDeletionEnd(), tester.isAfterDeletionEnd);
      Assert.assertEquals(pe.isBeforeInsertion(), tester.isBeforeInsertion);
      Assert.assertEquals(pe.isAfterInsertion(), tester.isAfterInsertion);
      Assert.assertEquals(pe.isNextToSoftClip(), tester.isNextToSoftClip);
      Assert.assertEquals(pe.getOffset(), tester.getCurrentReadOffset());
    }
  }
Example #4
  private Haplotype getHaplotypeFromRead(
      final PileupElement p, final int contextSize, final int locus) {
    final GATKSAMRecord read = p.getRead();
    int readOffsetFromPileup = p.getOffset();

    final byte[] haplotypeBases = new byte[contextSize];
    Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD);
    final double[] baseQualities = new double[contextSize];
    Arrays.fill(baseQualities, 0.0);

    byte[] readBases = read.getReadBases();
    readBases =
        AlignmentUtils.readToAlignmentByteArray(
            read.getCigar(), readBases); // Adjust the read bases based on the Cigar string
    byte[] readQuals = read.getBaseQualities();
    readQuals =
        AlignmentUtils.readToAlignmentByteArray(
            read.getCigar(),
            readQuals); // Shift the location of the qual scores based on the Cigar string

    readOffsetFromPileup =
        AlignmentUtils.calcAlignmentByteArrayOffset(
            read.getCigar(), p, read.getAlignmentStart(), locus);
    final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

    for (int i = 0; i < contextSize; i++) {
      final int baseOffset = i + baseOffsetStart;
      if (baseOffset < 0) {
        continue;
      }
      if (baseOffset >= readBases.length) {
        break;
      }
      if (readQuals[baseOffset] == PileupElement.DELETION_BASE) {
        readQuals[baseOffset] = PileupElement.DELETION_QUAL;
      }
      // N's shouldn't be treated as distinct bases
      if (!BaseUtils.isRegularBase(readBases[baseOffset])) {
        readBases[baseOffset] = (byte) REGEXP_WILDCARD;
        readQuals[baseOffset] = (byte) 0;
      }
      readQuals[baseOffset] = (byte) Math.min((int) readQuals[baseOffset], p.getMappingQual());
      // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them
      if (((int) readQuals[baseOffset]) < 5) {
        readQuals[baseOffset] = (byte) 0;
      }
      haplotypeBases[i] = readBases[baseOffset];
      baseQualities[i] = (double) readQuals[baseOffset];
    }

    return new Haplotype(haplotypeBases, baseQualities);
  }
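The window arithmetic above centers a contextSize-wide haplotype on the pileup offset: baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2, with positions that fall off either end of the read left as wildcards. A sketch of just that index arithmetic, with hypothetical numbers (contextSize 10, offset 3, read length 12):

public class HaplotypeWindowSketch {
  public static void main(String[] args) {
    final int contextSize = 10, readOffsetFromPileup = 3, readLength = 12;
    final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2; // 3 - 4 = -1
    for (int i = 0; i < contextSize; i++) {
      final int baseOffset = i + baseOffsetStart;
      if (baseOffset < 0) continue; // haplotype position 0 stays a wildcard, as in the method above
      if (baseOffset >= readLength) break; // would stop if the window ran off the end of the read
      System.out.println("haplotype[" + i + "] <- read alignment offset " + baseOffset);
    }
  }
}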
  @Test
  public void testIndelsInRegularPileup() {
    final byte[] bases = new byte[] {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'};
    final byte[] indelBases =
        new byte[] {'A', 'A', 'A', 'A', 'C', 'T', 'A', 'A', 'A', 'A', 'A', 'A'};

    // create a test version of the Reads object
    ReadProperties readAttributes = createTestReadProperties();

    SAMRecord before = ArtificialSAMUtils.createArtificialRead(header, "before", 0, 1, 10);
    before.setReadBases(bases);
    before.setBaseQualities(new byte[] {20, 20, 20, 20, 20, 20, 20, 20, 20, 20});
    before.setCigarString("10M");

    SAMRecord during = ArtificialSAMUtils.createArtificialRead(header, "during", 0, 2, 10);
    during.setReadBases(indelBases);
    during.setBaseQualities(new byte[] {20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20});
    during.setCigarString("4M2I6M");

    SAMRecord after = ArtificialSAMUtils.createArtificialRead(header, "after", 0, 3, 10);
    after.setReadBases(bases);
    after.setBaseQualities(new byte[] {20, 20, 20, 20, 20, 20, 20, 20, 20, 20});
    after.setCigarString("10M");

    List<SAMRecord> reads = Arrays.asList(before, during, after);

    // create the iterator by state with the fake reads and fake records
    li = makeLTBS(reads, readAttributes);

    boolean foundIndel = false;
    while (li.hasNext()) {
      AlignmentContext context = li.next();
      ReadBackedPileup pileup = context.getBasePileup().getBaseFilteredPileup(10);
      for (PileupElement p : pileup) {
        if (p.isBeforeInsertion()) {
          foundIndel = true;
          Assert.assertEquals(p.getEventLength(), 2, "Wrong event length");
          Assert.assertEquals(p.getEventBases(), "CT", "Inserted bases are incorrect");
          break;
        }
      }
    }

    Assert.assertTrue(foundIndel, "Indel in pileup not found");
  }
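To see why the assertions above expect an event of length 2 with bases "CT", consider the test's own data: with cigar 4M2I6M, the insertion occupies read offsets 4 and 5 of indelBases. A sketch of that slice, using only the array defined in the test:

import java.util.Arrays;

public class InsertionBasesSketch {
  public static void main(String[] args) {
    final byte[] indelBases =
        new byte[] {'A', 'A', 'A', 'A', 'C', 'T', 'A', 'A', 'A', 'A', 'A', 'A'};
    // cigar 4M2I6M: 4 matched bases, a 2-base insertion at read offsets 4-5, then 6 more matches
    final int insertionStart = 4, insertionLength = 2;
    String eventBases =
        new String(Arrays.copyOfRange(indelBases, insertionStart, insertionStart + insertionLength));
    System.out.println(eventBases); // CT
  }
}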
  private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(
      UnifiedArgumentCollection UAC,
      ReferenceContext refContext,
      AlignmentContext rawContext,
      final GenotypeLikelihoodsCalculationModel.Model model) {

    if (!BaseUtils.isRegularBase(refContext.getBase())) return null;

    Map<String, AlignmentContext> stratifiedContexts = null;

    if (model.name().contains("INDEL")) {

      final ReadBackedPileup pileup =
          rawContext.getBasePileup().getMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE);
      // don't call when there is no coverage
      if (pileup.getNumberOfElements() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES)
        return null;

      // stratify the AlignmentContext and cut by sample
      stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);

    } else if (model.name().contains("SNP")) {

      // stratify the AlignmentContext and cut by sample
      stratifiedContexts =
          AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup());

      if (!(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES
          && UAC.GenotypingMode
              != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES)) {
        int numDeletions = 0;
        for (final PileupElement p : rawContext.getBasePileup()) {
          if (p.isDeletion()) numDeletions++;
        }
        if (((double) numDeletions) / ((double) rawContext.getBasePileup().getNumberOfElements())
            > UAC.MAX_DELETION_FRACTION) {
          return null;
        }
      }
    }

    return stratifiedContexts;
  }
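In the SNP branch above, a site is skipped when the fraction of deleted elements in the raw pileup exceeds UAC.MAX_DELETION_FRACTION. A sketch of that check with hypothetical numbers (4 deletions out of 30 elements, and an assumed threshold of 0.05; the real threshold comes from the argument collection):

public class DeletionFractionSketch {
  public static void main(String[] args) {
    final int numDeletions = 4, pileupSize = 30;
    final double maxDeletionFraction = 0.05; // assumed value for illustration
    boolean skipSite = ((double) numDeletions) / ((double) pileupSize) > maxDeletionFraction;
    System.out.println(skipSite); // true: 0.133 > 0.05, so the SNP branch would return null
  }
}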
Example #7
  private static String createVerboseOutput(final ReadBackedPileup pileup) {
    final StringBuilder sb = new StringBuilder();
    boolean isFirst = true;

    sb.append(pileup.getNumberOfDeletions());
    sb.append(" ");

    for (PileupElement p : pileup) {
      if (isFirst) isFirst = false;
      else sb.append(",");
      sb.append(p.getRead().getReadName());
      sb.append(verboseDelimiter);
      sb.append(p.getOffset());
      sb.append(verboseDelimiter);
      sb.append(p.getRead().getReadLength());
      sb.append(verboseDelimiter);
      sb.append(p.getRead().getMappingQuality());
    }
    return sb.toString();
  }
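The string built above has the shape "<numDeletions> name<delim>offset<delim>readLength<delim>mapq,...". A hypothetical rendering with two made-up reads, assuming verboseDelimiter is "|" (the real field is defined elsewhere and not shown in this snippet):

public class VerboseOutputSketch {
  public static void main(String[] args) {
    final String verboseDelimiter = "|"; // assumed value
    StringBuilder sb = new StringBuilder();
    sb.append(1).append(" "); // one deletion in this hypothetical pileup
    sb.append("readA").append(verboseDelimiter).append(17).append(verboseDelimiter)
        .append(76).append(verboseDelimiter).append(60);
    sb.append(",");
    sb.append("readB").append(verboseDelimiter).append(3).append(verboseDelimiter)
        .append(76).append(verboseDelimiter).append(29);
    System.out.println(sb); // 1 readA|17|76|60,readB|3|76|29
  }
}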
  private Map<String, Object> annotateSNP(AlignmentContext stratifiedContext, VariantContext vc) {

    if (!stratifiedContext.hasBasePileup()) return null;

    HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>();
    for (Allele allele : vc.getAlternateAlleles()) alleleCounts.put(allele.getBases()[0], 0);

    ReadBackedPileup pileup = stratifiedContext.getBasePileup();
    int totalDepth = pileup.size();

    Map<String, Object> map = new HashMap<String, Object>();
    map.put(getKeyNames().get(0), totalDepth); // put total depth in right away

    if (totalDepth == 0) return map; // done, can not compute FA at 0 coverage!!

    int mq0 = 0; // number of "ref" reads that are actually mq0
    for (PileupElement p : pileup) {
      if (p.getMappingQual() == 0) {
        mq0++;
        continue;
      }
      if (alleleCounts.containsKey(p.getBase())) { // non-mq0 read and it's an alt
        alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase()) + 1);
      }
    }

    if (mq0 == totalDepth) return map; // if all reads are mq0, there is nothing left to do

    // we need to add counts in the correct order
    String[] fracs = new String[alleleCounts.size()];
    for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
      fracs[i] =
          String.format(
              "%.3f",
              ((float) alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]))
                  / (totalDepth - mq0));
    }

    map.put(getKeyNames().get(1), fracs);
    return map;
  }
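The fraction written above is altCount / (totalDepth - mq0), formatted to three decimals. A worked example with hypothetical numbers (depth 40, 5 MQ0 reads, 14 non-MQ0 reads carrying the single alternate base), reusing only the format string from the method above:

public class AlleleFractionSketch {
  public static void main(String[] args) {
    final int totalDepth = 40, mq0 = 5, altCount = 14;
    String frac = String.format("%.3f", ((float) altCount) / (totalDepth - mq0));
    System.out.println(frac); // 0.400 = 14 / 35
  }
}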
  /**
   * Test to make sure that reads supporting only an indel (example cigar string: 76I) are
   * represented properly
   */
  @Test
  public void testWholeIndelReadRepresentedTest() {
    final int firstLocus = 44367788, secondLocus = firstLocus + 1;

    SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, secondLocus, 1);
    read1.setReadBases(Utils.dupBytes((byte) 'A', 1));
    read1.setBaseQualities(Utils.dupBytes((byte) '@', 1));
    read1.setCigarString("1I");

    List<SAMRecord> reads = Arrays.asList(read1);

    // create the iterator by state with the fake reads and fake records
    li = makeLTBS(reads, createTestReadProperties());

    while (li.hasNext()) {
      AlignmentContext alignmentContext = li.next();
      ReadBackedPileup p = alignmentContext.getBasePileup();
      Assert.assertTrue(p.getNumberOfElements() == 1);
      PileupElement pe = p.iterator().next();
      Assert.assertTrue(pe.isBeforeInsertion());
      Assert.assertFalse(pe.isAfterInsertion());
      Assert.assertEquals(pe.getEventBases(), "A");
    }

    SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "read2", 0, secondLocus, 10);
    read2.setReadBases(Utils.dupBytes((byte) 'A', 10));
    read2.setBaseQualities(Utils.dupBytes((byte) '@', 10));
    read2.setCigarString("10I");

    reads = Arrays.asList(read2);

    // create the iterator by state with the fake reads and fake records
    li = makeLTBS(reads, createTestReadProperties());

    while (li.hasNext()) {
      AlignmentContext alignmentContext = li.next();
      ReadBackedPileup p = alignmentContext.getBasePileup();
      Assert.assertTrue(p.getNumberOfElements() == 1);
      PileupElement pe = p.iterator().next();
      Assert.assertTrue(pe.isBeforeInsertion());
      Assert.assertFalse(pe.isAfterInsertion());
      Assert.assertEquals(pe.getEventBases(), "AAAAAAAAAA");
    }
  }
  @Override
  protected Double getElementForPileupElement(final PileupElement p) {
    return (double) p.getRead().getMappingQuality();
  }
  @Override
  public T traverse(
      final ActiveRegionWalker<M, T> walker, final LocusShardDataProvider dataProvider, T sum) {
    logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));

    final LocusView locusView = getLocusView(walker, dataProvider);
    final GenomeLocSortedSet initialIntervals = engine.getIntervals();

    final LocusReferenceView referenceView = new LocusReferenceView(walker, dataProvider);
    final int activeRegionExtension =
        walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
    final int maxRegionSize =
        walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();

    // trivial optimization to avoid unnecessary processing when there's nothing here at all
    if (locusView.hasNext()) {
      int minStart = Integer.MAX_VALUE;
      ActivityProfile profile =
          new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions());

      ReferenceOrderedView referenceOrderedDataView =
          getReferenceOrderedView(walker, dataProvider, locusView);

      // We keep processing while the next reference location is within the interval
      GenomeLoc prevLoc = null;
      while (locusView.hasNext()) {
        final AlignmentContext locus = locusView.next();
        GenomeLoc location = locus.getLocation();

        if (prevLoc != null) {
          // fill in the active / inactive labels from the stop of the previous location to the
          // start of this location
          // TODO refactor to separate function
          for (int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++) {
            final GenomeLoc fakeLoc =
                engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
            if (initialIntervals == null || initialIntervals.overlaps(fakeLoc)) {
              profile.add(
                  fakeLoc,
                  new ActivityProfileResult(
                      walker.hasPresetActiveRegions()
                              && walker.presetActiveRegions.overlaps(fakeLoc)
                          ? 1.0
                          : 0.0));
            }
          }
        }

        dataProvider.getShard().getReadMetrics().incrementNumIterations();

        // create reference context. Note that if we have a pileup of "extended events", the
        // context will hold the (longest) stretch of deleted reference bases (if deletions are
        // present in the pileup).
        final ReferenceContext refContext = referenceView.getReferenceContext(location);

        // Iterate forward to get all reference ordered data covering this location
        final RefMetaDataTracker tracker =
            referenceOrderedDataView.getReferenceOrderedDataAtLocus(
                locus.getLocation(), refContext);

        // Call the walker's isActive function for this locus and add the result to the list to be
        // integrated later
        if (initialIntervals == null || initialIntervals.overlaps(location)) {
          profile.add(location, walkerActiveProb(walker, tracker, refContext, locus, location));
        }

        // Grab all the previously unseen reads from this pileup and add them to the massive read
        // list
        for (final PileupElement p : locus.getBasePileup()) {
          final GATKSAMRecord read = p.getRead();
          if (!myReads.contains(read)) {
            myReads.add(read);
          }

          // If this is the last pileup for this shard, calculate the minimum alignment start so
          // that we know which active regions in the work queue are now safe to process
          minStart = Math.min(minStart, read.getAlignmentStart());
        }

        prevLoc = location;

        printProgress(locus.getLocation());
      }

      updateCumulativeMetrics(dataProvider.getShard());

      // Take the individual isActive calls, integrate them into contiguous active regions, and
      // add these blocks of work to the work queue:
      // band-pass filter the list of isActive probabilities and turn them into active regions
      final ActivityProfile bandPassFiltered = profile.bandPassFilter();
      final List<ActiveRegion> activeRegions =
          bandPassFiltered.createActiveRegions(activeRegionExtension, maxRegionSize);

      // add active regions to queue of regions to process
      // first check whether we can merge active regions over shard boundaries
      if (!activeRegions.isEmpty()) {
        if (!workQueue.isEmpty()) {
          final ActiveRegion last = workQueue.getLast();
          final ActiveRegion first = activeRegions.get(0);
          if (last.isActive == first.isActive
              && last.getLocation().contiguousP(first.getLocation())
              && last.getLocation().size() + first.getLocation().size() <= maxRegionSize) {
            workQueue.removeLast();
            activeRegions.remove(first);
            workQueue.add(
                new ActiveRegion(
                    last.getLocation().union(first.getLocation()),
                    first.isActive,
                    this.engine.getGenomeLocParser(),
                    activeRegionExtension));
          }
        }
        workQueue.addAll(activeRegions);
      }

      logger.debug(
          "Integrated "
              + profile.size()
              + " isActive calls into "
              + activeRegions.size()
              + " regions.");

      // now go and process all of the active regions
      sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
    }

    return sum;
  }
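The gap-filling loop near the top of traverse visits every position strictly between the stop of the previous emitted locus and the start of the current one, assigning each an activity of 1.0 or 0.0. A sketch of just that iteration with hypothetical coordinates (previous locus ends at 100, next starts at 105):

public class GapFillSketch {
  public static void main(String[] args) {
    final int prevStop = 100, nextStart = 105;
    for (int iii = prevStop + 1; iii < nextStart; iii++) {
      // in the traversal above, each of these positions gets a fake GenomeLoc whose activity is
      // 1.0 if a preset active region overlaps it and 0.0 otherwise
      System.out.println("fake locus at " + iii); // 101, 102, 103, 104
    }
  }
}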
Example #12
  private double scoreReadAgainstHaplotype(
      final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus) {
    double expected = 0.0;
    double mismatches = 0.0;

    // What's the expected mismatch rate under the model that this read is actually sampled from
    // this haplotype? Let's assume the consensus base c is a random choice of one of A, C, G, or T,
    // and that the observed base is actually from a c with an error rate e. Since e is the rate at
    // which we'd see a miscalled c, the expected mismatch rate is really e. So the expected number
    // of mismatches is just sum_i e_i for i from 1..n for n sites.
    //
    // Now, what's the probabilistic sum of mismatches? Suppose that the base b is equal to c.
    // Well, it could actually be a miscall in a matching direction, which would happen at an
    // e / 3 rate. If b != c, then the chance that it is actually a mismatch is 1 - e, since any of
    // the other 3 options would be a mismatch. So the probability-weighted mismatch rate is
    // sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ... n
    final byte[] haplotypeBases = haplotype.getBases();
    final GATKSAMRecord read = p.getRead();
    byte[] readBases = read.getReadBases();

    readBases =
        AlignmentUtils.readToAlignmentByteArray(
            p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string
    byte[] readQuals = read.getBaseQualities();
    readQuals =
        AlignmentUtils.readToAlignmentByteArray(
            p.getRead().getCigar(),
            readQuals); // Shift the location of the qual scores based on the Cigar string
    int readOffsetFromPileup = p.getOffset();
    readOffsetFromPileup =
        AlignmentUtils.calcAlignmentByteArrayOffset(
            p.getRead().getCigar(), p, read.getAlignmentStart(), locus);
    final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

    for (int i = 0; i < contextSize; i++) {
      final int baseOffset = i + baseOffsetStart;
      if (baseOffset < 0) {
        continue;
      }
      if (baseOffset >= readBases.length) {
        break;
      }

      final byte haplotypeBase = haplotypeBases[i];
      final byte readBase = readBases[baseOffset];

      final boolean matched =
          (readBase == haplotypeBase || haplotypeBase == (byte) REGEXP_WILDCARD);
      byte qual = readQuals[baseOffset];
      if (qual == PileupElement.DELETION_BASE) {
        qual = PileupElement.DELETION_QUAL;
      } // calcAlignmentByteArrayOffset fills the readQuals array with DELETION_BASE at deletions
      qual = (byte) Math.min((int) qual, p.getMappingQual());
      // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them
      if (((int) qual) >= 5) {
        final double e = QualityUtils.qualToErrorProb(qual);
        expected += e;
        mismatches += matched ? e : 1.0 - e / 3.0;
      }

      // a more sophisticated calculation would include the reference quality, but it's nice to
      // actually penalize the mismatching of poorly determined regions of the consensus
    }

    return mismatches - expected;
  }
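A worked numeric example of the two running sums above for a tiny window, using the standard Phred conversion e = 10^(-Q/10) as a stand-in for QualityUtils.qualToErrorProb, and the same per-base terms as the loop (matched contributes e to both sums, a mismatch contributes 1 - e/3 to the mismatch sum). The quals and match flags are hypothetical:

public class HaplotypeScoreTermSketch {
  public static void main(String[] args) {
    // hypothetical 3-base window: two matches at Q20 and one mismatch at Q10
    final int[] quals = {20, 20, 10};
    final boolean[] matched = {true, true, false};
    double expected = 0.0, mismatches = 0.0;
    for (int i = 0; i < quals.length; i++) {
      final double e = Math.pow(10.0, -quals[i] / 10.0); // Phred error probability
      expected += e;
      mismatches += matched[i] ? e : 1.0 - e / 3.0;
    }
    // expected = 0.01 + 0.01 + 0.1 = 0.12; mismatches = 0.01 + 0.01 + (1 - 0.1/3) ~= 0.9867
    System.out.println(mismatches - expected); // ~= 0.8667
  }
}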