Пример #1
   * Hard clips every soft-clipped base in the read.
   * @return a new read without the soft clipped bases
  // Hard clips the soft clipped bases found at both ends of the read.
  // The CIGAR is walked once: soft clips seen before the first element that is neither a soft clip
  // nor a hard clip are treated as the left tail, soft clips seen after it as the right tail.
  // Returns the read itself when it is empty, otherwise the result of clipRead with
  // HARDCLIP_BASES.
  private GATKSAMRecord hardClipSoftClippedBases() {
    if (read.isEmpty()) return read;

    // read coordinate at which the current CIGAR element starts; only advanced by elements that
    // consume read bases
    int readIndex = 0;
    int cutLeft = -1; // last position of the left tail to hard clip (inclusive), -1 if none
    int cutRight = -1; // first position of the right tail to hard clip (inclusive), -1 if none
    boolean rightTail =
        false; // trigger to stop clipping the left tail and start cutting the right tail

    for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
      if (cigarElement.getOperator() == CigarOperator.SOFT_CLIP) {
        if (rightTail) {
          // right tail: clip from the first soft clipped base to the end of the read
          cutRight = readIndex;
        } else {
          // left tail: clip from the start of the read through the last soft clipped base
          cutLeft = readIndex + cigarElement.getLength() - 1;
      // any element other than a soft or hard clip ends the left tail
      } else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP) rightTail = true;

      if (cigarElement.getOperator().consumesReadBases()) readIndex += cigarElement.getLength();

    // It is extremely important that we cut the end first otherwise the read coordinates change.
    if (cutRight >= 0) this.addOp(new ClippingOp(cutRight, read.getReadLength() - 1));
    if (cutLeft >= 0) this.addOp(new ClippingOp(0, cutLeft));

    return clipRead(ClippingRepresentation.HARDCLIP_BASES);
Пример #2
 // Writes the read group id of the read into comparable[0 .. read.getReadLength() - 1], so every
 // base position of the read carries the same value.
 // NOTE(review): assumes comparable has at least read.getReadLength() slots and that the read has
 // a read group attached -- confirm against callers.
 public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
   final String readGroupId = read.getReadGroup().getReadGroupId();
   for (int i = 0; i < read.getReadLength(); i++) {
     comparable[i] = readGroupId;
 // Test helper: builds a random read of length 10, records the covariate values for it into a
 // single-covariate ReadCovariates and checks the mismatch key set against the expected value.
 // NOTE(review): rg is not used in the visible lines; the statement that attaches the read group
 // to the read appears to be missing from this excerpt.
 private static void runTest(
     final GATKSAMReadGroupRecord rg, final String expected, final ReadGroupCovariate covariate) {
   GATKSAMRecord read = ReadUtils.createRandomRead(10);
   ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1);
   covariate.recordValues(read, readCovariates);
   verifyCovariateArray(readCovariates.getMismatchesKeySet(), expected, covariate);
Пример #4
 // Hard clips every read in the list to the region [refStart, refStop] by delegating to the
 // single-read overload, and returns the results in a new list sized for the input.
 public static List<GATKSAMRecord> hardClipToRegion(
     final List<GATKSAMRecord> reads, final int refStart, final int refStop) {
   final List<GATKSAMRecord> returnList = new ArrayList<GATKSAMRecord>(reads.size());
   for (final GATKSAMRecord read : reads) {
     final GATKSAMRecord clippedRead = hardClipToRegion(read, refStart, refStop);
     // NOTE(review): the body of this branch is missing from this excerpt; presumably non-empty
     // clipped reads are added to returnList -- confirm against the full source.
     if (!clippedRead.isEmpty()) {
   return returnList;
Пример #5
   * Hard clips a read using read coordinates.
   * @param start the first base to clip (inclusive)
   * @param stop the last base to clip (inclusive)
   * @return a new read, without the clipped bases
    "start >= 0 && stop <= read.getReadLength() - 1", // start and stop have to be within the read
    "start == 0 || stop == read.getReadLength() - 1"
  }) // cannot clip the middle of the read
  // Hard clips the read bases in [start, stop] (read coordinates, both inclusive).
  // Returns an empty read when the read is already empty or when the range covers the whole read;
  // otherwise queues one ClippingOp and applies it with HARDCLIP_BASES.
  private GATKSAMRecord hardClipByReadCoordinates(int start, int stop) {
    // clipping everything is short-circuited to an empty read instead of running the op
    if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1))
      return GATKSAMRecord.emptyRead(read);

    this.addOp(new ClippingOp(start, stop));
    return clipRead(ClippingRepresentation.HARDCLIP_BASES);
Пример #6
   * Hard clips any leading insertions in the read. Only looks at the beginning of the read, not the
   * end.
   * @return a new read without leading insertions
  // Hard clips insertions that appear at the start of the CIGAR, looking only past leading hard
  // and soft clips. Returns the read itself when it is empty.
  private GATKSAMRecord hardClipLeadingInsertions() {
    if (read.isEmpty()) return read;

    for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
      // stop at the first element that is not a hard clip, soft clip or insertion
      if (cigarElement.getOperator() != CigarOperator.HARD_CLIP
          && cigarElement.getOperator() != CigarOperator.SOFT_CLIP
          && cigarElement.getOperator() != CigarOperator.INSERTION) break;
      // NOTE(review): the op always starts at read index 0 and spans the insertion length, even
      // when a soft clip precedes the insertion -- confirm this is the intended range.
      else if (cigarElement.getOperator() == CigarOperator.INSERTION)
        this.addOp(new ClippingOp(0, cigarElement.getLength() - 1));
    return clipRead(ClippingRepresentation.HARDCLIP_BASES);
Пример #7
   * Hard clip the read to the variant region (from refStart to refStop)
   * @param read the read to be clipped
   * @param refStart the beginning of the variant region (inclusive)
   * @param refStop the end of the variant region (inclusive)
   * @return the read hard clipped to the variant region
  // Hard clips a single read so that only the part aligned inside [refStart, refStop] remains.
  // Returns an empty read when the alignment does not overlap the region, and the read itself
  // when it already lies entirely inside the region.
  public static GATKSAMRecord hardClipToRegion(
      final GATKSAMRecord read, final int refStart, final int refStop) {
    final int start = read.getAlignmentStart();
    final int stop = read.getAlignmentEnd();

    // check if the read overlaps the region at all
    if (start <= refStop && stop >= refStart) {
      // the clip boundaries passed on are the reference positions just outside the region
      if (start < refStart && stop > refStop)
        return hardClipBothEndsByReferenceCoordinates(read, refStart - 1, refStop + 1);
      else if (start < refStart) return hardClipByReferenceCoordinatesLeftTail(read, refStart - 1);
      else if (stop > refStop) return hardClipByReferenceCoordinatesRightTail(read, refStop + 1);
      // alignment is fully inside the region: nothing to clip
      return read;
    } else return GATKSAMRecord.emptyRead(read);
Пример #8
   * Hard clips both tails of a read. Left tail goes from the beginning to the 'left' coordinate
   * (inclusive) Right tail goes from the 'right' coordinate (inclusive) until the end of the read
   * @param left the coordinate of the last base to be clipped in the left tail (inclusive)
   * @param right the coordinate of the first base to be clipped in the right tail (inclusive)
   * @return a new read, without the clipped bases
    "left <= right", // tails cannot overlap
    "left >= read.getAlignmentStart()", // coordinate has to be within the mapped read
    "right <= read.getAlignmentEnd()"
  }) // coordinate has to be within the mapped read
  // Hard clips both tails of the read using reference coordinates: first the right tail starting
  // at right, then the left tail ending at left.
  // Returns an empty read when the read is empty, when left == right, or when the left cut falls
  // outside what remains after the first clip.
  private GATKSAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) {
    if (read.isEmpty() || left == right) return GATKSAMRecord.emptyRead(read);
    // refStop = -1 selects the branch of hardClipByReferenceCoordinates that clips from refStart
    // to the end of the read, so despite its name leftTailRead is the read with its right tail
    // already removed
    GATKSAMRecord leftTailRead = hardClipByReferenceCoordinates(right, -1);

    // after clipping one tail, it is possible that the consequent hard clipping of adjacent
    // deletions makes the left cut index no longer part of the read. In that case, clip the read
    // entirely.
    if (left > leftTailRead.getAlignmentEnd()) return GATKSAMRecord.emptyRead(read);

    // a fresh clipper is used so the second clip works on the coordinates of the clipped read
    ReadClipper clipper = new ReadClipper(leftTailRead);
    return clipper.hardClipByReferenceCoordinatesLeftTail(left);
Пример #9
   * Checks if a read contains adaptor sequences. If it does, hard clips them out.
   * <p>Note: To see how a read is checked for adaptor sequence see ReadUtils.getAdaptorBoundary()
   * @return a new read without adaptor sequence
  // Hard clips adaptor sequence off the read, using the boundary reported by
  // ReadUtils.getAdaptorBoundary. Returns the read itself when the boundary cannot be computed or
  // does not fall inside the read.
  private GATKSAMRecord hardClipAdaptorSequence() {
    final int adaptorBoundary = ReadUtils.getAdaptorBoundary(read);

    if (adaptorBoundary == ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY
        || !ReadUtils.isInsideRead(read, adaptorBoundary)) return read;

    // negative strand reads are clipped on the left tail, all other reads on the right tail
    return read.getReadNegativeStrandFlag()
        ? hardClipByReferenceCoordinatesLeftTail(adaptorBoundary)
        : hardClipByReferenceCoordinatesRightTail(adaptorBoundary);
Пример #10
   * Clips a read according to ops and the chosen algorithm.
   * @param algorithm What mode of clipping do you want to apply for the stacked operations.
   * @return the read with the clipping applied.
  // Applies the queued clipping operations, in order, to the read using the given representation.
  // Returns getRead() when no operations are queued (ops == null), and an empty read when the
  // result of clipping is empty.
  public GATKSAMRecord clipRead(ClippingRepresentation algorithm) {
    if (ops == null) return getRead();

    GATKSAMRecord clippedRead = read;
    for (ClippingOp op : getOps()) {
      // each op is checked against the length of the read as clipped so far
      final int readLength = clippedRead.getReadLength();
      // check if the clipped read can still be clipped in the range requested
      if (op.start < readLength) {
        ClippingOp fixedOperation = op;
        // clamp the stop so the op never reaches past the last base of the current read
        if (op.stop >= readLength) fixedOperation = new ClippingOp(op.start, readLength - 1);

        clippedRead = fixedOperation.apply(algorithm, clippedRead);
    // NOTE(review): the closing braces are missing from this excerpt, so it cannot be told from
    // here whether wasClipped is set once after the loop or per applied operation.
    wasClipped = true;
    if (clippedRead.isEmpty()) return GATKSAMRecord.emptyRead(clippedRead);
    return clippedRead;
Пример #11
  // copied from LocusViewTemplate
  // Creates a new GATKSAMRecord bound to the header field and returns it.
  // NOTE(review): readName, contig and alignmentStart are not used in the visible lines; the
  // statements that populate the record appear to be missing from this excerpt.
  protected GATKSAMRecord buildSAMRecord(
      final String readName, final String contig, final int alignmentStart) {
    GATKSAMRecord record = new GATKSAMRecord(header);



    return record;
Пример #12
   * Clips any contiguous tail (left, right or both) with base quality lower than or equal to
   * lowQual using the desired algorithm.
   * <p>This function will look for low quality tails and clip them away with that algorithm. A
   * low quality tail ends when a base has base quality greater than lowQual.
   * @param algorithm the algorithm to use (HardClip, SoftClip, Write N's,...)
   * @param lowQual every base quality lower than or equal to this in the tail of the read will be
   *     hard clipped
   * @return a new read without low quality tails
  // Clips the low quality tails of the read with the given representation. A tail is the maximal
  // run of bases at either end whose quality is <= lowQual.
  // Returns the read itself when it is empty, and an empty read when every base is low quality.
  private GATKSAMRecord clipLowQualEnds(ClippingRepresentation algorithm, byte lowQual) {
    if (read.isEmpty()) return read;

    final byte[] quals = read.getBaseQualities();
    final int readLength = read.getReadLength();
    int leftClipIndex = 0;
    int rightClipIndex = readLength - 1;

    // check how far we can clip both sides
    // afterwards rightClipIndex is the last base to keep and leftClipIndex the first base to keep
    while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual) rightClipIndex--;
    while (leftClipIndex < readLength && quals[leftClipIndex] <= lowQual) leftClipIndex++;

    // if the entire read should be clipped, then return an empty read.
    if (leftClipIndex > rightClipIndex) return GATKSAMRecord.emptyRead(read);

    // the right tail op is queued before the left tail op
    if (rightClipIndex < readLength - 1) {
      this.addOp(new ClippingOp(rightClipIndex + 1, readLength - 1));
    if (leftClipIndex > 0) {
      this.addOp(new ClippingOp(0, leftClipIndex - 1));
    return this.clipRead(algorithm);
Пример #13
  // Builds a Haplotype of contextSize bases and qualities from the read behind pileup element p.
  // Positions that receive no read base keep the REGEXP_WILDCARD base and a quality of 0.0.
  // NOTE(review): several lines are missing from this excerpt (the names of the calls that
  // realign bases, quals and the offset, and the bodies of the two range guards); the notes below
  // mark each gap.
  private Haplotype getHaplotypeFromRead(
      final PileupElement p, final int contextSize, final int locus) {
    final GATKSAMRecord read = p.getRead();
    int readOffsetFromPileup = p.getOffset();

    // start from an all-wildcard haplotype with zero qualities
    final byte[] haplotypeBases = new byte[contextSize];
    Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD);
    final double[] baseQualities = new double[contextSize];
    Arrays.fill(baseQualities, 0.0);

    byte[] readBases = read.getReadBases();
    // NOTE(review): the call name on the right-hand side is missing from this excerpt
    readBases =
            read.getCigar(), readBases); // Adjust the read bases based on the Cigar string
    byte[] readQuals = read.getBaseQualities();
    // NOTE(review): the call name and leading arguments are missing from this excerpt
    readQuals =
            readQuals); // Shift the location of the qual scores based on the Cigar string

    // NOTE(review): the call name on the right-hand side is missing from this excerpt
    readOffsetFromPileup =
            read.getCigar(), p, read.getAlignmentStart(), locus);
    // first index of the window, chosen so the pileup offset lands at window index
    // (contextSize - 1) / 2
    final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

    for (int i = 0; i < contextSize; i++) {
      final int baseOffset = i + baseOffsetStart;
      // NOTE(review): the bodies of the two range guards below are missing from this excerpt;
      // presumably they skip or stop on offsets outside readBases -- confirm.
      if (baseOffset < 0) {
      if (baseOffset >= readBases.length) {
      // deletion markers in the qual array are replaced by the deletion quality
      if (readQuals[baseOffset] == PileupElement.DELETION_BASE) {
        readQuals[baseOffset] = PileupElement.DELETION_QUAL;
      if (!BaseUtils.isRegularBase(readBases[baseOffset])) {
        readBases[baseOffset] = (byte) REGEXP_WILDCARD;
        readQuals[baseOffset] = (byte) 0;
      } // N's shouldn't be treated as distinct bases
      // a base quality is never allowed to exceed the mapping quality of the pileup element
      readQuals[baseOffset] = (byte) Math.min((int) readQuals[baseOffset], p.getMappingQual());
      if (((int) readQuals[baseOffset]) < 5) {
        readQuals[baseOffset] = (byte) 0;
      } // quals less than 5 are used as codes and don't have actual probabilistic meaning behind
        // them
      haplotypeBases[i] = readBases[baseOffset];
      baseQualities[i] = (double) readQuals[baseOffset];

    return new Haplotype(haplotypeBases, baseQualities);
Пример #14
   * Generic functionality to hard clip a read, used internally by
   * hardClipByReferenceCoordinatesLeftTail and hardClipByReferenceCoordinatesRightTail. Should not
   * be used directly.
   * <p>Note, it REQUIRES you to give the directionality of your hard clip (i.e. whether you're
   * clipping the left or right tail) by specifying either refStart < 0 or refStop < 0.
   * @param refStart first base to clip (inclusive)
   * @param refStop last base to clip (inclusive)
   * @return a new read, without the clipped bases
    "refStart < 0 || refStop < 0"
  }) // can't handle unmapped reads, as we're using reference coordinates to clip
  // Hard clips one tail of the read, given in reference coordinates. Exactly one of refStart and
  // refStop must be negative: refStart < 0 clips the left tail from read index 0, refStop < 0
  // clips the right tail through the last read base.
  // Returns the read itself when it is empty. Throws ReviewedStingException when both or neither
  // argument is negative, or when the resulting read range is out of bounds, inverted, or would
  // remove the middle of the read.
  // NOTE(review): the names of the calls that convert reference to read coordinates, and the
  // calls that format the last two exception messages, are missing from this excerpt.
  protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
    if (read.isEmpty()) return read;

    int start;
    int stop;

    // Determine the read coordinate to start and stop hard clipping
    if (refStart < 0) {
      if (refStop < 0)
        throw new ReviewedStingException(
            "Only one of refStart or refStop must be < 0, not both ("
                + refStart
                + ", "
                + refStop
                + ")");
      // left tail: clip from the first read base to the read position matching refStop
      start = 0;
      stop =
              read, refStop, ReadUtils.ClippingTail.LEFT_TAIL);
    } else {
      if (refStop >= 0)
        throw new ReviewedStingException(
            "Either refStart or refStop must be < 0 (" + refStart + ", " + refStop + ")");
      // right tail: clip from the read position matching refStart to the last read base
      start =
              read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL);
      stop = read.getReadLength() - 1;

    if (start < 0 || stop > read.getReadLength() - 1)
      throw new ReviewedStingException(
          "Trying to clip before the start or after the end of a read");

    if (start > stop)
      throw new ReviewedStingException(
              "START (%d) > (%d) STOP -- this should never happen, please check read: %s (CIGAR: %s)",
              start, stop, read, read.getCigarString()));

    // a valid range must touch at least one end of the read
    if (start > 0 && stop < read.getReadLength() - 1)
      throw new ReviewedStingException(
              "Trying to clip the middle of the read: start %d, stop %d, cigar: %s",
              start, stop, read.getCigarString()));

    this.addOp(new ClippingOp(start, stop));
    GATKSAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES);
    // drop the queued ops once they have been applied
    this.ops = null;
    return clippedRead;
Пример #15
   * Turns soft clipped bases into matches
   * @return a new read with every soft clip turned into a match
  // Reverts the soft clipped bases of the read by running clipRead with
  // REVERT_SOFTCLIPPED_BASES. Returns the read itself when it is empty.
  private GATKSAMRecord revertSoftClippedBases() {
    if (read.isEmpty()) return read;

    // NOTE(review): the (0, 0) range looks like a placeholder needed only to have an op queued --
    // confirm how REVERT_SOFTCLIPPED_BASES uses the op range.
    this.addOp(new ClippingOp(0, 0));
    return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES);
  // Picks the most frequently observed indel across all samples at the current locus and returns
  // it as a two element list (reference allele first, alternate allele second).
  // Returns an empty list when neither insertions nor deletions reach minIndelCountForGenotyping,
  // when the best event is seen fewer than minIndelCountForGenotyping times, or when the allele
  // bases are not acceptable.
  // Events are counted in a map keyed by the inserted bases, or by D followed by the event length
  // for deletions.
  // NOTE(review): this excerpt is missing several lines (closing braces, the body of the 454
  // check, the conditions of two ternaries and the terminator of the first DEBUG block comment);
  // the notes below mark the gaps.
  private ArrayList<Allele> computeConsensusAlleles(
      ReferenceContext ref,
      Map<String, AlignmentContext> contexts,
      AlignmentContextUtils.ReadOrientation contextType) {
    Allele refAllele = null, altAllele = null;
    GenomeLoc loc = ref.getLocus();
    ArrayList<Allele> aList = new ArrayList<Allele>();

    // event string -> number of supporting reads
    HashMap<String, Integer> consensusIndelStrings = new HashMap<String, Integer>();

    int insCount = 0, delCount = 0;
    // quick check of total number of indels in pileup
    for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
      AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);

      final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
      insCount += indelPileup.getNumberOfInsertions();
      delCount += indelPileup.getNumberOfDeletions();

    // bail out early when neither event type has enough support
    if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping)
      return aList;

    for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
      // todo -- warning, can be duplicating expensive partition here
      AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);

      final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();

      for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) {
        // SAMRecord read = p.getRead();
        GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead());
        if (read == null) continue;
        // NOTE(review): the body of this check is missing from this excerpt
        if (ReadUtils.is454Read(read)) {

        /*                if (DEBUG && p.isIndel()) {
                         System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n",
                                 p.getEventLength(),p.getType().toString(), p.getEventBases());

        String indelString = p.getEventBases();
        if (p.isInsertion()) {
          boolean foundKey = false;
          if (read.getAlignmentEnd() == loc.getStart()) {
            // first corner condition: a read has an insertion at the end, and we're right at the
            // insertion.
            // In this case, the read could have any of the inserted bases and we need to build a
            // consensus
            // NOTE(review): case 2 below may insert a new key while the key set of the same map
            // is being iterated; on a HashMap that can raise ConcurrentModificationException if
            // the loop continues -- confirm and consider collecting updates first.
            for (String s : consensusIndelStrings.keySet()) {
              int cnt = consensusIndelStrings.get(s);
              if (s.startsWith(indelString)) {
                // case 1: current insertion is prefix of indel in hash map
                consensusIndelStrings.put(s, cnt + 1);
                foundKey = true;
              } else if (indelString.startsWith(s)) {
                // case 2: indel stored in hash table is prefix of current insertion
                // In this case, new bases are new key.
                consensusIndelStrings.put(indelString, cnt + 1);
                foundKey = true;
            if (!foundKey)
              // none of the above: event bases not supported by previous table, so add new key
              consensusIndelStrings.put(indelString, 1);

          } else if (read.getAlignmentStart() == loc.getStart() + 1) {
            // opposite corner condition: read will start at current locus with an insertion
            // NOTE(review): same iterate-and-insert pattern as in the prefix branch above
            for (String s : consensusIndelStrings.keySet()) {
              int cnt = consensusIndelStrings.get(s);
              if (s.endsWith(indelString)) {
                // case 1: current insertion is suffix of indel in hash map
                consensusIndelStrings.put(s, cnt + 1);
                foundKey = true;
              } else if (indelString.endsWith(s)) {
                // case 2: indel stored in hash table is suffix of current insertion
                // In this case, new bases are new key.

                consensusIndelStrings.put(indelString, cnt + 1);
                foundKey = true;
            if (!foundKey)
              // none of the above: event bases not supported by previous table, so add new key
              consensusIndelStrings.put(indelString, 1);

          } else {
            // normal case: insertion somewhere in the middle of a read: add count to hash map
            // NOTE(review): the condition of this ternary is missing from this excerpt
            int cnt =
                    ? consensusIndelStrings.get(indelString)
                    : 0;
            consensusIndelStrings.put(indelString, cnt + 1);

        } else if (p.isDeletion()) {
          // deletions are keyed by length only
          indelString = String.format("D%d", p.getEventLength());
          // NOTE(review): the condition of this ternary is missing from this excerpt
          int cnt =
                  ? consensusIndelStrings.get(indelString)
                  : 0;
          consensusIndelStrings.put(indelString, cnt + 1);

      /*            if (DEBUG) {
          int icount = indelPileup.getNumberOfInsertions();
          int dcount = indelPileup.getNumberOfDeletions();
          if (icount + dcount > 0)
              List<Pair<String,Integer>> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases());
              System.out.format("#ins: %d, #del:%d\n", insCount, delCount);

              for (int i=0 ; i < eventStrings.size() ; i++ ) {
                  //                int k=0;
      }             */

    // pick the event with the highest count; the first one seen wins ties
    int maxAlleleCnt = 0;
    String bestAltAllele = "";
    for (String s : consensusIndelStrings.keySet()) {
      int curCnt = consensusIndelStrings.get(s);
      if (curCnt > maxAlleleCnt) {
        maxAlleleCnt = curCnt;
        bestAltAllele = s;
      //            if (DEBUG)
      //                System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s)  );
    } // gdebug-

    if (maxAlleleCnt < minIndelCountForGenotyping) return aList;

    if (bestAltAllele.startsWith("D")) {
      // get deletion length
      int dLen = Integer.valueOf(bestAltAllele.substring(1));
      // get ref bases of accurate deletion
      // index into the reference window of the base right after the current locus
      int startIdxInReference = (int) (1 + loc.getStart() - ref.getWindow().getStart());

      // System.out.println(new String(ref.getBases()));
      byte[] refBases =
          Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen);

      // deletion: the deleted reference bases form the ref allele, the null allele is the alt
      if (Allele.acceptableAlleleBases(refBases)) {
        refAllele = Allele.create(refBases, true);
        altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
    } else {
      // insertion case
      if (Allele.acceptableAlleleBases(bestAltAllele)) {
        refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
        altAllele = Allele.create(bestAltAllele, false);
    // only report a pair when both alleles could be built
    if (refAllele != null && altAllele != null) {
      aList.add(0, refAllele);
      aList.add(1, altAllele);
    return aList;
Пример #17
  // Disabled test: computes the read group, quality score, context and cycle covariates for a
  // random read of length 10 and checks every per-base key against the expected value.
  // NOTE(review): several lines are missing from this excerpt (attaching rg to the read, any
  // covariate initialization, and the assertion calls wrapping the three context checks).
  @Test(enabled = false)
  public void testCovariateGeneration() {
    final String RGID = "id";
    final int length = 10;
    final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
    GATKSAMRecord read = ReadUtils.createRandomRead(length, false);
    GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(RGID);
    // expected qualities for each event type, taken from the read itself
    final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION);
    final byte[] iQuals = read.getBaseQualities(EventType.BASE_INSERTION);
    final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION);

    ReadGroupCovariate rgCov = new ReadGroupCovariate();
    QualityScoreCovariate qsCov = new QualityScoreCovariate();
    ContextCovariate coCov = new ContextCovariate();
    CycleCovariate cyCov = new CycleCovariate();


    // the position of a covariate in this array is the index used into each key set below
    Covariate[] requestedCovariates = new Covariate[4];
    requestedCovariates[0] = rgCov;
    requestedCovariates[1] = qsCov;
    requestedCovariates[2] = coCov;
    requestedCovariates[3] = cyCov;

    ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates);

    // check that the length is correct
    Assert.assertEquals(rc.getMismatchesKeySet().length, length);
    Assert.assertEquals(rc.getInsertionsKeySet().length, length);
    Assert.assertEquals(rc.getDeletionsKeySet().length, length);

    for (int i = 0; i < length; i++) {
      // check that read group is always the same
      Assert.assertEquals(rgCov.formatKey(rc.getMismatchesKeySet(i)[0]), RGID);
      Assert.assertEquals(rgCov.formatKey(rc.getInsertionsKeySet(i)[0]), RGID);
      Assert.assertEquals(rgCov.formatKey(rc.getDeletionsKeySet(i)[0]), RGID);

      // check quality score
      Assert.assertEquals(qsCov.formatKey(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]);
      Assert.assertEquals(qsCov.formatKey(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]);
      Assert.assertEquals(qsCov.formatKey(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]);

      // check context
      // NOTE(review): the first line of each of these three assertions is missing from this
      // excerpt; only the expected-value arguments remain.
          ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE));
          ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE));
          ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE));

      // check cycle
      // the expected cycle is the 1-based position in the read
      Assert.assertEquals(cyCov.formatKey(rc.getMismatchesKeySet(i)[3]), "" + (i + 1));
      Assert.assertEquals(cyCov.formatKey(rc.getInsertionsKeySet(i)[3]), "" + (i + 1));
      Assert.assertEquals(cyCov.formatKey(rc.getDeletionsKeySet(i)[3]), "" + (i + 1));
 // Returns the mapping quality of the read as a Double; refLoc is not used.
 protected Double getElementForRead(final GATKSAMRecord read, final int refLoc) {
   return (double) read.getMappingQuality();
Пример #19
  // Traverses one locus shard: builds an activity profile from the per-locus isActive results,
  // turns the band-pass filtered profile into active regions, and then processes the regions
  // through processActiveRegions. Returns the updated reduce value sum.
  // NOTE(review): many lines are missing from this excerpt (the right-hand sides of
  // activeRegionExtension and maxRegionSize, the profile.add call for gap loci, the tracker
  // lookup call name, the body of the unseen-read check, the region merge and queueing
  // statements, the logging call and most closing braces); the notes below mark the gaps.
  public T traverse(
      final ActiveRegionWalker<M, T> walker, final LocusShardDataProvider dataProvider, T sum) {
    logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));

    final LocusView locusView = getLocusView(walker, dataProvider);
    final GenomeLocSortedSet initialIntervals = engine.getIntervals();

    final LocusReferenceView referenceView = new LocusReferenceView(walker, dataProvider);
    // NOTE(review): the initializers of the next two declarations are missing from this excerpt
    final int activeRegionExtension =
    final int maxRegionSize =

    if (locusView
        .hasNext()) { // trivial optimization to avoid unnecessary processing when there's nothing
                      // here at all
      // smallest alignment start seen among the reads of this shard
      int minStart = Integer.MAX_VALUE;
      ActivityProfile profile =
          new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions());

      ReferenceOrderedView referenceOrderedDataView =
          getReferenceOrderedView(walker, dataProvider, locusView);

      // We keep processing while the next reference location is within the interval
      GenomeLoc prevLoc = null;
      while (locusView.hasNext()) {
        final AlignmentContext locus = locusView.next();
        GenomeLoc location = locus.getLocation();

        if (prevLoc != null) {
          // fill in the active / inactive labels from the stop of the previous location to the
          // start of this location
          // TODO refactor to separate function
          for (int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++) {
            final GenomeLoc fakeLoc =
                engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
            if (initialIntervals == null || initialIntervals.overlaps(fakeLoc)) {
                  // NOTE(review): the start of this statement and the first operand of the
                  // condition are missing from this excerpt; what remains gives a gap locus a
                  // result of 1.0 when it overlaps the preset active regions and 0.0 otherwise.
                  new ActivityProfileResult(
                              && walker.presetActiveRegions.overlaps(fakeLoc)
                          ? 1.0
                          : 0.0));


        // create reference context. Note that if we have a pileup of "extended events", the context
        // will
        // hold the (longest) stretch of deleted reference bases (if deletions are present in the
        // pileup).
        final ReferenceContext refContext = referenceView.getReferenceContext(location);

        // Iterate forward to get all reference ordered data covering this location
        // NOTE(review): the call name on the right-hand side is missing from this excerpt
        final RefMetaDataTracker tracker =
                locus.getLocation(), refContext);

        // Call the walkers isActive function for this locus and add them to the list to be
        // integrated later
        if (initialIntervals == null || initialIntervals.overlaps(location)) {
          profile.add(location, walkerActiveProb(walker, tracker, refContext, locus, location));

        // Grab all the previously unseen reads from this pileup and add them to the massive read
        // list
        for (final PileupElement p : locus.getBasePileup()) {
          final GATKSAMRecord read = p.getRead();
          // NOTE(review): the body of this check is missing from this excerpt
          if (!myReads.contains(read)) {

          // If this is the last pileup for this shard calculate the minimum alignment start so that
          // we know
          // which active regions in the work queue are now safe to process
          minStart = Math.min(minStart, read.getAlignmentStart());

        prevLoc = location;



      // Take the individual isActive calls and integrate them into contiguous active regions and
      // add these blocks of work to the work queue
      // band-pass filter the list of isActive probabilities and turn into active regions
      final ActivityProfile bandPassFiltered = profile.bandPassFilter();
      final List<ActiveRegion> activeRegions =
          bandPassFiltered.createActiveRegions(activeRegionExtension, maxRegionSize);

      // add active regions to queue of regions to process
      // first check if can merge active regions over shard boundaries
      if (!activeRegions.isEmpty()) {
        if (!workQueue.isEmpty()) {
          final ActiveRegion last = workQueue.getLast();
          final ActiveRegion first = activeRegions.get(0);
          // merge candidates: same activity state, contiguous, and small enough when combined
          if (last.isActive == first.isActive
              && last.getLocation().contiguousP(first.getLocation())
              && last.getLocation().size() + first.getLocation().size() <= maxRegionSize) {
                // NOTE(review): the merge statement and the queueing and logging calls around
                // the message below are missing from this excerpt
                new ActiveRegion(

          "Integrated "
              + profile.size()
              + " isActive calls into "
              + activeRegions.size()
              + " regions.");

      // now go and process all of the active regions
      sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());

    return sum;
Пример #20
  // Scores how well the read behind pileup element p matches the haplotype over a window of
  // contextSize bases. The result is the probability-weighted mismatch count minus the expected
  // mismatch count, accumulated over bases whose capped quality is at least 5.
  // NOTE(review): the names of the calls that realign bases, quals and the offset, and the bodies
  // of the two range guards, are missing from this excerpt.
  private double scoreReadAgainstHaplotype(
      final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus) {
    double expected = 0.0;
    double mismatches = 0.0;

    // What's the expected mismatch rate under the model that this read is actually sampled from
    // this haplotype?  Let's assume the consensus base c is a random choice one of A, C, G, or T,
    // and that the observed base is actually from a c with an error rate e.  Since e is the rate
    // at which we'd see a miscalled c, the expected mismatch rate is really e.  So the expected
    // number of mismatches is just sum_i e_i for i from 1..n for n sites
    //
    // Now, what's the probabilistic sum of mismatches?  Suppose that the base b is equal to c.
    // Well, it could actually be a miscall in a matching direction, which would happen at a e / 3
    // rate.  If b != c, then the chance that it is actually a mismatch is 1 - e, since any of the
    // other 3 options would be a mismatch.
    // so the probability-weighted mismatch rate is sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1
    // ... n
    final byte[] haplotypeBases = haplotype.getBases();
    final GATKSAMRecord read = p.getRead();
    byte[] readBases = read.getReadBases();

    // NOTE(review): the call name on the right-hand side is missing from this excerpt
    readBases =
            p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string
    byte[] readQuals = read.getBaseQualities();
    // NOTE(review): the call name and leading arguments are missing from this excerpt
    readQuals =
            readQuals); // Shift the location of the qual scores based on the Cigar string
    int readOffsetFromPileup = p.getOffset();
    // NOTE(review): the call name on the right-hand side is missing from this excerpt
    readOffsetFromPileup =
            p.getRead().getCigar(), p, read.getAlignmentStart(), locus);
    // first index of the window, chosen so the pileup offset lands at window index
    // (contextSize - 1) / 2
    final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

    for (int i = 0; i < contextSize; i++) {
      final int baseOffset = i + baseOffsetStart;
      // NOTE(review): the bodies of the two range guards below are missing from this excerpt;
      // presumably they skip or stop on offsets outside readBases -- confirm.
      if (baseOffset < 0) {
      if (baseOffset >= readBases.length) {

      final byte haplotypeBase = haplotypeBases[i];
      final byte readBase = readBases[baseOffset];

      // a wildcard position in the haplotype matches any read base
      final boolean matched =
          (readBase == haplotypeBase || haplotypeBase == (byte) REGEXP_WILDCARD);
      byte qual = readQuals[baseOffset];
      if (qual == PileupElement.DELETION_BASE) {
        qual = PileupElement.DELETION_QUAL;
      } // calcAlignmentByteArrayOffset fills the readQuals array with DELETION_BASE at deletions
      // a base quality is never allowed to exceed the mapping quality of the pileup element
      qual = (byte) Math.min((int) qual, p.getMappingQual());
      if (((int) qual)
          >= 5) { // quals less than 5 are used as codes and don't have actual probabilistic meaning
                  // behind them
        final double e = QualityUtils.qualToErrorProb(qual);
        expected += e;
        // NOTE(review): the explanation at the top of this method gives the terms as
        // matched ? e / 3 : 1 - e, but this line adds e on a match and 1 - e / 3 on a mismatch.
        // Confirm which of the two is intended before changing either.
        mismatches += matched ? e : 1.0 - e / 3.0;

      // a more sophisticated calculation would include the reference quality, but it's nice to
      // actually penalize
      // the mismatching of poorly determined regions of the consensus

    return mismatches - expected;
Пример #21
   * Hard clips away soft clipped bases that are below the given quality threshold
   * @param read the read
   * @param minQual the minimum base quality score a soft clipped base needs to be kept (inclusive)
   * @return a new read without low quality soft clipped bases
  // Hard clips the outer, low quality part of the soft clipped tails of a read. On each side the
  // soft clipped bases are scanned outward from the aligned bases; the run of bases with quality
  // >= minQual next to the alignment is kept and everything beyond it is hard clipped.
  // Returns an empty read when the read is empty or reports more leading soft clips than it has
  // bases.
  // NOTE(review): the names and leading arguments of the two clipping calls are missing from this
  // excerpt, as are most closing braces.
  public static GATKSAMRecord hardClipLowQualitySoftClips(GATKSAMRecord read, byte minQual) {
    int nLeadingSoftClips = read.getAlignmentStart() - read.getSoftStart();
    if (read.isEmpty() || nLeadingSoftClips > read.getReadLength())
      return GATKSAMRecord.emptyRead(read);

    byte[] quals = read.getBaseQualities(EventType.BASE_SUBSTITUTION);
    // outermost leading soft clipped base still at or above minQual; stays -1 when the soft
    // clipped base next to the alignment is already below minQual
    int left = -1;

    if (nLeadingSoftClips > 0) {
      // scan from the soft clipped base next to the alignment towards the start of the read
      for (int i = nLeadingSoftClips - 1; i >= 0; i--) {
        if (quals[i] >= minQual) left = i;
        else break;

    // outermost trailing soft clipped base still at or above minQual; stays -1 when the soft
    // clipped base next to the alignment is already below minQual
    int right = -1;
    int nTailingSoftClips = read.getSoftEnd() - read.getAlignmentEnd();
    if (nTailingSoftClips > 0) {
      // scan from the soft clipped base next to the alignment towards the end of the read
      for (int i = read.getReadLength() - nTailingSoftClips; i < read.getReadLength(); i++) {
        if (quals[i] >= minQual) right = i;
        else break;

    GATKSAMRecord clippedRead = read;
    if (right >= 0
        && right + 1
            < clippedRead
                .getReadLength()) // only clip if there are softclipped bases (right >= 0) and the
                                  // first high quality soft clip is not the last base (right+1 <
                                  // readlength)
    // NOTE(review): the call name and leading arguments are missing from this excerpt
    clippedRead =
              right + 1,
                  - 1); // first we hard clip the low quality soft clips on the right tail
    // NOTE(review): with left == 1 the guard left - 1 > 0 is false, so the single low quality base
    // at index 0 is not clipped -- confirm whether left - 1 >= 0 was intended.
    if (left >= 0
        && left - 1
            > 0) // only clip if there are softclipped bases (left >= 0) and the first high quality
                 // soft clip is not the last base (left-1 > 0)
    // NOTE(review): the call name and leading arguments are missing from this excerpt
    clippedRead =
              left - 1); // then we hard clip the low quality soft clips on the left tail

    return clippedRead;