Ejemplo n.º 1
0
  private Haplotype getHaplotypeFromRead(
      final PileupElement p, final int contextSize, final int locus) {
    final GATKSAMRecord read = p.getRead();
    int readOffsetFromPileup = p.getOffset();

    final byte[] haplotypeBases = new byte[contextSize];
    Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD);
    final double[] baseQualities = new double[contextSize];
    Arrays.fill(baseQualities, 0.0);

    byte[] readBases = read.getReadBases();
    readBases =
        AlignmentUtils.readToAlignmentByteArray(
            read.getCigar(), readBases); // Adjust the read bases based on the Cigar string
    byte[] readQuals = read.getBaseQualities();
    readQuals =
        AlignmentUtils.readToAlignmentByteArray(
            read.getCigar(),
            readQuals); // Shift the location of the qual scores based on the Cigar string

    readOffsetFromPileup =
        AlignmentUtils.calcAlignmentByteArrayOffset(
            read.getCigar(), p, read.getAlignmentStart(), locus);
    final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

    for (int i = 0; i < contextSize; i++) {
      final int baseOffset = i + baseOffsetStart;
      if (baseOffset < 0) {
        continue;
      }
      if (baseOffset >= readBases.length) {
        break;
      }
      if (readQuals[baseOffset] == PileupElement.DELETION_BASE) {
        readQuals[baseOffset] = PileupElement.DELETION_QUAL;
      }
      if (!BaseUtils.isRegularBase(readBases[baseOffset])) {
        readBases[baseOffset] = (byte) REGEXP_WILDCARD;
        readQuals[baseOffset] = (byte) 0;
      } // N's shouldn't be treated as distinct bases
      readQuals[baseOffset] = (byte) Math.min((int) readQuals[baseOffset], p.getMappingQual());
      if (((int) readQuals[baseOffset]) < 5) {
        readQuals[baseOffset] = (byte) 0;
      } // quals less than 5 are used as codes and don't have actual probabilistic meaning behind
        // them
      haplotypeBases[i] = readBases[baseOffset];
      baseQualities[i] = (double) readQuals[baseOffset];
    }

    return new Haplotype(haplotypeBases, baseQualities);
  }
Ejemplo n.º 2
0
  /**
   * Hard clips away soft clipped bases that are below the given quality threshold
   *
   * @param read the read
   * @param minQual the mininum base quality score to revert the base (inclusive)
   * @return a new read without low quality soft clipped bases
   */
  public static GATKSAMRecord hardClipLowQualitySoftClips(GATKSAMRecord read, byte minQual) {
    int nLeadingSoftClips = read.getAlignmentStart() - read.getSoftStart();
    if (read.isEmpty() || nLeadingSoftClips > read.getReadLength())
      return GATKSAMRecord.emptyRead(read);

    byte[] quals = read.getBaseQualities(EventType.BASE_SUBSTITUTION);
    int left = -1;

    if (nLeadingSoftClips > 0) {
      for (int i = nLeadingSoftClips - 1; i >= 0; i--) {
        if (quals[i] >= minQual) left = i;
        else break;
      }
    }

    int right = -1;
    int nTailingSoftClips = read.getSoftEnd() - read.getAlignmentEnd();
    if (nTailingSoftClips > 0) {
      for (int i = read.getReadLength() - nTailingSoftClips; i < read.getReadLength(); i++) {
        if (quals[i] >= minQual) right = i;
        else break;
      }
    }

    GATKSAMRecord clippedRead = read;
    if (right >= 0
        && right + 1
            < clippedRead
                .getReadLength()) // only clip if there are softclipped bases (right >= 0) and the
                                  // first high quality soft clip is not the last base (right+1 <
                                  // readlength)
    clippedRead =
          hardClipByReadCoordinates(
              clippedRead,
              right + 1,
              clippedRead.getReadLength()
                  - 1); // first we hard clip the low quality soft clips on the right tail
    if (left >= 0
        && left - 1
            > 0) // only clip if there are softclipped bases (left >= 0) and the first high quality
                 // soft clip is not the last base (left-1 > 0)
    clippedRead =
          hardClipByReadCoordinates(
              clippedRead,
              0,
              left - 1); // then we hard clip the low quality soft clips on the left tail

    return clippedRead;
  }
Ejemplo n.º 3
0
  /**
   * Clips any contiguous tail (left, right or both) with base quality lower than lowQual using the
   * desired algorithm.
   *
   * <p>This function will look for low quality tails and hard clip them away. A low quality tail
   * ends when a base has base quality greater than lowQual.
   *
   * @param algorithm the algorithm to use (HardClip, SoftClip, Write N's,...)
   * @param lowQual every base quality lower than or equal to this in the tail of the read will be
   *     hard clipped
   * @return a new read without low quality tails
   */
  private GATKSAMRecord clipLowQualEnds(ClippingRepresentation algorithm, byte lowQual) {
    if (read.isEmpty()) return read;

    final byte[] quals = read.getBaseQualities();
    final int readLength = read.getReadLength();
    int leftClipIndex = 0;
    int rightClipIndex = readLength - 1;

    // check how far we can clip both sides
    while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual) rightClipIndex--;
    while (leftClipIndex < readLength && quals[leftClipIndex] <= lowQual) leftClipIndex++;

    // if the entire read should be clipped, then return an empty read.
    if (leftClipIndex > rightClipIndex) return GATKSAMRecord.emptyRead(read);

    if (rightClipIndex < readLength - 1) {
      this.addOp(new ClippingOp(rightClipIndex + 1, readLength - 1));
    }
    if (leftClipIndex > 0) {
      this.addOp(new ClippingOp(0, leftClipIndex - 1));
    }
    return this.clipRead(algorithm);
  }
Ejemplo n.º 4
0
  @Test(enabled = false)
  public void testCovariateGeneration() {
    final String RGID = "id";
    final int length = 10;
    final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
    GATKSAMRecord read = ReadUtils.createRandomRead(length, false);
    GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(RGID);
    rg.setPlatform("illumina");
    read.setReadGroup(rg);
    final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION);
    final byte[] iQuals = read.getBaseQualities(EventType.BASE_INSERTION);
    final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION);

    ReadGroupCovariate rgCov = new ReadGroupCovariate();
    QualityScoreCovariate qsCov = new QualityScoreCovariate();
    ContextCovariate coCov = new ContextCovariate();
    CycleCovariate cyCov = new CycleCovariate();

    rgCov.initialize(RAC);
    qsCov.initialize(RAC);
    coCov.initialize(RAC);
    cyCov.initialize(RAC);

    Covariate[] requestedCovariates = new Covariate[4];
    requestedCovariates[0] = rgCov;
    requestedCovariates[1] = qsCov;
    requestedCovariates[2] = coCov;
    requestedCovariates[3] = cyCov;

    ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates);

    // check that the length is correct
    Assert.assertEquals(rc.getMismatchesKeySet().length, length);
    Assert.assertEquals(rc.getInsertionsKeySet().length, length);
    Assert.assertEquals(rc.getDeletionsKeySet().length, length);

    for (int i = 0; i < length; i++) {
      // check that read group is always the same
      Assert.assertEquals(rgCov.formatKey(rc.getMismatchesKeySet(i)[0]), RGID);
      Assert.assertEquals(rgCov.formatKey(rc.getInsertionsKeySet(i)[0]), RGID);
      Assert.assertEquals(rgCov.formatKey(rc.getDeletionsKeySet(i)[0]), RGID);

      // check quality score
      Assert.assertEquals(qsCov.formatKey(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]);
      Assert.assertEquals(qsCov.formatKey(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]);
      Assert.assertEquals(qsCov.formatKey(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]);

      // check context
      Assert.assertEquals(
          coCov.formatKey(rc.getMismatchesKeySet(i)[2]),
          ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE));
      Assert.assertEquals(
          coCov.formatKey(rc.getInsertionsKeySet(i)[2]),
          ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE));
      Assert.assertEquals(
          coCov.formatKey(rc.getDeletionsKeySet(i)[2]),
          ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE));

      // check cycle
      Assert.assertEquals(cyCov.formatKey(rc.getMismatchesKeySet(i)[3]), "" + (i + 1));
      Assert.assertEquals(cyCov.formatKey(rc.getInsertionsKeySet(i)[3]), "" + (i + 1));
      Assert.assertEquals(cyCov.formatKey(rc.getDeletionsKeySet(i)[3]), "" + (i + 1));
    }
  }
Ejemplo n.º 5
0
  private double scoreReadAgainstHaplotype(
      final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus) {
    double expected = 0.0;
    double mismatches = 0.0;

    // What's the expected mismatch rate under the model that this read is actually sampled from
    // this haplotype?  Let's assume the consensus base c is a random choice one of A, C, G, or T,
    // and that
    // the observed base is actually from a c with an error rate e.  Since e is the rate at which
    // we'd
    // see a miscalled c, the expected mismatch rate is really e.  So the expected number of
    // mismatches
    // is just sum_i e_i for i from 1..n for n sites
    //
    // Now, what's the probabilistic sum of mismatches?  Suppose that the base b is equal to c.
    // Well, it could
    // actually be a miscall in a matching direction, which would happen at a e / 3 rate.  If b !=
    // c, then
    // the chance that it is actually a mismatch is 1 - e, since any of the other 3 options would be
    // a mismatch.
    // so the probability-weighted mismatch rate is sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1
    // ... n
    final byte[] haplotypeBases = haplotype.getBases();
    final GATKSAMRecord read = p.getRead();
    byte[] readBases = read.getReadBases();

    readBases =
        AlignmentUtils.readToAlignmentByteArray(
            p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string
    byte[] readQuals = read.getBaseQualities();
    readQuals =
        AlignmentUtils.readToAlignmentByteArray(
            p.getRead().getCigar(),
            readQuals); // Shift the location of the qual scores based on the Cigar string
    int readOffsetFromPileup = p.getOffset();
    readOffsetFromPileup =
        AlignmentUtils.calcAlignmentByteArrayOffset(
            p.getRead().getCigar(), p, read.getAlignmentStart(), locus);
    final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

    for (int i = 0; i < contextSize; i++) {
      final int baseOffset = i + baseOffsetStart;
      if (baseOffset < 0) {
        continue;
      }
      if (baseOffset >= readBases.length) {
        break;
      }

      final byte haplotypeBase = haplotypeBases[i];
      final byte readBase = readBases[baseOffset];

      final boolean matched =
          (readBase == haplotypeBase || haplotypeBase == (byte) REGEXP_WILDCARD);
      byte qual = readQuals[baseOffset];
      if (qual == PileupElement.DELETION_BASE) {
        qual = PileupElement.DELETION_QUAL;
      } // calcAlignmentByteArrayOffset fills the readQuals array with DELETION_BASE at deletions
      qual = (byte) Math.min((int) qual, p.getMappingQual());
      if (((int) qual)
          >= 5) { // quals less than 5 are used as codes and don't have actual probabilistic meaning
                  // behind them
        final double e = QualityUtils.qualToErrorProb(qual);
        expected += e;
        mismatches += matched ? e : 1.0 - e / 3.0;
      }

      // a more sophisticated calculation would include the reference quality, but it's nice to
      // actually penalize
      // the mismatching of poorly determined regions of the consensus
    }

    return mismatches - expected;
  }