Ejemplo n.º 1
0
/**
 * Pair HMM implementation that performs the logless dynamic programming over the diagonals of a
 * hypothetical read x haplotype alignment matrix, keeping only three generations of diagonal
 * arrays (current, parent, grandparent) instead of full matrices.
 *
 * <p>When successive haplotypes share a common prefix, mid-computation state recorded while
 * processing one haplotype is re-applied for the next one so the shared columns are not
 * recomputed.
 *
 * <p>Created with IntelliJ IDEA. User: bradt Date: 6/11/13
 */
public class ArrayLoglessPairHMM extends PairHMM {
  // Scaling constant used to pad the delete arrays (see padDeleteArrays); its log10 is subtracted
  // from the final result to undo the scaling.
  private static final double INITIAL_CONDITION = Math.pow(2, 1020);
  private static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION);

  // we divide e by 3 because the observed base could have come from any of the non-observed alleles
  protected static final double TRISTATE_CORRECTION = 3.0;

  protected double[][] transition = null; // The transition probabilities cache
  protected double[][] prior = null; // The prior probabilities cache

  // Working arrays for the diagonal currently being filled and the two diagonals before it
  private double[] currentMatchArray = null;
  private double[] currentDeleteArray = null;
  private double[] currentInsertArray = null;
  private double[] parentMatchArray = null;
  private double[] parentDeleteArray = null;
  private double[] parentInsertArray = null;
  private double[] grandparentMatchArray = null;
  private double[] grandparentDeleteArray = null;
  private double[] grandparentInsertArray = null;

  // When successive haplotypes have a common prefix, these arrays store cached info from the
  // previous haplotype; for reading
  private double[] matchCacheArray = null;
  private double[] deleteCacheArray = null;
  private double[] insertCacheArray = null;

  // These arrays store cache info for use with the next haplotype; for writing
  private double[] nextMatchCacheArray = null;
  private double[] nextDeleteCacheArray = null;
  private double[] nextInsertCacheArray = null;

  // Used when caching to store our intermediate sum at the point of first difference between
  // successive haplotypes
  private double partialSum;

  /**
   * {@inheritDoc}
   *
   * <p>Additionally allocates the transition and prior caches and every working and cache array
   * used by this implementation. All arrays are sized with the padded maximum read length, so the
   * last cell of each is available as a padding cell.
   */
  @Override
  public void initialize(final int readMaxLength, final int haplotypeMaxLength) {
    super.initialize(readMaxLength, haplotypeMaxLength);

    transition = PairHMMModel.createTransitionMatrix(maxReadLength);
    prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];

    // Working arrays. Java zero-initializes them, which also zeroes the trailing padding cell.
    currentMatchArray = new double[paddedMaxReadLength];
    currentDeleteArray = new double[paddedMaxReadLength];
    currentInsertArray = new double[paddedMaxReadLength];

    parentMatchArray = new double[paddedMaxReadLength];
    parentDeleteArray = new double[paddedMaxReadLength];
    parentInsertArray = new double[paddedMaxReadLength];

    grandparentMatchArray = new double[paddedMaxReadLength];
    grandparentDeleteArray = new double[paddedMaxReadLength];
    grandparentInsertArray = new double[paddedMaxReadLength];

    // Cache arrays used when successive haplotypes have a common prefix
    matchCacheArray = new double[paddedMaxReadLength];
    deleteCacheArray = new double[paddedMaxReadLength];
    insertCacheArray = new double[paddedMaxReadLength];

    nextMatchCacheArray = new double[paddedMaxReadLength];
    nextDeleteCacheArray = new double[paddedMaxReadLength];
    nextInsertCacheArray = new double[paddedMaxReadLength];
  }

  /**
   * {@inheritDoc}
   *
   * <p>Runs the dynamic programming diagonal by diagonal. The returned value is the log10 of the
   * sum of the match and insert array values observed at array index 0 over all diagonals, minus
   * the log10 of the scaling constant used to pad the delete arrays.
   *
   * @param hapStartIndex offset into the present haplotype at which the computation resumes;
   *     treated as 0 when {@code recacheReadValues} is true
   * @param recacheReadValues true when a new read is being processed, which forces the transition
   *     probabilities to be recomputed and the computation to restart at haplotype position 0
   * @param nextHapStartIndex offset into the NEXT haplotype at which cached state will be
   *     re-applied; determines which cells and which partial sum are recorded during this call
   */
  @Override
  public double subComputeReadLikelihoodGivenHaplotypeLog10(
      final byte[] haplotypeBases,
      final byte[] readBases,
      final byte[] readQuals,
      final byte[] insertionGOP,
      final byte[] deletionGOP,
      final byte[] overallGCP,
      int hapStartIndex,
      final boolean recacheReadValues,
      final int nextHapStartIndex) {

    final int readLength = readBases.length;

    // A new read cannot reuse anything computed for the previous one, so restart at haplotype
    // position 0. This has to happen BEFORE the priors are filled in: initializePriors only
    // rewrites columns >= its start index, and any column left untouched would still hold values
    // computed from the previous read.
    if (recacheReadValues) {
      hapStartIndex = 0;
    }

    // Transition probabilities depend only on the read, so compute them once per read (or on the
    // very first call).
    if (recacheReadValues || !constantsAreInitialized) {
      initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP);

      // note that we initialized the constants
      constantsAreInitialized = true;
    }

    initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex);

    if (recacheReadValues) {
      // Read length may have changed, so we need to set zero-value padding at the appropriate
      // position.
      padMatchAndInsertArrays(readLength);
    }

    // if we have not cached from a previous haplotype, clear any info we may have accumulated in a
    // previous HMM iteration
    if (hapStartIndex == 0) {
      clearPreviouslyCachedInfo(readLength);

      // Haplotype length may have changed, so we need to set initial-value padding at the
      // appropriate position.
      padDeleteArrays(haplotypeBases.length, readLength);
    }

    // We build up our solution by looking at position [0] in the match, insert arrays. Need to set
    // these to 0 before we start.
    clearArraySolutionPosition();

    // Number of diagonals for a matrix = rows + cols - 1; columns before hapStartIndex are skipped
    final int maxDiagonals = readLength + haplotypeBases.length - hapStartIndex - 1;

    // The diagonal after which the running sum must be remembered for the next haplotype
    final int cacheSumIndex = nextHapStartIndex - hapStartIndex + readLength - 1;

    // The final answer prior to log10 correction; resumes from the sum cached by the previous
    // haplotype (reset to 0 above when nothing is cached)
    double finalArraySumProbabilities = partialSum;

    // Perform dynamic programming using arrays, as if over diagonals of a hypothetical
    // read/haplotype alignment matrix
    for (int i = 1; i <= maxDiagonals; i++) {
      // bounds [startFill, endFill) of the array cells that belong to this diagonal
      final int startFill = Math.max(readLength - i, 0);
      final int endFill = Math.min(maxDiagonals - i + 1, readLength);

      // apply any previously cached array information
      if (i <= readLength) {
        applyPreviouslyCachedInfo(startFill);
      }

      // fill in the cells for our current arrays
      updateArrays(readLength, hapStartIndex, nextHapStartIndex, startFill, endFill, i);

      // The final probability sums the Match and Insert values at array index 0 over all
      // diagonals. This way we ignore all paths that ended in deletions, but we have to sum all
      // the paths ending in the M and I arrays because they are no longer extended.
      // Once i >= readLength, index 0 corresponds to the bottom row of the [read] x [haplotype]
      // matrix. Before that, index 0 still carries the zeros set above.
      finalArraySumProbabilities += currentInsertArray[0] + currentMatchArray[0];

      // Partial sum for caching the next haplotype:
      // At the position of the last similar base between this haplotype and the next one,
      // remember the partial sum so that we can start here on the next haplotype.
      if (i == cacheSumIndex) {
        partialSum = finalArraySumProbabilities;
      }

      rotateArrayReferences();
    }

    // The cache arrays we wrote for this haplotype will be read for the next haplotype.
    rotateCacheArrays();

    // undo the INITIAL_CONDITION scaling in log10 space
    return Math.log10(finalArraySumProbabilities) - INITIAL_CONDITION_LOG10;
  }

  /**
   * Fills the prior matrix with the probability of observing each read base given each haplotype
   * base.
   *
   * <p>Cell {@code [i + 1][j + 1]} holds the value for read base {@code i} and haplotype base
   * {@code j}; row 0 and column 0 are padding and are never written here. Bases that are equal, or
   * where either one is {@code 'N'}, get the quality's probability of being correct; all other
   * pairs get the quality's error probability, divided by {@link #TRISTATE_CORRECTION} unless the
   * tristate correction is disabled.
   *
   * @param haplotypeBases the bases of the haplotype
   * @param readBases the bases of the read
   * @param readQuals the base quality scores of the read
   * @param startIndex first haplotype position (0-based) whose column is recomputed; columns
   *     before it are left untouched (useful when this haplotype shares a prefix with the previous
   *     one)
   */
  public void initializePriors(
      final byte[] haplotypeBases,
      final byte[] readBases,
      final byte[] readQuals,
      final int startIndex) {

    // The divisor does not depend on the cell, so work it out once.
    final double mismatchDivisor = doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION;

    for (int i = 0; i < readBases.length; i++) {
      final byte x = readBases[i];
      final byte qual = readQuals[i];
      final double[] priorRow = prior[i + 1];
      for (int j = startIndex; j < haplotypeBases.length; j++) {
        final byte y = haplotypeBases[j];
        final boolean basesAgree = x == y || x == (byte) 'N' || y == (byte) 'N';
        priorRow[j + 1] =
            basesAgree
                ? QualityUtils.qualToProb(qual)
                : QualityUtils.qualToErrorProb(qual) / mismatchDivisor;
      }
    }
  }

  /**
   * Initializes the matrix that holds all the constants related to quality scores.
   *
   * @param transition the transition matrix to fill in
   * @param insertionGOP insertion quality scores of the read
   * @param deletionGOP deletion quality scores of the read
   * @param overallGCP overall gap continuation penalty
   */
  // NOTE(review): this static method does not set constantsAreInitialized itself; the caller in
  // this class sets the flag right after invoking it.
  @Requires({"insertionGOP != null", "deletionGOP != null", "overallGCP != null"})
  @Ensures("constantsAreInitialized")
  protected static void initializeProbabilities(
      final double[][] transition,
      final byte[] insertionGOP,
      final byte[] deletionGOP,
      final byte[] overallGCP) {
    PairHMMModel.qualToTransProbs(transition, insertionGOP, deletionGOP, overallGCP);
  }

  /**
   * Pad the ends of the Match and Insert arrays with 0. Analogous to setting zeros in the first row
   * in the Match, Insert matrices of N2MemoryPairHMM.
   *
   * @param padPosition Which index in the arrays we wish to pad
   */
  private void padMatchAndInsertArrays(final int padPosition) {
    grandparentMatchArray[padPosition] = 0;
    grandparentInsertArray[padPosition] = 0;
    parentMatchArray[padPosition] = 0;
    parentInsertArray[padPosition] = 0;
    currentMatchArray[padPosition] = 0;
    currentInsertArray[padPosition] = 0;
    matchCacheArray[padPosition] = 0;
    insertCacheArray[padPosition] = 0;
    nextMatchCacheArray[padPosition] = 0;
    nextInsertCacheArray[padPosition] = 0;
  }

  /**
   * Pad the Delete arrays with an initial value. Lets us have free deletions at the beginning of
   * the alignment. Analogous to padding the first row of the Delete matrix of N2MemoryPairHMM.
   *
   * @param haplotypeLength The length of the present haplotype. Necessary for calculating the
   *     initial padding value ({@code INITIAL_CONDITION / haplotypeLength})
   * @param padPosition Which index in the arrays we wish to pad
   */
  private void padDeleteArrays(final int haplotypeLength, final int padPosition) {
    final double initialValue = INITIAL_CONDITION / haplotypeLength;

    // Pad the deletion arrays. Akin to padding the first row in the deletion matrix
    parentDeleteArray[padPosition] = initialValue;
    grandparentDeleteArray[padPosition] = initialValue;
    currentDeleteArray[padPosition] = initialValue;
    deleteCacheArray[padPosition] = initialValue;
    nextDeleteCacheArray[padPosition] = initialValue;
  }

  /**
   * We build up our solution by looking at position [0] in the match, insert arrays. Need to set
   * these to 0 before we start.
   */
  private void clearArraySolutionPosition() {
    grandparentMatchArray[0] = 0;
    grandparentInsertArray[0] = 0;
    parentMatchArray[0] = 0;
    parentInsertArray[0] = 0;
    currentMatchArray[0] = 0;
    currentInsertArray[0] = 0;
  }

  /**
   * Clears cached information saved from the last haplotype, allowing us to start at the beginning
   * of the present haplotype with initial values of 0. Also resets the cached partial sum.
   *
   * @param fillLength How many leading cells of the cache arrays to zero
   */
  private void clearPreviouslyCachedInfo(final int fillLength) {
    Arrays.fill(matchCacheArray, 0, fillLength, 0);
    Arrays.fill(deleteCacheArray, 0, fillLength, 0);
    Arrays.fill(insertCacheArray, 0, fillLength, 0);

    partialSum = 0;
  }

  /**
   * Applies cached information saved from the last haplotype, allowing us to start in the middle of
   * the present haplotype.
   *
   * @param indK the index in the arrays we wish to update with cached info; index {@code indK + 1}
   *     is updated as well
   */
  private void applyPreviouslyCachedInfo(int indK) {
    // apply caching info necessary for calculating current DELETE array values
    parentMatchArray[indK] = matchCacheArray[indK];
    parentDeleteArray[indK] = deleteCacheArray[indK];

    // apply caching info necessary for calculating current MATCH array values
    grandparentMatchArray[indK + 1] = matchCacheArray[indK + 1];
    grandparentDeleteArray[indK + 1] = deleteCacheArray[indK + 1];
    grandparentInsertArray[indK + 1] = insertCacheArray[indK + 1];
  }

  /**
   * Records the mid-process state of one location in the read/haplotype alignment. Writes new cache
   * information for use with the next haplotype we see.
   *
   * @param indK the index in the cache arrays we wish to store information in
   */
  private void recordNewCacheInfo(int indK) {
    nextMatchCacheArray[indK] = currentMatchArray[indK];
    nextDeleteCacheArray[indK] = currentDeleteArray[indK];
    nextInsertCacheArray[indK] = currentInsertArray[indK];
  }

  /**
   * Update the HMM arrays for the current diagonal.
   *
   * @param readLength The length of the read
   * @param hapStartIndex An offset that tells us if we are starting in the middle of the present
   *     haplotype
   * @param nextHapStartIndex An offset that tells us which base in the NEXT haplotype we need to
   *     look at to record new caching info
   * @param startFill The lower bound (inclusive) of the array indices we want to over-write
   * @param endFill The upper bound (exclusive) of the array indices we want to over-write
   * @param iii The index indicating which diagonal of the read/haplotype alignment we are working
   *     on
   */
  private void updateArrays(
      final int readLength,
      final int hapStartIndex,
      final int nextHapStartIndex,
      final int startFill,
      final int endFill,
      final int iii) {

    for (int arrayIndex = startFill; arrayIndex < endFill; arrayIndex++) {
      // translate the array position into a row, column in the priors and transition matrices
      final int matrixRow = readLength - arrayIndex - 1;
      final int matrixCol = iii - matrixRow - 1 + hapStartIndex;

      // update cell for each of our current arrays. Prior, transition matrices are padded +1
      // row,col
      updateArrayCell(arrayIndex, prior[matrixRow + 1][matrixCol + 1], transition[matrixRow + 1]);

      // Set up caching for the next haplotype
      // At the position of the final similar base between this haplotype and the next one, remember
      // the mid-array values
      if (matrixCol == nextHapStartIndex - 1) {
        recordNewCacheInfo(arrayIndex);
      }
    }
  }

  /**
   * Updates a cell in the HMM arrays.
   *
   * <p>The match value is built from the grandparent arrays at {@code indK + 1}, the insert value
   * from the parent arrays at {@code indK + 1}, and the delete value from the parent arrays at
   * {@code indK}.
   *
   * @param indK index in the arrays to update
   * @param prior the prior value for this read base / haplotype base pair
   * @param transition the transition probabilities relevant to this location
   */
  private void updateArrayCell(final int indK, final double prior, final double[] transition) {
    currentMatchArray[indK] =
        prior
            * (grandparentMatchArray[indK + 1] * transition[matchToMatch]
                + grandparentInsertArray[indK + 1] * transition[indelToMatch]
                + grandparentDeleteArray[indK + 1] * transition[indelToMatch]);
    currentInsertArray[indK] =
        parentMatchArray[indK + 1] * transition[matchToInsertion]
            + parentInsertArray[indK + 1] * transition[insertionToInsertion];
    currentDeleteArray[indK] =
        parentMatchArray[indK] * transition[matchToDeletion]
            + parentDeleteArray[indK] * transition[deletionToDeletion];
  }

  /**
   * To prepare for the next diagonal in our loop, each array must be bumped to an older
   * generation: parent becomes grandparent, current becomes parent, and the old grandparent
   * arrays are reused as the new current arrays.
   */
  private void rotateArrayReferences() {
    final double[] tempMatchArray = grandparentMatchArray;
    final double[] tempDeleteArray = grandparentDeleteArray;
    final double[] tempInsertArray = grandparentInsertArray;

    grandparentMatchArray = parentMatchArray;
    grandparentDeleteArray = parentDeleteArray;
    grandparentInsertArray = parentInsertArray;

    parentMatchArray = currentMatchArray;
    parentDeleteArray = currentDeleteArray;
    parentInsertArray = currentInsertArray;

    currentMatchArray = tempMatchArray;
    currentDeleteArray = tempDeleteArray;
    currentInsertArray = tempInsertArray;
  }

  /**
   * To prepare for the next haplotype, the caching info we wrote is copied into the cache-read
   * arrays. The cache-write arrays keep their contents.
   */
  private void rotateCacheArrays() {
    // Read and write cache arrays are always allocated together with the same length, so copy in
    // place rather than allocating three fresh arrays for every haplotype.
    System.arraycopy(nextMatchCacheArray, 0, matchCacheArray, 0, nextMatchCacheArray.length);
    System.arraycopy(nextDeleteCacheArray, 0, deleteCacheArray, 0, nextDeleteCacheArray.length);
    System.arraycopy(nextInsertCacheArray, 0, insertCacheArray, 0, nextInsertCacheArray.length);
  }
}
Ejemplo n.º 2
0
  /**
   * {@inheritDoc}
   *
   * <p>Runs the dynamic programming diagonal by diagonal. The returned value is the log10 of the
   * sum of the match and insert array values observed at array index 0 over all diagonals, minus
   * {@code INITIAL_CONDITION_LOG10}.
   *
   * @param hapStartIndex offset into the present haplotype at which the computation resumes;
   *     treated as 0 when {@code recacheReadValues} is true
   * @param recacheReadValues true when a new read is being processed, which forces the transition
   *     probabilities to be recomputed and the computation to restart at haplotype position 0
   * @param nextHapStartIndex offset into the NEXT haplotype; determines the diagonal at which the
   *     running sum is remembered for the next call
   */
  @Override
  public double subComputeReadLikelihoodGivenHaplotypeLog10(
      final byte[] haplotypeBases,
      final byte[] readBases,
      final byte[] readQuals,
      final byte[] insertionGOP,
      final byte[] deletionGOP,
      final byte[] overallGCP,
      int hapStartIndex,
      final boolean recacheReadValues,
      final int nextHapStartIndex) {

    final int readLength = readBases.length;

    // A new read cannot reuse anything computed for the previous one, so restart at haplotype
    // position 0. This has to happen BEFORE the priors are initialized with hapStartIndex:
    // passing a stale non-zero index would leave the earlier prior columns holding values from
    // the previous read while the computation below restarts at column 0.
    if (recacheReadValues) {
      hapStartIndex = 0;
    }

    // Compute the transition probabilities once per read (or on the very first call) rather than
    // twice when both conditions hold.
    if (recacheReadValues || !constantsAreInitialized) {
      initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP);

      // note that we initialized the constants
      constantsAreInitialized = true;
    }

    initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex);

    if (recacheReadValues) {
      // Read length may have changed, so we need to set zero-value padding at the appropriate
      // position.
      padMatchAndInsertArrays(readLength);
    }

    // if we have not cached from a previous haplotype, clear any info we may have accumulated in a
    // previous HMM iteration
    if (hapStartIndex == 0) {
      clearPreviouslyCachedInfo(readLength);

      // Haplotype length may have changed, so we need to set initial-value padding at the
      // appropriate position.
      padDeleteArrays(haplotypeBases.length, readLength);
    }

    // We build up our solution by looking at position [0] in the match, insert arrays. Need to set
    // these to 0 before we start.
    clearArraySolutionPosition();

    // Number of diagonals for a matrix = rows + cols - 1; columns before hapStartIndex are skipped
    final int maxDiagonals = readLength + haplotypeBases.length - hapStartIndex - 1;

    // The diagonal after which the running sum must be remembered for the next haplotype
    final int cacheSumIndex = nextHapStartIndex - hapStartIndex + readLength - 1;

    // The final answer prior to log10 correction; starts from the stored partial sum
    double finalArraySumProbabilities = partialSum;

    // Perform dynamic programming using arrays, as if over diagonals of a hypothetical
    // read/haplotype alignment matrix
    for (int i = 1; i <= maxDiagonals; i++) {
      // bounds of the array cells we wish to fill for this diagonal
      final int startFill = Math.max(readLength - i, 0);
      final int endFill = Math.min(maxDiagonals - i + 1, readLength);

      // apply any previously cached array information
      if (i <= readLength) {
        applyPreviouslyCachedInfo(startFill);
      }

      // fill in the cells for our current arrays
      updateArrays(readLength, hapStartIndex, nextHapStartIndex, startFill, endFill, i);

      // The final probability sums the Match and Insert values at array index 0 over all
      // diagonals. This way we ignore all paths that ended in deletions, but we have to sum all
      // the paths ending in the M and I arrays because they are no longer extended.
      finalArraySumProbabilities += currentInsertArray[0] + currentMatchArray[0];

      // Partial sum for caching the next haplotype:
      // At the position of the last similar base between this haplotype and the next one,
      // remember the partial sum so that we can start here on the next haplotype.
      if (i == cacheSumIndex) {
        partialSum = finalArraySumProbabilities;
      }

      rotateArrayReferences();
    }

    // The cache arrays we wrote for this haplotype will be read for the next haplotype.
    rotateCacheArrays();

    // undo the initial-condition scaling in log10 space
    return Math.log10(finalArraySumProbabilities) - INITIAL_CONDITION_LOG10;
  }