Example #1
0
  /**
   * Builds the stored hash table for a sequence from its sub-k-mer hashes.
   *
   * <p>Only hashes that are less than or equal to a cutoff are retained; the cutoff sits at
   * {@code Integer.MIN_VALUE + (size of the int range) / REDUCTION}. Each retained hash is paired
   * with its index in the array returned by {@code Utils.computeSequenceHashes}, the pairs are
   * sorted by their natural ordering and then converted by {@code storeAsArray}.
   *
   * @param seq the sequence whose string form is hashed
   * @param subKmerSize the k-mer size passed to the hash computation
   * @return the result of {@code storeAsArray} applied to the sorted (hash, index) pairs
   */
  private int[][][] getFullHashes(Sequence seq, int subKmerSize) {
    // width of the signed 32-bit range, held in a long so it cannot overflow
    final long intSpan = (long) Integer.MAX_VALUE - (long) Integer.MIN_VALUE;
    final int cutoff = (int) ((long) Integer.MIN_VALUE + intSpan / (long) REDUCTION);

    // direct hashes of the sequence, one per array slot
    final int[] hashes = Utils.computeSequenceHashes(seq.getString(), subKmerSize);

    // first pass: find out how many hashes survive the cutoff so the array can be sized exactly
    int retained = 0;
    for (int idx = 0; idx < hashes.length; idx++) {
      if (hashes[idx] <= cutoff) {
        retained++;
      }
    }

    // second pass: record each surviving hash together with its index
    final SortableIntPair[] pairs = new SortableIntPair[retained];
    int next = 0;
    for (int idx = 0; idx < hashes.length; idx++) {
      final int hash = hashes[idx];
      if (hash > cutoff) {
        continue;
      }
      pairs[next] = new SortableIntPair(hash, idx);
      next++;
    }

    // in-place sort using the pairs' own ordering (presumably by hash value first; see
    // SortableIntPair)
    Arrays.sort(pairs);

    return storeAsArray(pairs);
  }
Example #2
0
  /**
   * Scores the overlap between this hash table and the hash table of {@code s}.
   *
   * <p>Both tables are walked with a merge-style scan, which relies on their entries being ordered
   * by hash value (the scan always advances the side holding the smaller hash). Every pair of
   * entries with equal hashes contributes a positional shift {@code pos2 - pos1}. The median of
   * the recorded shifts is used to estimate the overlapping region and the largest tolerated
   * deviation from the median; unless disabled, a second pass repeats the scan restricted to that
   * region and keeps, for each run of hits sharing a position in this sequence, the hit whose
   * shift is closest to the median.
   *
   * <p>Finally the hits whose shift lies within the tolerated deviation are counted, and the
   * overlap edges in both sequences are estimated from the smallest and largest matched
   * positions.
   *
   * @param s the hashes of the other sequence
   * @param maxShiftPercent tolerated deviation from the median shift, as a fraction of the
   *     estimated overlap size. A value {@code <= 0} is replaced by its absolute value and
   *     limits the computation to a single scoring pass instead of two.
   * @return an {@code OverlapInfo} built from {@code (score, validCount, a1, a2, b1, b2)}, where
   *     {@code score} is {@code validCount / overlapSize} multiplied by {@code REDUCTION} and
   *     {@code a*}/{@code b*} are the estimated left/right edges in this sequence (1) and in
   *     {@code s} (2); an all-zero {@code OverlapInfo} when no hits are found or fewer than two
   *     hits are valid
   */
  public OverlapInfo getFullScore(OrderKmerHashes s, double maxShiftPercent) {
    int[][][] allKmerHashes = this.orderedHashes;

    // get the kmers of the second sequence
    int[][][] sAllKmerHashes = s.orderedHashes;

    // the merge below reads chunk 0 of both tables unconditionally; with no chunks on either
    // side there is nothing to match, so report "no overlap" instead of failing on the index
    if (allKmerHashes.length == 0 || sAllKmerHashes.length == 0) {
      return new OverlapInfo(0.0, 0, 0, 0, 0, 0);
    }

    // get sizes
    int size1 = this.size();
    int size2 = s.size();

    int kmerSize1 = this.seqLength;
    int kmerSize2 = s.seqLength;

    // init the ok regions: initially every position of both sequences is allowed
    int valid1Lower = 0;
    int valid1Upper = kmerSize1;
    int valid2Lower = 0;
    int valid2Upper = kmerSize2;

    int medianShift = 0;
    int overlapSize = Math.min(kmerSize1, kmerSize2);
    int absMaxShiftInOverlap = Math.max(kmerSize1, kmerSize2);

    // parallel arrays describing each recorded hit; they are grown on demand
    int count = 0;
    int[] posShift = new int[Math.min(size1, size2) / 8 + 1];
    int[] pos1Index = new int[posShift.length];
    int[] pos2Index = new int[posShift.length];

    // check the repeat flag: a non-positive percentage requests a single pass
    int numScoringRepeats = 2;
    if (maxShiftPercent <= 0) {
      numScoringRepeats = 1;
      maxShiftPercent = Math.abs(maxShiftPercent);
    }

    // refine multiple times to get better interval estimate
    for (int repeat = 0; repeat < numScoringRepeats; repeat++) {
      // init counters: ii* selects the chunk, i* the entry inside that chunk
      count = 0;
      int ii1 = 0;
      int ii2 = 0;
      int i1 = 0;
      int i2 = 0;

      // perform merge operation to get the shift and the kmer count
      while (true) {
        if (i1 >= allKmerHashes[ii1].length) {
          ii1++;
          i1 = 0;

          // break if reached end
          if (ii1 >= allKmerHashes.length) {
            break;
          }
        }
        if (i2 >= sAllKmerHashes[ii2].length) {
          ii2++;
          i2 = 0;

          // break if reached end
          if (ii2 >= sAllKmerHashes.length) {
            break;
          }
        }

        // get the values in the array: each entry is {hash, position}
        int hash1 = allKmerHashes[ii1][i1][0];
        int pos1 = allKmerHashes[ii1][i1][1];

        int hash2 = sAllKmerHashes[ii2][i2][0];
        int pos2 = sAllKmerHashes[ii2][i2][1];

        if (hash1 < hash2 || pos1 < valid1Lower || pos1 >= valid1Upper) {
          i1++;
        } else if (hash2 < hash1 || pos2 < valid2Lower || pos2 >= valid2Upper) {
          i2++;
        } else {
          // check if current shift makes sense positionally
          int currShift = pos2 - pos1;
          if (Math.abs(currShift - medianShift) > absMaxShiftInOverlap) {
            // do not record this shift and increase counter
            i2++;
            continue;
          }

          // adjust array size if needed
          if (posShift.length <= count) {
            posShift = Arrays.copyOf(posShift, posShift.length * 2);
            pos1Index = Arrays.copyOf(pos1Index, pos1Index.length * 2);
            pos2Index = Arrays.copyOf(pos2Index, pos2Index.length * 2);
          }

          // record the shift and both positions
          posShift[count] = currShift;
          pos1Index[count] = pos1;
          pos2Index[count] = pos2;

          // if first round, store only first hit; in later rounds the first-sequence entry is
          // kept so every equal-hash entry of the second sequence is paired with it
          if (repeat == 0) {
            i1++;
          }
          i2++;

          count++;
        }
      }

      if (count <= 0) {
        return new OverlapInfo(0.0, 0, 0, 0, 0, 0);
      }

      // pick out only the matches that are best
      if (repeat > 0) {
        int reducedCount = -1;

        // copy over only the best values: consecutive hits sharing pos1 collapse into the one
        // whose shift is closest to the current median
        for (int iter = 0; iter < count; iter++) {
          if (reducedCount >= 0 && pos1Index[reducedCount] == pos1Index[iter]) {
            // if better, record it
            if (Math.abs(posShift[reducedCount] - medianShift)
                > Math.abs(posShift[iter] - medianShift)) {
              pos1Index[reducedCount] = pos1Index[iter];
              pos2Index[reducedCount] = pos2Index[iter];
              posShift[reducedCount] = posShift[iter];
            }
          } else {
            // add the new data
            reducedCount++;
            pos1Index[reducedCount] = pos1Index[iter];
            pos2Index[reducedCount] = pos2Index[iter];
            posShift[reducedCount] = posShift[iter];
          }
        }

        count = reducedCount + 1;
      }

      // count is at least 1 here: the empty case returned above and the reduction always keeps
      // the first hit. quickSelect works on a copy so the recorded order is left intact.
      medianShift = Utils.quickSelect(Arrays.copyOf(posShift, count), count / 2, count);

      // get the actual overlap size
      int leftPosition = Math.max(0, -medianShift);
      int rightPosition = Math.min(kmerSize1, kmerSize2 - medianShift);
      overlapSize = Math.max(0, rightPosition - leftPosition);

      // compute the max possible allowed shift in kmers
      absMaxShiftInOverlap =
          Math.min(Math.max(kmerSize1, kmerSize2), (int) ((double) overlapSize * maxShiftPercent));

      // get the updated borders
      valid1Lower = Math.max(0, -medianShift - absMaxShiftInOverlap);
      valid1Upper = Math.min(kmerSize1, kmerSize2 - medianShift + absMaxShiftInOverlap);
      valid2Lower = Math.max(0, medianShift - absMaxShiftInOverlap);
      valid2Upper = Math.min(kmerSize2, kmerSize1 + medianShift + absMaxShiftInOverlap);
    }

    // storage for edge computation
    int leftEdge1 = Integer.MAX_VALUE;
    int leftEdge2 = Integer.MAX_VALUE;
    int rightEdge1 = Integer.MIN_VALUE;
    int rightEdge2 = Integer.MIN_VALUE;

    // count only the shifts in the correct place
    int validCount = 0;
    for (int iter = 0; iter < count; iter++) {
      int pos1 = pos1Index[iter];
      int pos2 = pos2Index[iter];

      // take only valid values
      if (Math.abs(posShift[iter] - medianShift) > absMaxShiftInOverlap) {
        continue;
      }

      // get the edges
      if (pos1 < leftEdge1) {
        leftEdge1 = pos1;
      }
      if (pos2 < leftEdge2) {
        leftEdge2 = pos2;
      }
      if (pos1 > rightEdge1) {
        rightEdge1 = pos1;
      }
      if (pos2 > rightEdge2) {
        rightEdge2 = pos2;
      }

      validCount++;
    }

    // the edge estimators below divide by (validCount - 1)
    if (validCount <= 1) {
      return new OverlapInfo(0.0, 0, 0, 0, 0, 0);
    }

    // compute the score
    double score = (double) validCount / (double) (overlapSize);

    // get edge info  uniformly minimum variance unbiased (UMVU) estimators
    // a = (n*a-b)/(n-1)
    // b = (n*b-a)/(n-1)
    // The products are formed in long arithmetic: count * position can exceed the int range for
    // long sequences with many hits, and an int product would wrap silently. Clamping is done
    // before narrowing back to int for the same reason.
    long n = validCount;
    double denominator = (double) (validCount - 1);
    int a1 = (int) Math.max(0L, Math.round((n * leftEdge1 - rightEdge1) / denominator));
    int a2 = (int) Math.max(0L, Math.round((n * leftEdge2 - rightEdge2) / denominator));
    int b1 =
        (int) Math.min((long) this.seqLength, Math.round((n * rightEdge1 - leftEdge1) / denominator));
    int b2 =
        (int) Math.min((long) s.seqLength, Math.round((n * rightEdge2 - leftEdge2) / denominator));

    return new OverlapInfo(score * (double) REDUCTION, validCount, a1, a2, b1, b2);
  }