예제 #1
0
파일: LowMQ.java 프로젝트: raj76/gatk
  public Map<String, Object> annotate(
      RefMetaDataTracker tracker,
      AnnotatorCompatibleWalker walker,
      ReferenceContext ref,
      Map<String, AlignmentContext> stratifiedContexts,
      VariantContext vc) {
    if (stratifiedContexts.size() == 0) return null;

    double mq0 = 0;
    double mq10 = 0;
    double total = 0;
    for (Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet()) {
      if (!sample.getValue().hasBasePileup()) continue;

      for (PileupElement p : sample.getValue().getBasePileup()) {
        if (p.getMappingQual() == 0) {
          mq0 += 1;
        }
        if (p.getMappingQual() <= 10) {
          mq10 += 1;
        }
        total += 1;
      }
    }
    Map<String, Object> map = new HashMap<String, Object>();
    map.put(
        getKeyNames().get(0), String.format("%.04f,%.04f,%.00f", mq0 / total, mq10 / total, total));
    return map;
  }
예제 #2
0
  private Haplotype getHaplotypeFromRead(
      final PileupElement p, final int contextSize, final int locus) {
    final GATKSAMRecord read = p.getRead();
    int readOffsetFromPileup = p.getOffset();

    final byte[] haplotypeBases = new byte[contextSize];
    Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD);
    final double[] baseQualities = new double[contextSize];
    Arrays.fill(baseQualities, 0.0);

    byte[] readBases = read.getReadBases();
    readBases =
        AlignmentUtils.readToAlignmentByteArray(
            read.getCigar(), readBases); // Adjust the read bases based on the Cigar string
    byte[] readQuals = read.getBaseQualities();
    readQuals =
        AlignmentUtils.readToAlignmentByteArray(
            read.getCigar(),
            readQuals); // Shift the location of the qual scores based on the Cigar string

    readOffsetFromPileup =
        AlignmentUtils.calcAlignmentByteArrayOffset(
            read.getCigar(), p, read.getAlignmentStart(), locus);
    final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

    for (int i = 0; i < contextSize; i++) {
      final int baseOffset = i + baseOffsetStart;
      if (baseOffset < 0) {
        continue;
      }
      if (baseOffset >= readBases.length) {
        break;
      }
      if (readQuals[baseOffset] == PileupElement.DELETION_BASE) {
        readQuals[baseOffset] = PileupElement.DELETION_QUAL;
      }
      if (!BaseUtils.isRegularBase(readBases[baseOffset])) {
        readBases[baseOffset] = (byte) REGEXP_WILDCARD;
        readQuals[baseOffset] = (byte) 0;
      } // N's shouldn't be treated as distinct bases
      readQuals[baseOffset] = (byte) Math.min((int) readQuals[baseOffset], p.getMappingQual());
      if (((int) readQuals[baseOffset]) < 5) {
        readQuals[baseOffset] = (byte) 0;
      } // quals less than 5 are used as codes and don't have actual probabilistic meaning behind
        // them
      haplotypeBases[i] = readBases[baseOffset];
      baseQualities[i] = (double) readQuals[baseOffset];
    }

    return new Haplotype(haplotypeBases, baseQualities);
  }
  private Map<String, Object> annotateSNP(AlignmentContext stratifiedContext, VariantContext vc) {

    if (!stratifiedContext.hasBasePileup()) return null;

    HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>();
    for (Allele allele : vc.getAlternateAlleles()) alleleCounts.put(allele.getBases()[0], 0);

    ReadBackedPileup pileup = stratifiedContext.getBasePileup();
    int totalDepth = pileup.size();

    Map<String, Object> map = new HashMap<String, Object>();
    map.put(getKeyNames().get(0), totalDepth); // put total depth in right away

    if (totalDepth == 0) return map; // done, can not compute FA at 0 coverage!!

    int mq0 = 0; // number of "ref" reads that are acually mq0
    for (PileupElement p : pileup) {
      if (p.getMappingQual() == 0) {
        mq0++;
        continue;
      }
      if (alleleCounts.containsKey(p.getBase())) // non-mq0 read and it's an alt
      alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase()) + 1);
    }

    if (mq0 == totalDepth) return map; // if all reads are mq0, there is nothing left to do

    // we need to add counts in the correct order
    String[] fracs = new String[alleleCounts.size()];
    for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
      fracs[i] =
          String.format(
              "%.3f",
              ((float) alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]))
                  / (totalDepth - mq0));
    }

    map.put(getKeyNames().get(1), fracs);
    return map;
  }
예제 #4
0
  private double scoreReadAgainstHaplotype(
      final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus) {
    double expected = 0.0;
    double mismatches = 0.0;

    // What's the expected mismatch rate under the model that this read is actually sampled from
    // this haplotype?  Let's assume the consensus base c is a random choice one of A, C, G, or T,
    // and that
    // the observed base is actually from a c with an error rate e.  Since e is the rate at which
    // we'd
    // see a miscalled c, the expected mismatch rate is really e.  So the expected number of
    // mismatches
    // is just sum_i e_i for i from 1..n for n sites
    //
    // Now, what's the probabilistic sum of mismatches?  Suppose that the base b is equal to c.
    // Well, it could
    // actually be a miscall in a matching direction, which would happen at a e / 3 rate.  If b !=
    // c, then
    // the chance that it is actually a mismatch is 1 - e, since any of the other 3 options would be
    // a mismatch.
    // so the probability-weighted mismatch rate is sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1
    // ... n
    final byte[] haplotypeBases = haplotype.getBases();
    final GATKSAMRecord read = p.getRead();
    byte[] readBases = read.getReadBases();

    readBases =
        AlignmentUtils.readToAlignmentByteArray(
            p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string
    byte[] readQuals = read.getBaseQualities();
    readQuals =
        AlignmentUtils.readToAlignmentByteArray(
            p.getRead().getCigar(),
            readQuals); // Shift the location of the qual scores based on the Cigar string
    int readOffsetFromPileup = p.getOffset();
    readOffsetFromPileup =
        AlignmentUtils.calcAlignmentByteArrayOffset(
            p.getRead().getCigar(), p, read.getAlignmentStart(), locus);
    final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

    for (int i = 0; i < contextSize; i++) {
      final int baseOffset = i + baseOffsetStart;
      if (baseOffset < 0) {
        continue;
      }
      if (baseOffset >= readBases.length) {
        break;
      }

      final byte haplotypeBase = haplotypeBases[i];
      final byte readBase = readBases[baseOffset];

      final boolean matched =
          (readBase == haplotypeBase || haplotypeBase == (byte) REGEXP_WILDCARD);
      byte qual = readQuals[baseOffset];
      if (qual == PileupElement.DELETION_BASE) {
        qual = PileupElement.DELETION_QUAL;
      } // calcAlignmentByteArrayOffset fills the readQuals array with DELETION_BASE at deletions
      qual = (byte) Math.min((int) qual, p.getMappingQual());
      if (((int) qual)
          >= 5) { // quals less than 5 are used as codes and don't have actual probabilistic meaning
                  // behind them
        final double e = QualityUtils.qualToErrorProb(qual);
        expected += e;
        mismatches += matched ? e : 1.0 - e / 3.0;
      }

      // a more sophisticated calculation would include the reference quality, but it's nice to
      // actually penalize
      // the mismatching of poorly determined regions of the consensus
    }

    return mismatches - expected;
  }