/** Ensure that splitting read groups still works when dealing with null read groups. */
  @Test
  public void testSplitByNullReadGroups() {
    SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);

    // Three reads, none of which carry an RG attribute.
    GATKSAMRecord[] reads = {
      ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, 10),
      ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 1, 10),
      ArtificialSAMUtils.createArtificialRead(header, "read3", 0, 1, 10)
    };

    ReadBackedPileup pileup =
        new ReadBackedPileupImpl(null, Arrays.asList(reads), Arrays.asList(1, 1, 1));

    // Querying for the null read group must return every read, in original order.
    ReadBackedPileup nullRgPileup = pileup.getPileupForReadGroup(null);
    Assert.assertEquals(
        nullRgPileup.getNumberOfElements(), 3, "Wrong number of reads in null read group");
    List<GATKSAMRecord> nullRgReads = nullRgPileup.getReads();
    for (int k = 0; k < reads.length; k++) {
      Assert.assertEquals(
          nullRgReads.get(k),
          reads[k],
          "Read " + reads[k].getReadName() + " should be in null rg but isn't");
    }

    // A read group no read belongs to yields no pileup at all.
    ReadBackedPileup rg1Pileup = pileup.getPileupForReadGroup("rg1");
    Assert.assertNull(rg1Pileup, "Pileup for non-existent read group should return null");
  }
  /** Ensure that splitting read groups still works when dealing with a sample-split pileup. */
  @Test
  public void testSplitBySample() {
    SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1");
    readGroupOne.setSample("sample1");
    SAMReadGroupRecord readGroupTwo = new SAMReadGroupRecord("rg2");
    readGroupTwo.setSample("sample2");

    SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
    header.addReadGroup(readGroupOne);
    header.addReadGroup(readGroupTwo);

    GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, 10);
    read1.setAttribute("RG", readGroupOne.getId());
    GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 1, 10);
    read2.setAttribute("RG", readGroupTwo.getId());
    GATKSAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header, "read3", 0, 1, 10);
    read3.setAttribute("RG", readGroupOne.getId());
    GATKSAMRecord read4 = ArtificialSAMUtils.createArtificialRead(header, "read4", 0, 1, 10);
    read4.setAttribute("RG", readGroupTwo.getId());

    ReadBackedPileupImpl sample1Pileup =
        new ReadBackedPileupImpl(null, Arrays.asList(read1, read3), Arrays.asList(1, 1));
    ReadBackedPileupImpl sample2Pileup =
        new ReadBackedPileupImpl(null, Arrays.asList(read2, read4), Arrays.asList(1, 1));
    Map<String, ReadBackedPileupImpl> sampleToPileupMap =
        new HashMap<String, ReadBackedPileupImpl>();
    sampleToPileupMap.put(readGroupOne.getSample(), sample1Pileup);
    sampleToPileupMap.put(readGroupTwo.getSample(), sample2Pileup);

    ReadBackedPileup compositePileup = new ReadBackedPileupImpl(null, sampleToPileupMap);

    ReadBackedPileup rg1Pileup = compositePileup.getPileupForReadGroup("rg1");
    List<GATKSAMRecord> rg1Reads = rg1Pileup.getReads();

    Assert.assertEquals(rg1Reads.size(), 2, "Wrong number of reads in read group rg1");
    Assert.assertEquals(
        rg1Reads.get(0), read1, "Read " + read1.getReadName() + " should be in rg1 but isn't");
    Assert.assertEquals(
        rg1Reads.get(1), read3, "Read " + read3.getReadName() + " should be in rg1 but isn't");

    ReadBackedPileup rg2Pileup = compositePileup.getPileupForReadGroup("rg2");
    List<GATKSAMRecord> rg2Reads = rg2Pileup.getReads();

    Assert.assertEquals(rg1Reads.size(), 2, "Wrong number of reads in read group rg2");
    Assert.assertEquals(
        rg2Reads.get(0), read2, "Read " + read2.getReadName() + " should be in rg2 but isn't");
    Assert.assertEquals(
        rg2Reads.get(1), read4, "Read " + read4.getReadName() + " should be in rg2 but isn't");
  }
  /**
   * Computes per-read likelihoods against every haplotype in {@code haplotypesInVC}, then
   * combines them into a diploid haplotype-pair likelihood matrix and returns the flattened
   * result via {@code getHaplotypeLikelihoods}.
   *
   * @param pileup the pileup whose reads are scored
   * @param haplotypesInVC map from allele to candidate haplotype; iteration order of this map
   *     defines the column order of the likelihood matrix
   * @return haplotype-pair likelihoods as produced by {@code getHaplotypeLikelihoods}
   */
  public double[] computeReadHaplotypeLikelihoods(
      ReadBackedPileup pileup, HashMap<Allele, Haplotype> haplotypesInVC) {
    double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
    // readLikelihoods[i][j] = -10*log10(Pr(read i | haplotype j))
    double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
    int i = 0;
    for (GATKSAMRecord read : pileup.getReads()) {
      // 454 reads are excluded from scoring entirely.
      // NOTE(review): `i` is not incremented on this skip, so scored reads compact into the
      // leading rows and any trailing rows stay all-zero; those zero rows still pass through
      // the accumulation loop below and add a constant log10(2) per skipped read to every
      // matrix cell — a uniform shift across cells, but worth confirming it is intentional.
      if (ReadUtils.is454Read(read)) {
        continue;
      }
      // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
      // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
      int j = 0;
      for (Map.Entry<Allele, Haplotype> a : haplotypesInVC.entrySet()) {
        readLikelihoods[i][j] = computeReadLikelihoodGivenHaplotype(a.getValue(), read);
        if (DEBUG) {
          System.out.print(read.getReadName() + " ");

          System.out.format(
              "%d %d S:%d US:%d E:%d UE:%d C:%s %3.4f\n",
              i,
              j,
              read.getAlignmentStart(),
              read.getUnclippedStart(),
              read.getAlignmentEnd(),
              read.getUnclippedEnd(),
              read.getCigarString(),
              readLikelihoods[i][j]);
        }
        j++;
      }
      i++;
    }

    // Only the upper triangle (j >= i) is filled; pairs are unordered.
    for (i = 0; i < haplotypesInVC.size(); i++) {
      for (int j = i; j < haplotypesInVC.size(); j++) {
        // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
        // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2)
        // readLikelihoods[k][j] has log10(Pr(R_k) | H[j] )
        double[] readLikelihood = new double[2]; // diploid sample
        for (int readIdx = 0; readIdx < pileup.getReads().size(); readIdx++) {
          // Convert from -10*log10 (Phred-like) back to log10 scale.
          readLikelihood[0] = -readLikelihoods[readIdx][i] / 10;
          readLikelihood[1] = -readLikelihoods[readIdx][j] / 10;

          // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+x0^x2)-log10(2)
          // First term is approximated by Jacobian log with table lookup.
          // Second term is a constant added to both likelihoods so will be ignored
          haplotypeLikehoodMatrix[i][j] +=
              MathUtils.approximateLog10SumLog10(readLikelihood[0], readLikelihood[1]);
        }
      }
    }

    return getHaplotypeLikelihoods(haplotypeLikehoodMatrix);
  }
  /** Ensure that basic read group splitting works. */
  @Test
  public void testSplitByReadGroup() {
    SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1");
    SAMReadGroupRecord readGroupTwo = new SAMReadGroupRecord("rg2");

    SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
    header.addReadGroup(readGroupOne);
    header.addReadGroup(readGroupTwo);

    // Seven reads assigned to the two groups in the pattern rg1,rg2,rg1,rg2,rg2,rg1,rg1.
    SAMReadGroupRecord[] owners = {
      readGroupOne, readGroupTwo, readGroupOne, readGroupTwo,
      readGroupTwo, readGroupOne, readGroupOne
    };
    GATKSAMRecord[] reads = new GATKSAMRecord[owners.length];
    for (int k = 0; k < owners.length; k++) {
      reads[k] = ArtificialSAMUtils.createArtificialRead(header, "read" + (k + 1), 0, 1, 10);
      reads[k].setAttribute("RG", owners[k].getId());
    }

    ReadBackedPileup pileup =
        new ReadBackedPileupImpl(
            null, Arrays.asList(reads), Arrays.asList(1, 1, 1, 1, 1, 1, 1));

    // rg1 should contain reads 1, 3, 6 and 7, in their original order.
    ReadBackedPileup rg1Pileup = pileup.getPileupForReadGroup("rg1");
    List<GATKSAMRecord> rg1Reads = rg1Pileup.getReads();
    Assert.assertEquals(rg1Reads.size(), 4, "Wrong number of reads in read group rg1");
    int[] rg1Members = {0, 2, 5, 6};
    for (int k = 0; k < rg1Members.length; k++) {
      GATKSAMRecord expected = reads[rg1Members[k]];
      Assert.assertEquals(
          rg1Reads.get(k),
          expected,
          "Read " + expected.getReadName() + " should be in rg1 but isn't");
    }

    // rg2 should contain reads 2, 4 and 5, in their original order.
    ReadBackedPileup rg2Pileup = pileup.getPileupForReadGroup("rg2");
    List<GATKSAMRecord> rg2Reads = rg2Pileup.getReads();
    Assert.assertEquals(rg2Reads.size(), 3, "Wrong number of reads in read group rg2");
    int[] rg2Members = {1, 3, 4};
    for (int k = 0; k < rg2Members.length; k++) {
      GATKSAMRecord expected = reads[rg2Members[k]];
      Assert.assertEquals(
          rg2Reads.get(k),
          expected,
          "Read " + expected.getReadName() + " should be in rg2 but isn't");
    }
  }
  // Beispiel #5 (stray snippet-aggregator marker; kept as a comment so the file parses)
  /**
   * Clips the bases in read according to this operation's start and stop. Uses the clipping
   * representation used is the one provided by algorithm argument.
   *
   * @param algorithm clipping algorithm to use
   * @param originalRead the read to be clipped
   * @return a clone of {@code originalRead} with the clip applied; the input is never mutated
   */
  public GATKSAMRecord apply(ClippingRepresentation algorithm, GATKSAMRecord originalRead) {
    GATKSAMRecord read = (GATKSAMRecord) originalRead.clone();
    byte[] quals = read.getBaseQualities();
    byte[] bases = read.getReadBases();
    byte[] newBases = new byte[bases.length];
    byte[] newQuals = new byte[quals.length];

    switch (algorithm) {
        // important note:
        //   it's not safe to call read.getReadBases()[i] = 'N' or read.getBaseQualities()[i] = 0
        //   because you're not guaranteed to get a pointer to the actual array of bytes in the
        // GATKSAMRecord
      case WRITE_NS:
        // Mask clipped bases to 'N', leave qualities untouched.
        for (int i = 0; i < bases.length; i++) {
          if (i >= start && i <= stop) {
            newBases[i] = 'N';
          } else {
            newBases[i] = bases[i];
          }
        }
        read.setReadBases(newBases);
        break;
      case WRITE_Q0S:
        // Zero out clipped qualities, leave bases untouched.
        for (int i = 0; i < quals.length; i++) {
          if (i >= start && i <= stop) {
            newQuals[i] = 0;
          } else {
            newQuals[i] = quals[i];
          }
        }
        read.setBaseQualities(newQuals);
        break;
      case WRITE_NS_Q0S:
        // Mask clipped bases to 'N' AND zero out their qualities.
        for (int i = 0; i < bases.length; i++) {
          if (i >= start && i <= stop) {
            newQuals[i] = 0;
            newBases[i] = 'N';
          } else {
            newQuals[i] = quals[i];
            newBases[i] = bases[i];
          }
        }
        // BUG FIX: previously passed newBases here, installing the base array as the
        // quality array and discarding the zeroed qualities entirely.
        read.setBaseQualities(newQuals);
        read.setReadBases(newBases);
        break;
      case HARDCLIP_BASES:
        read = hardClip(read, start, stop);
        break;

      case SOFTCLIP_BASES:
        if (read.getReadUnmappedFlag()) {
          // we can't process unmapped reads
          throw new UserException("Read Clipper cannot soft clip unmapped reads");
        }

        // System.out.printf("%d %d %d%n", stop, start, read.getReadLength());
        int myStop = stop;
        if ((stop + 1 - start) == read.getReadLength()) {
          // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it
          // alone
          // Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be
          // represented with SOFTCLIP_BASES, just leaving it alone", read.getReadName()));
          // break;
          myStop--; // just decrement stop
        }

        // Soft clips may only be applied at either end of the read, never the middle.
        if (start > 0 && myStop != read.getReadLength() - 1)
          throw new RuntimeException(
              String.format(
                  "Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d",
                  read.getReadName(), start, myStop));

        Cigar oldCigar = read.getCigar();

        int scLeft = 0, scRight = read.getReadLength();
        if (start == 0) scLeft = myStop + 1;
        else scRight = start;

        Cigar newCigar = softClip(oldCigar, scLeft, scRight);
        read.setCigar(newCigar);

        // Soft-clipping leading bases shifts the alignment start accordingly.
        int newClippedStart = getNewAlignmentStartOffset(newCigar, oldCigar);
        int newStart = read.getAlignmentStart() + newClippedStart;
        read.setAlignmentStart(newStart);

        break;

      case REVERT_SOFTCLIPPED_BASES:
        read = revertSoftClippedBases(read);
        break;

      default:
        throw new IllegalStateException("Unexpected Clipping operator type " + algorithm);
    }

    return read;
  }