/**
 * Checks read-group splitting on a pileup whose reads have no RG attribute set by this test:
 * requesting the {@code null} read group must return all three reads in their original order,
 * and requesting the read group "rg1" must return {@code null}.
 */
@Test
public void testSplitByNullReadGroups() {
  final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);

  // Build read1..read3; no RG attribute is assigned to any of them here.
  final GATKSAMRecord[] untagged = new GATKSAMRecord[3];
  for (int k = 0; k < untagged.length; k++) {
    untagged[k] = ArtificialSAMUtils.createArtificialRead(samHeader, "read" + (k + 1), 0, 1, 10);
  }

  final ReadBackedPileup allReads =
      new ReadBackedPileupImpl(null, Arrays.asList(untagged), Arrays.asList(1, 1, 1));

  // Splitting on the null read group should hand back every read.
  final ReadBackedPileup nullRgPileup = allReads.getPileupForReadGroup(null);
  final List<GATKSAMRecord> nullRgReads = nullRgPileup.getReads();
  Assert.assertEquals(
      nullRgPileup.getNumberOfElements(), 3, "Wrong number of reads in null read group");
  for (int k = 0; k < untagged.length; k++) {
    Assert.assertEquals(
        nullRgReads.get(k),
        untagged[k],
        "Read " + untagged[k].getReadName() + " should be in null rg but isn't");
  }

  // A read group that no read belongs to yields no pileup at all.
  final ReadBackedPileup rg1Pileup = allReads.getPileupForReadGroup("rg1");
  Assert.assertNull(rg1Pileup, "Pileup for non-existent read group should return null");
}
/** Ensure that splitting read groups still works when dealing with a sample-split pileup. */ @Test public void testSplitBySample() { SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1"); readGroupOne.setSample("sample1"); SAMReadGroupRecord readGroupTwo = new SAMReadGroupRecord("rg2"); readGroupTwo.setSample("sample2"); SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); header.addReadGroup(readGroupOne); header.addReadGroup(readGroupTwo); GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, 10); read1.setAttribute("RG", readGroupOne.getId()); GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 1, 10); read2.setAttribute("RG", readGroupTwo.getId()); GATKSAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header, "read3", 0, 1, 10); read3.setAttribute("RG", readGroupOne.getId()); GATKSAMRecord read4 = ArtificialSAMUtils.createArtificialRead(header, "read4", 0, 1, 10); read4.setAttribute("RG", readGroupTwo.getId()); ReadBackedPileupImpl sample1Pileup = new ReadBackedPileupImpl(null, Arrays.asList(read1, read3), Arrays.asList(1, 1)); ReadBackedPileupImpl sample2Pileup = new ReadBackedPileupImpl(null, Arrays.asList(read2, read4), Arrays.asList(1, 1)); Map<String, ReadBackedPileupImpl> sampleToPileupMap = new HashMap<String, ReadBackedPileupImpl>(); sampleToPileupMap.put(readGroupOne.getSample(), sample1Pileup); sampleToPileupMap.put(readGroupTwo.getSample(), sample2Pileup); ReadBackedPileup compositePileup = new ReadBackedPileupImpl(null, sampleToPileupMap); ReadBackedPileup rg1Pileup = compositePileup.getPileupForReadGroup("rg1"); List<GATKSAMRecord> rg1Reads = rg1Pileup.getReads(); Assert.assertEquals(rg1Reads.size(), 2, "Wrong number of reads in read group rg1"); Assert.assertEquals( rg1Reads.get(0), read1, "Read " + read1.getReadName() + " should be in rg1 but isn't"); Assert.assertEquals( rg1Reads.get(1), read3, "Read " + read3.getReadName() + " should be 
in rg1 but isn't"); ReadBackedPileup rg2Pileup = compositePileup.getPileupForReadGroup("rg2"); List<GATKSAMRecord> rg2Reads = rg2Pileup.getReads(); Assert.assertEquals(rg1Reads.size(), 2, "Wrong number of reads in read group rg2"); Assert.assertEquals( rg2Reads.get(0), read2, "Read " + read2.getReadName() + " should be in rg2 but isn't"); Assert.assertEquals( rg2Reads.get(1), read4, "Read " + read4.getReadName() + " should be in rg2 but isn't"); }
public double[] computeReadHaplotypeLikelihoods( ReadBackedPileup pileup, HashMap<Allele, Haplotype> haplotypesInVC) { double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()]; double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()]; int i = 0; for (GATKSAMRecord read : pileup.getReads()) { if (ReadUtils.is454Read(read)) { continue; } // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi)) // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent int j = 0; for (Map.Entry<Allele, Haplotype> a : haplotypesInVC.entrySet()) { readLikelihoods[i][j] = computeReadLikelihoodGivenHaplotype(a.getValue(), read); if (DEBUG) { System.out.print(read.getReadName() + " "); System.out.format( "%d %d S:%d US:%d E:%d UE:%d C:%s %3.4f\n", i, j, read.getAlignmentStart(), read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(), read.getCigarString(), readLikelihoods[i][j]); } j++; } i++; } for (i = 0; i < haplotypesInVC.size(); i++) { for (int j = i; j < haplotypesInVC.size(); j++) { // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2) // readLikelihoods[k][j] has log10(Pr(R_k) | H[j] ) double[] readLikelihood = new double[2]; // diploid sample for (int readIdx = 0; readIdx < pileup.getReads().size(); readIdx++) { readLikelihood[0] = -readLikelihoods[readIdx][i] / 10; readLikelihood[1] = -readLikelihoods[readIdx][j] / 10; // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+x0^x2)-log10(2) // First term is approximated by Jacobian log with table lookup. // Second term is a constant added to both likelihoods so will be ignored haplotypeLikehoodMatrix[i][j] += MathUtils.approximateLog10SumLog10(readLikelihood[0], readLikelihood[1]); } } } return getHaplotypeLikelihoods(haplotypeLikehoodMatrix); }
/**
 * Checks basic read-group splitting: seven reads are tagged with one of two read groups, put
 * into a single pileup, and the pileup is split by read group id. Each split must contain
 * exactly the reads tagged with that group, in their original relative order.
 */
@Test
public void testSplitByReadGroup() {
  final SAMReadGroupRecord rgOne = new SAMReadGroupRecord("rg1");
  final SAMReadGroupRecord rgTwo = new SAMReadGroupRecord("rg2");

  final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
  samHeader.addReadGroup(rgOne);
  samHeader.addReadGroup(rgTwo);

  // Read group assigned to read1..read7, in that order.
  final SAMReadGroupRecord[] owners = {rgOne, rgTwo, rgOne, rgTwo, rgTwo, rgOne, rgOne};
  final GATKSAMRecord[] reads = new GATKSAMRecord[owners.length];
  for (int k = 0; k < reads.length; k++) {
    reads[k] = ArtificialSAMUtils.createArtificialRead(samHeader, "read" + (k + 1), 0, 1, 10);
    reads[k].setAttribute("RG", owners[k].getId());
  }

  final ReadBackedPileup pileup =
      new ReadBackedPileupImpl(
          null, Arrays.asList(reads), Arrays.asList(1, 1, 1, 1, 1, 1, 1));

  // rg1 should hold read1, read3, read6 and read7.
  final GATKSAMRecord[] expectedRg1 = {reads[0], reads[2], reads[5], reads[6]};
  final ReadBackedPileup rg1Pileup = pileup.getPileupForReadGroup("rg1");
  final List<GATKSAMRecord> rg1Reads = rg1Pileup.getReads();
  Assert.assertEquals(rg1Reads.size(), 4, "Wrong number of reads in read group rg1");
  for (int k = 0; k < expectedRg1.length; k++) {
    Assert.assertEquals(
        rg1Reads.get(k),
        expectedRg1[k],
        "Read " + expectedRg1[k].getReadName() + " should be in rg1 but isn't");
  }

  // rg2 should hold read2, read4 and read5.
  final GATKSAMRecord[] expectedRg2 = {reads[1], reads[3], reads[4]};
  final ReadBackedPileup rg2Pileup = pileup.getPileupForReadGroup("rg2");
  final List<GATKSAMRecord> rg2Reads = rg2Pileup.getReads();
  Assert.assertEquals(rg2Reads.size(), 3, "Wrong number of reads in read group rg2");
  for (int k = 0; k < expectedRg2.length; k++) {
    Assert.assertEquals(
        rg2Reads.get(k),
        expectedRg2[k],
        "Read " + expectedRg2[k].getReadName() + " should be in rg2 but isn't");
  }
}
/** * Clips the bases in read according to this operation's start and stop. Uses the clipping * representation used is the one provided by algorithm argument. * * @param algorithm clipping algorithm to use * @param originalRead the read to be clipped */ public GATKSAMRecord apply(ClippingRepresentation algorithm, GATKSAMRecord originalRead) { GATKSAMRecord read = (GATKSAMRecord) originalRead.clone(); byte[] quals = read.getBaseQualities(); byte[] bases = read.getReadBases(); byte[] newBases = new byte[bases.length]; byte[] newQuals = new byte[quals.length]; switch (algorithm) { // important note: // it's not safe to call read.getReadBases()[i] = 'N' or read.getBaseQualities()[i] = 0 // because you're not guaranteed to get a pointer to the actual array of bytes in the // GATKSAMRecord case WRITE_NS: for (int i = 0; i < bases.length; i++) { if (i >= start && i <= stop) { newBases[i] = 'N'; } else { newBases[i] = bases[i]; } } read.setReadBases(newBases); break; case WRITE_Q0S: for (int i = 0; i < quals.length; i++) { if (i >= start && i <= stop) { newQuals[i] = 0; } else { newQuals[i] = quals[i]; } } read.setBaseQualities(newQuals); break; case WRITE_NS_Q0S: for (int i = 0; i < bases.length; i++) { if (i >= start && i <= stop) { newQuals[i] = 0; newBases[i] = 'N'; } else { newQuals[i] = quals[i]; newBases[i] = bases[i]; } } read.setBaseQualities(newBases); read.setReadBases(newBases); break; case HARDCLIP_BASES: read = hardClip(read, start, stop); break; case SOFTCLIP_BASES: if (read.getReadUnmappedFlag()) { // we can't process unmapped reads throw new UserException("Read Clipper cannot soft clip unmapped reads"); } // System.out.printf("%d %d %d%n", stop, start, read.getReadLength()); int myStop = stop; if ((stop + 1 - start) == read.getReadLength()) { // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it // alone // Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be // represented with 
SOFTCLIP_BASES, just leaving it alone", read.getReadName())); // break; myStop--; // just decrement stop } if (start > 0 && myStop != read.getReadLength() - 1) throw new RuntimeException( String.format( "Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", read.getReadName(), start, myStop)); Cigar oldCigar = read.getCigar(); int scLeft = 0, scRight = read.getReadLength(); if (start == 0) scLeft = myStop + 1; else scRight = start; Cigar newCigar = softClip(oldCigar, scLeft, scRight); read.setCigar(newCigar); int newClippedStart = getNewAlignmentStartOffset(newCigar, oldCigar); int newStart = read.getAlignmentStart() + newClippedStart; read.setAlignmentStart(newStart); break; case REVERT_SOFTCLIPPED_BASES: read = revertSoftClippedBases(read); break; default: throw new IllegalStateException("Unexpected Clipping operator type " + algorithm); } return read; }