/** Ensure that splitting read groups still works when dealing with a sample-split pileup. */ @Test public void testSplitBySample() { SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1"); readGroupOne.setSample("sample1"); SAMReadGroupRecord readGroupTwo = new SAMReadGroupRecord("rg2"); readGroupTwo.setSample("sample2"); SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); header.addReadGroup(readGroupOne); header.addReadGroup(readGroupTwo); GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, 10); read1.setAttribute("RG", readGroupOne.getId()); GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 1, 10); read2.setAttribute("RG", readGroupTwo.getId()); GATKSAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header, "read3", 0, 1, 10); read3.setAttribute("RG", readGroupOne.getId()); GATKSAMRecord read4 = ArtificialSAMUtils.createArtificialRead(header, "read4", 0, 1, 10); read4.setAttribute("RG", readGroupTwo.getId()); ReadBackedPileupImpl sample1Pileup = new ReadBackedPileupImpl(null, Arrays.asList(read1, read3), Arrays.asList(1, 1)); ReadBackedPileupImpl sample2Pileup = new ReadBackedPileupImpl(null, Arrays.asList(read2, read4), Arrays.asList(1, 1)); Map<String, ReadBackedPileupImpl> sampleToPileupMap = new HashMap<String, ReadBackedPileupImpl>(); sampleToPileupMap.put(readGroupOne.getSample(), sample1Pileup); sampleToPileupMap.put(readGroupTwo.getSample(), sample2Pileup); ReadBackedPileup compositePileup = new ReadBackedPileupImpl(null, sampleToPileupMap); ReadBackedPileup rg1Pileup = compositePileup.getPileupForReadGroup("rg1"); List<GATKSAMRecord> rg1Reads = rg1Pileup.getReads(); Assert.assertEquals(rg1Reads.size(), 2, "Wrong number of reads in read group rg1"); Assert.assertEquals( rg1Reads.get(0), read1, "Read " + read1.getReadName() + " should be in rg1 but isn't"); Assert.assertEquals( rg1Reads.get(1), read3, "Read " + read3.getReadName() + " should be 
in rg1 but isn't"); ReadBackedPileup rg2Pileup = compositePileup.getPileupForReadGroup("rg2"); List<GATKSAMRecord> rg2Reads = rg2Pileup.getReads(); Assert.assertEquals(rg1Reads.size(), 2, "Wrong number of reads in read group rg2"); Assert.assertEquals( rg2Reads.get(0), read2, "Read " + read2.getReadName() + " should be in rg2 but isn't"); Assert.assertEquals( rg2Reads.get(1), read4, "Read " + read4.getReadName() + " should be in rg2 but isn't"); }
/**
 * Checks per-sample lookup on a pileup built from a sample-to-pileup map.
 *
 * <p>A composite pileup holding one read for "sample1" and one read for "sample2" must return
 * the single sample2 read when asked for sample2, and {@code null} for sample names that were
 * never added.
 */
@Test
public void testGetPileupForSample() {
  final String firstSample = "sample1";
  final String secondSample = "sample2";

  // Header with one read group per sample.
  final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);

  final SAMReadGroupRecord rgOne = new SAMReadGroupRecord("rg1");
  rgOne.setSample(firstSample);
  final SAMReadGroupRecord rgTwo = new SAMReadGroupRecord("rg2");
  rgTwo.setSample(secondSample);

  samHeader.addReadGroup(rgOne);
  samHeader.addReadGroup(rgTwo);

  // One read per read group.
  final GATKSAMRecord firstRead =
      ArtificialSAMUtils.createArtificialRead(samHeader, "read1", 0, 1, 10);
  firstRead.setAttribute("RG", rgOne.getId());

  final GATKSAMRecord secondRead =
      ArtificialSAMUtils.createArtificialRead(samHeader, "read2", 0, 1, 10);
  secondRead.setAttribute("RG", rgTwo.getId());

  final ReadBackedPileupImpl firstSamplePileup =
      new ReadBackedPileupImpl(null, Collections.singletonList(firstRead), 0);
  final ReadBackedPileupImpl secondSamplePileup =
      new ReadBackedPileupImpl(null, Collections.singletonList(secondRead), 0);

  final Map<String, ReadBackedPileupImpl> pileupsBySample =
      new HashMap<String, ReadBackedPileupImpl>();
  pileupsBySample.put(firstSample, firstSamplePileup);
  pileupsBySample.put(secondSample, secondSamplePileup);

  final ReadBackedPileup composite = new ReadBackedPileupImpl(null, pileupsBySample);

  // A known sample yields exactly its own read.
  final ReadBackedPileup forSecondSample = composite.getPileupForSample(secondSample);
  Assert.assertEquals(
      forSecondSample.getNumberOfElements(), 1, "Sample 2 pileup has wrong number of elements");
  Assert.assertEquals(
      forSecondSample.getReads().get(0), secondRead, "Sample 2 pileup has incorrect read");

  // Unknown samples yield null.
  Assert.assertNull(
      composite.getPileupForSample("missing"),
      "Pileup for sample 'missing' should be null but isn't");
  Assert.assertNull(
      composite.getPileupForSample("not here"),
      "Pileup for sample 'not here' should be null but isn't");
}
public void setupTest2( final int ID, final String readGroupId, final SAMReadGroupRecord readGroupRecord, final String sample, final String library, final SAMFileHeader header, final SAMRecordSetBuilder setBuilder) throws IOException { final String separator = ":"; final int contig1 = 0; final int contig2 = 1; final int contig3 = 2; readGroupRecord.setSample(sample); readGroupRecord.setPlatform(platform); readGroupRecord.setLibrary(library); readGroupRecord.setPlatformUnit(readGroupId); setBuilder.setReadGroup(readGroupRecord); setBuilder.setUseNmFlag(true); setBuilder.setHeader(header); final int max = 800; final int min = 1; final Random rg = new Random(5); // add records that align to all 3 chr in reference file for (int i = 0; i < NUM_READS; i++) { final int start = rg.nextInt(max) + min; final String newReadName = READ_NAME + separator + ID + separator + i; if (i <= NUM_READS / 3) { setBuilder.addPair(newReadName, contig1, start + ID, start + ID + LENGTH); } else if (i < (NUM_READS - (NUM_READS / 3))) { setBuilder.addPair(newReadName, contig2, start + ID, start + ID + LENGTH); } else { setBuilder.addPair(newReadName, contig3, start + ID, start + ID + LENGTH); } } }
/**
 * Breaks each annotated contig alignment into per-kmer records tagged with their kmer code and
 * writes the associated tables.
 *
 * <p>Phases, in order:
 *
 * <ol>
 *   <li>Build the kmer-code tables (code character to numeric index and to display name),
 *       applying any overrides present in {@code KMER_CODE_NAMES}.
 *   <li>Load the annotation table {@code ANN} keyed by contig name, derive the kmer size from the
 *       first row, and print two lines per row (ref0 and ref1 coordinates) to {@code cout}.
 *   <li>For every record in {@code CONTIGS} that passes the {@code CONTIG_NAMES} filter and has
 *       an annotation row: convert soft clips to matches, emit one BAM record per kmer to
 *       {@code bout}, collect one IGV entry per kmer, write a line to {@code beout} for
 *       non-secondary/supplementary records, and add a per-contig stats row to {@code sout}.
 *   <li>Write the collected IGV entries to {@code out}.
 * </ol>
 *
 * <p>The BAM writer is closed in a {@code finally} block so it is released even when processing
 * fails part-way through.
 */
@Override
public void execute() {
  log.info("Initializing kmer code map...");

  // Numeric value written to the "Parentage" column of the IGV table for each kmer code.
  Map<Character, Integer> kmerCodeIndices = new HashMap<Character, Integer>();
  kmerCodeIndices.put('0', 1);
  kmerCodeIndices.put('A', 3);
  kmerCodeIndices.put('B', 4);
  kmerCodeIndices.put('C', 5);
  kmerCodeIndices.put('_', 6);
  kmerCodeIndices.put('.', 7);
  kmerCodeIndices.put('1', 9);

  // Display name per code; insertion-ordered so logs, read groups and stats columns are stable.
  Map<Character, String> kmerCodeNames = new LinkedHashMap<Character, String>();
  kmerCodeNames.put('0', "ref0");
  kmerCodeNames.put('A', "repetitive");
  kmerCodeNames.put('B', "both");
  kmerCodeNames.put('C', "lowcoverage");
  kmerCodeNames.put('_', "lowconfidence");
  kmerCodeNames.put('.', "novel");
  kmerCodeNames.put('1', "ref1");

  // Optional user overrides; only values of existing keys are replaced, so iterating the key
  // set while calling put() does not structurally modify the map.
  if (KMER_CODE_NAMES != null) {
    for (Character c : kmerCodeNames.keySet()) {
      String cStr = String.valueOf(c);

      if (KMER_CODE_NAMES.containsKey(cStr)) {
        kmerCodeNames.put(c, KMER_CODE_NAMES.get(cStr));
      }
    }
  }

  for (Character c : kmerCodeNames.keySet()) {
    log.info(" {} {}: {}", c, kmerCodeIndices.get(c), kmerCodeNames.get(c));
  }

  log.info("Loading annotated contigs...");
  Map<String, Map<String, String>> annotatedContigs = new HashMap<String, Map<String, String>>();
  int kmerSize = 0;

  if (ANN.length() > 0) {
    // NOTE(review): tr is never closed here; confirm whether TableReader holds a resource that
    // needs releasing.
    TableReader tr = new TableReader(ANN);

    for (Map<String, String> te : tr) {
      String contigName = te.get("contigName");

      // kmerOrigin has one character per kmer, so seq length - kmerOrigin length + 1 is the
      // kmer size. It is taken from the first row only.
      if (kmerSize == 0) {
        kmerSize = te.get("seq").length() - te.get("kmerOrigin").length() + 1;
      }

      annotatedContigs.put(contigName, te);

      // "NA" and "*:0-0" are both normalised to "NA:0-0" so the split yields three fields.
      String[] ref0ToCanonicalExact =
          (te.get("ref0ToCanonicalExact").equals("NA")
                      || te.get("ref0ToCanonicalExact").equals("*:0-0")
                  ? "NA:0-0"
                  : te.get("ref0ToCanonicalExact"))
              .split("[:-]");
      String[] ref1ToCanonicalExact =
          (te.get("ref1ToCanonicalExact").equals("NA")
                      || te.get("ref1ToCanonicalExact").equals("*:0-0")
                  ? "NA:0-0"
                  : te.get("ref1ToCanonicalExact"))
              .split("[:-]");

      cout.println(
          te.get("sampleName")
              + "_"
              + te.get("accession")
              + "_"
              + contigName
              + " "
              + ref0ToCanonicalExact[0]
              + " "
              + ref0ToCanonicalExact[1]
              + " "
              + ref0ToCanonicalExact[2]
              + " radius1=0.8r");
      cout.println(
          te.get("sampleName")
              + "_"
              + te.get("accession")
              + "_"
              + contigName
              + " "
              + ref1ToCanonicalExact[0]
              + " "
              + ref1ToCanonicalExact[1]
              + " "
              + ref1ToCanonicalExact[2]
              + " radius2=0.6r");
    }
  }

  log.info(" contigs: {}", annotatedContigs.size());
  log.info(" kmer size: {}", kmerSize);

  log.info("Computing kmer inheritance information...");

  // Register one read group per kmer code so each emitted kmer record can be tagged with it.
  SAMFileHeader sfh = CONTIGS.getFileHeader();
  for (Character c : kmerCodeNames.keySet()) {
    SAMReadGroupRecord rgr = new SAMReadGroupRecord(kmerCodeNames.get(c));
    rgr.setSample(kmerCodeNames.get(c));
    sfh.addReadGroup(rgr);
  }

  SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
  sfwf.setCreateIndex(true);
  SAMFileWriter sfw = sfwf.makeBAMWriter(sfh, false, bout);

  try {
    // NOTE(review): tw is never closed or flushed in this method; confirm TableWriter does not
    // require it.
    TableWriter tw = new TableWriter(sout);

    Set<IGVEntry> igvEntries = new TreeSet<IGVEntry>();
    int numContigs = 0;

    for (SAMRecord contig : CONTIGS) {
      if (CONTIG_NAMES == null
          || CONTIG_NAMES.isEmpty()
          || CONTIG_NAMES.contains(contig.getReadName())) {
        // Stored rows are never null, so a null result means "no annotation for this contig".
        Map<String, String> te = annotatedContigs.get(contig.getReadName());

        if (te != null) {
          String seq = contig.getReadString();
          String annSeq = te.get("seq");
          String kmerOrigin = te.get("kmerOrigin");

          // Code of every kmer in the annotated sequence; position i of kmerOrigin describes
          // the kmer starting at offset i of annSeq.
          Map<CortexKmer, Character> kmerCodes = new HashMap<CortexKmer, Character>();
          for (int i = 0; i < kmerOrigin.length(); i++) {
            CortexKmer kmer = new CortexKmer(annSeq.substring(i, i + kmerSize));
            Character code = kmerOrigin.charAt(i);

            kmerCodes.put(kmer, code);
          }

          Map<Character, Integer> kmerStats = new HashMap<Character, Integer>();
          for (Character c : kmerCodeNames.keySet()) {
            kmerStats.put(c, 0);
          }

          // We want to be able to examine soft-clipped regions as well.
          boolean changed = false;
          List<CigarElement> ces = new ArrayList<CigarElement>();
          for (CigarElement ce : contig.getCigar().getCigarElements()) {
            if (ce.getOperator().equals(CigarOperator.S)) {
              ces.add(new CigarElement(ce.getLength(), CigarOperator.M));
              changed = true;
            } else {
              ces.add(ce);
            }
          }

          if (changed) {
            // A leading soft clip becomes aligned bases, so the start moves left by its length.
            CigarElement firstCe = contig.getCigar().getCigarElements().get(0);

            if (firstCe.getOperator().equals(CigarOperator.S)) {
              contig.setAlignmentStart(contig.getAlignmentStart() - firstCe.getLength());
            }

            contig.setCigar(new Cigar(ces));
          }

          for (AlignmentBlock ab : contig.getAlignmentBlocks()) {
            // NOTE(review): the upper bound runs one offset past the block and the length check
            // below uses '<' rather than '<=', which skips the kmer ending at the last base.
            // Both are preserved as-is; confirm whether they are intentional.
            for (int i = ab.getReadStart() - 1; i < ab.getReadStart() + ab.getLength(); i++) {
              if (i + kmerSize < seq.length()) {
                CortexKmer kmer = new CortexKmer(seq.substring(i, i + kmerSize));

                SAMRecord skmer = new SAMRecord(CONTIGS.getFileHeader());
                skmer.setReadBases(seq.substring(i, i + kmerSize).getBytes());

                List<CigarElement> cigarElements = new ArrayList<CigarElement>();
                cigarElements.add(new CigarElement(kmerSize, CigarOperator.M));
                Cigar cigar = new Cigar(cigarElements);

                skmer.setReadName(contig.getReadName() + "." + kmer.getKmerAsString());
                skmer.setReferenceName(contig.getReferenceName());
                skmer.setCigar(cigar);
                skmer.setReadPairedFlag(false);
                skmer.setDuplicateReadFlag(false);
                skmer.setMateNegativeStrandFlag(false);
                skmer.setAlignmentStart(ab.getReferenceStart() - ab.getReadStart() + 1 + i);

                Character c = kmerCodes.get(kmer);
                String codeName = kmerCodeNames.get(c);

                // Prefer the read group of the kmer code; fall back to the read group that
                // carries the contig's own sample when the code has no configured name.
                String parentReadGroupId = null;
                String sampleReadGroupId = null;
                for (SAMReadGroupRecord rgr : sfh.getReadGroups()) {
                  if (rgr.getSample().equals(codeName)) {
                    parentReadGroupId = rgr.getReadGroupId();
                  }

                  if (rgr.getSample().equals(contig.getReadGroup().getSample())) {
                    sampleReadGroupId = rgr.getReadGroupId();
                  }
                }

                skmer.setAttribute(
                    "RG", parentReadGroupId != null ? parentReadGroupId : sampleReadGroupId);
                skmer.setMappingQuality(99);

                sfw.addAlignment(skmer);

                // Only codes with a configured name are tallied and exported. Without this
                // guard a missing or unrecognised code reached the read-group fallback above
                // and then failed with a NullPointerException when unboxing the missing count.
                if (kmerStats.containsKey(c)) {
                  kmerStats.put(c, kmerStats.get(c) + 1);

                  IGVEntry igvEntry = new IGVEntry();
                  igvEntry.chromosome = contig.getReferenceName();
                  // One less than the alignment start assigned to skmer above.
                  igvEntry.start = ab.getReferenceStart() - ab.getReadStart() + i;
                  igvEntry.parentageName = kmerCodeNames.get(c);
                  igvEntry.parentage = kmerCodeIndices.get(c);
                  igvEntries.add(igvEntry);
                }
              }
            }
          }

          if (!contig.isSecondaryOrSupplementary()) {
            beout.println(
                contig.getReferenceName()
                    + "\t"
                    + contig.getAlignmentStart()
                    + "\t"
                    + contig.getAlignmentEnd()
                    + "\t"
                    + contig.getReadName()
                    + "."
                    + contig.getReadGroup().getSample());

            // Progress roughly every tenth of the annotated contigs; the size > 10 check keeps
            // the divisor non-zero.
            if (annotatedContigs.size() > 10
                && numContigs % (annotatedContigs.size() / 10) == 0) {
              log.info(" processed {}/{} contigs", numContigs, annotatedContigs.size());
            }
            numContigs++;
          }

          Map<String, String> stats = new LinkedHashMap<String, String>();
          stats.put("contigName", contig.getReadName());
          stats.put("sampleName", contig.getReadGroup().getSample());
          for (Character c : kmerCodeNames.keySet()) {
            stats.put(kmerCodeNames.get(c), String.valueOf(kmerStats.get(c)));
          }

          tw.addEntry(stats);
        }
      }
    }

    log.info("Writing kmer inheritance information...");
    out.printf("%s\t%s\t%s\t%s\t%s\n", "Chromosome", "Start", "End", "Feature", "Parentage");
    for (IGVEntry igvEntry : igvEntries) {
      out.printf(
          "%s\t%d\t%d\t%s\t%d\n",
          igvEntry.chromosome,
          igvEntry.start,
          igvEntry.start + 1,
          igvEntry.parentageName,
          igvEntry.parentage);
    }
  } finally {
    // Always release the BAM writer, including when an exception aborts processing.
    sfw.close();
  }
}