@Test(dataProvider = "data") public void testProgramGroupAndReadGroupMerge(File inputFiles[], File expectedOutputFile) throws IOException { BufferedReader reader = new BufferedReader(new FileReader(expectedOutputFile)); String line; String expected_output = ""; while ((line = reader.readLine()) != null) { expected_output += line + "\n"; } final List<SAMFileReader> readers = new ArrayList<SAMFileReader>(); final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(); for (final File inFile : inputFiles) { IOUtil.assertFileIsReadable(inFile); final SAMFileReader in = new SAMFileReader(inFile); // We are now checking for zero-length reads, so suppress complaint about that. in.setValidationStringency(ValidationStringency.SILENT); readers.add(in); headers.add(in.getFileHeader()); } final MergingSamRecordIterator iterator; final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true); iterator = new MergingSamRecordIterator(headerMerger, readers, false); ByteArrayOutputStream baos = new ByteArrayOutputStream(); SAMFileWriter writer = new SAMFileWriterFactory().makeSAMWriter(headerMerger.getMergedHeader(), true, baos); while (iterator.hasNext()) { writer.addAlignment(iterator.next()); } writer.close(); String actual_output = StringUtil.bytesToString(baos.toByteArray()); List<String> actual = Arrays.asList(actual_output.split("\\n")); List<String> expected = Arrays.asList(expected_output.split("\\n")); for (int i = 0; i < expected.size(); i++) { if (expected.get(i).startsWith("@")) { Assert.assertTrue(headersEquivalent(actual.get(i), expected.get(i))); } else { List<String> expectedSamParts = Arrays.asList(expected.get(i).split("\\s*")); List<String> actualSamParts = Arrays.asList(actual.get(i).split("\\s*")); for (String exp : expectedSamParts) { Assert.assertTrue(actualSamParts.contains(exp)); } for (String act : actualSamParts) { Assert.assertTrue(expectedSamParts.contains(act)); } } } }
/**
 * Expects a SAMException when the two inputs' sequence dictionaries cannot be merged
 * (the shared sequences appear in incompatible orders).
 */
@Test(expectedExceptions = {SAMException.class})
public void testUnmergeableSequenceDictionary() {
  final String dictionaryOne = sq1 + sq2 + sq5;
  final String dictionaryTwo = sq2 + sq3 + sq4 + sq1;
  final SAMFileReader firstReader =
      new SAMFileReader(new ByteArrayInputStream(StringUtil.stringToBytes(dictionaryOne)));
  final SAMFileReader secondReader =
      new SAMFileReader(new ByteArrayInputStream(StringUtil.stringToBytes(dictionaryTwo)));
  final List<SAMFileHeader> inputHeaders =
      Arrays.asList(firstReader.getFileHeader(), secondReader.getFileHeader());
  // Construction itself should throw; the merger instance is never needed.
  new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, inputHeaders, true);
}
/**
 * Tests that if we've set the merging to false, we get a SAMException for bam's with different
 * dictionaries.
 */
@Test(expectedExceptions = SequenceUtil.SequenceListsDifferException.class)
public void testMergedException() {
  final File[] inputs = {
    new File(TEST_DATA_DIR, "SamFileHeaderMergerTest/Chromosome1to10.bam"),
    new File(TEST_DATA_DIR, "SamFileHeaderMergerTest/Chromosome5to9.bam")
  };
  final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();
  for (final File bam : inputs) {
    IOUtil.assertFileIsReadable(bam);
    headers.add(new SAMFileReader(bam).getFileHeader());
  }
  // With dictionary merging disabled this constructor must throw.
  new SamFileHeaderMerger(SAMFileHeader.SortOrder.unsorted, headers, false);
}
/** * Generates a BAM index file from an input BAM file * * @param reader SAMFileReader for input BAM file * @param output File for output index file */ public static void createIndex(SAMFileReader reader, File output, Log log) { BAMIndexer indexer = new BAMIndexer(output, reader.getFileHeader()); reader.enableFileSource(true); int totalRecords = 0; // create and write the content for (SAMRecord rec : reader) { if (++totalRecords % 1000000 == 0) { if (null != log) log.info(totalRecords + " reads processed ..."); } indexer.processAlignment(rec); } indexer.finish(); }
/**
 * Tests that we can successfully merge two files with overlapping sequence dictionaries
 * (chromosomes 1-10 and chromosomes 5-9) and that per-sequence read counts add up.
 */
@Test
public void testMerging() {
  File INPUT[] = {
    new File(TEST_DATA_DIR, "SamFileHeaderMergerTest/Chromosome1to10.bam"),
    new File(TEST_DATA_DIR, "SamFileHeaderMergerTest/Chromosome5to9.bam")
  };
  final List<SAMFileReader> readers = new ArrayList<SAMFileReader>();
  final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();
  for (final File inFile : INPUT) {
    IOUtil.assertFileIsReadable(inFile);
    final SAMFileReader in = new SAMFileReader(inFile);
    // We are now checking for zero-length reads, so suppress complaint about that.
    in.setValidationStringency(ValidationStringency.SILENT);
    readers.add(in);
    headers.add(in.getFileHeader());
  }
  final MergingSamRecordIterator iterator;
  final SamFileHeaderMerger headerMerger =
      new SamFileHeaderMerger(SAMFileHeader.SortOrder.unsorted, headers, true);
  iterator = new MergingSamRecordIterator(headerMerger, readers, false);
  headerMerger.getMergedHeader();

  // count the total reads, and record read counts for each sequence
  Map<Integer, Integer> seqCounts = new HashMap<Integer, Integer>();
  int totalCount = 0;
  while (iterator.hasNext()) {
    SAMRecord r = iterator.next();
    if (seqCounts.containsKey(r.getReferenceIndex())) {
      seqCounts.put(r.getReferenceIndex(), seqCounts.get(r.getReferenceIndex()) + 1);
    } else {
      seqCounts.put(r.getReferenceIndex(), 1);
    }
    ++totalCount;
  }
  assertEquals(totalCount, 1500);
  for (Integer i : seqCounts.keySet()) {
    if (i < 4 || i > 8) {
      // Sequences outside indices 4-8 come only from the Chromosome1to10 input,
      // so they should each have 100 reads.
      assertEquals(seqCounts.get(i).intValue(), 100);
    } else {
      // Sequences 5-9 (indices 4-8) appear in both inputs, so they should each
      // have 200 reads. (The original comments here were swapped.)
      assertEquals(seqCounts.get(i).intValue(), 200);
    }
  }
}
protected int doWork() { IoUtil.assertFileIsReadable(INPUT); IoUtil.assertFileIsWritable(OUTPUT); final SAMFileReader in = new SAMFileReader(INPUT); // create the read group we'll be using final SAMReadGroupRecord rg = new SAMReadGroupRecord(RGID); rg.setLibrary(RGLB); rg.setPlatform(RGPL); rg.setSample(RGSM); rg.setPlatformUnit(RGPU); if (RGCN != null) rg.setSequencingCenter(RGCN); if (RGDS != null) rg.setDescription(RGDS); if (RGDT != null) rg.setRunDate(RGDT); log.info( String.format( "Created read group ID=%s PL=%s LB=%s SM=%s%n", rg.getId(), rg.getPlatform(), rg.getLibrary(), rg.getSample())); // create the new header and output file final SAMFileHeader inHeader = in.getFileHeader(); final SAMFileHeader outHeader = inHeader.clone(); outHeader.setReadGroups(Arrays.asList(rg)); if (SORT_ORDER != null) outHeader.setSortOrder(SORT_ORDER); final SAMFileWriter outWriter = new SAMFileWriterFactory() .makeSAMOrBAMWriter( outHeader, outHeader.getSortOrder() == inHeader.getSortOrder(), OUTPUT); final ProgressLogger progress = new ProgressLogger(log); for (final SAMRecord read : in) { read.setAttribute(SAMTag.RG.name(), RGID); outWriter.addAlignment(read); progress.record(read); } // cleanup in.close(); outWriter.close(); return 0; }
private int countAlignmentsInWindow( int reference, int window, SAMFileReader reader, int expectedCount) { final int SIXTEEN_K = 1 << 14; // 1 << LinearIndex.BAM_LIDX_SHIFT final int start = window >> 14; // window * SIXTEEN_K; final int stop = ((window + 1) >> 14) - 1; // (window + 1 * SIXTEEN_K) - 1; final String chr = reader.getFileHeader().getSequence(reference).getSequenceName(); // get records for the entire linear index window SAMRecordIterator iter = reader.queryOverlapping(chr, start, stop); SAMRecord rec; int count = 0; while (iter.hasNext()) { rec = iter.next(); count++; if (expectedCount == -1) System.err.println(rec.getReadName()); } iter.close(); return count; }
/**
 * Verifies that merging two headers with overlapping sequence dictionaries yields a merged
 * dictionary containing every input sequence, with each input's relative ordering preserved.
 */
@Test
public void testSequenceDictionaryMerge() {
  final String dictA = sq1 + sq2 + sq5;
  final String dictB = sq2 + sq3 + sq4;
  final SAMFileReader readerA =
      new SAMFileReader(new ByteArrayInputStream(StringUtil.stringToBytes(dictA)));
  final SAMFileReader readerB =
      new SAMFileReader(new ByteArrayInputStream(StringUtil.stringToBytes(dictB)));
  final List<SAMFileHeader> inputHeaders =
      Arrays.asList(readerA.getFileHeader(), readerB.getFileHeader());
  final SAMFileHeader mergedHeader =
      new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, inputHeaders, true)
          .getMergedHeader();
  for (final SAMFileHeader inputHeader : inputHeaders) {
    // Walk each input dictionary; merged indices must exist and be strictly increasing.
    int prevTargetIndex = -1;
    for (final SAMSequenceRecord sequenceRecord :
        inputHeader.getSequenceDictionary().getSequences()) {
      final int targetIndex = mergedHeader.getSequenceIndex(sequenceRecord.getSequenceName());
      Assert.assertNotSame(targetIndex, -1);
      Assert.assertTrue(prevTargetIndex < targetIndex);
      prevTargetIndex = targetIndex;
    }
  }
}
public static void main(String[] argv) { SAMFileReader.setDefaultValidationStringency(SAMFileReader.ValidationStringency.SILENT); // STFU SAMResource sr = null; SAMRegion region = new SAMRegion(); region.range = new Range(); region.range.start = -1; region.range.end = -1; String outfile = null; String target_file = null; SAMCoverage sc = new SAMCoverage(); for (int i = 0; i < argv.length; i++) { if (argv[i].equals("-bam")) { sr = new SAMResource(); // sr.import_data(SAMResourceTags.SAM_URL, argv[++i]); sr.set_file(argv[++i]); sr.detect_sample_id(); } else if (argv[i].equals("-targets")) { target_file = argv[++i]; } else if (argv[i].equals("-tname")) { region.tname = argv[++i]; } else if (argv[i].equals("-verbose")) { sc.set_verbose(true); } else if (argv[i].equals("-tstart")) { region.range.start = Integer.parseInt(argv[++i]); } else if (argv[i].equals("-tend")) { region.range.end = Integer.parseInt(argv[++i]); } else if (argv[i].equals("-of")) { outfile = argv[++i]; } else if (argv[i].equals("-min-quality")) { sc.set_min_quality(Integer.parseInt(argv[++i])); } else { System.err.println("error: unknown switch " + argv[i]); // debug System.exit(1); } } String error = null; if (sr == null) { error = "specify -bam [file]"; } else if (target_file == null) { if (region.tname == null) { error = "specify -tname"; } else if (region.range.start == -1) { error = "specify -tstart"; } else if (region.range.end == -1) { error = "specify -tend"; } } sr.set_region(region); if (error != null) { System.err.println("ERROR: " + error); // debug } else if (target_file != null) { try { File f = new File(target_file); BufferedReader br = new BufferedReader(new FileReader(f)); String line = br.readLine(); String[] headers = line.split("\t"); if (headers[0].equals("Name") && headers[1].equals("Chromosome") && headers[2].equals("Start") && headers[3].equals("End")) { WorkingFile wf = null; if (outfile != null) { wf = new WorkingFile(outfile); sc.setPrintStream(wf.getPrintStream()); } while 
(true) { line = br.readLine(); if (line == null) { // EOF break; } else { String[] row = line.split("\t"); region.tname = row[1]; region.range.start = Integer.parseInt(row[2]); region.range.end = Integer.parseInt(row[3]); // sc.set_name(new String(row[0])); sc.set_name(new String(row[0]) + "," + new String(row[1])); sc.find_coverage(sr); } } if (outfile != null) wf.finish(); } else { throw new IOException("file format error"); } } catch (Exception e) { System.err.println("ERROR: " + e); // debug e.printStackTrace(); System.exit(1); } } else { sc.set_outfile(outfile); sc.find_coverage(sr); } }
/**
 * Annotates per-kmer "inheritance" (parentage) for aligned contigs and writes several outputs:
 * a BAM of one synthetic record per contig kmer (sfw/bout), a per-contig kmer-code stats table
 * (tw/sout), a BED-like contig line per primary alignment (beout), circos-style placement lines
 * (cout), and an IGV-style table of kmer parentage (out).
 *
 * NOTE(review): the fields ANN, CONTIGS, KMER_CODE_NAMES, CONTIG_NAMES and the output streams
 * cout/bout/sout/beout/out are declared elsewhere in this class; descriptions here are inferred
 * from usage — confirm against the field declarations.
 */
@Override
public void execute() {
  log.info("Initializing kmer code map...");
  // Numeric parentage codes, written to the final IGV table's "Parentage" column.
  Map<Character, Integer> kmerCodeIndices = new HashMap<Character, Integer>();
  kmerCodeIndices.put('0', 1);
  kmerCodeIndices.put('A', 3);
  kmerCodeIndices.put('B', 4);
  kmerCodeIndices.put('C', 5);
  kmerCodeIndices.put('_', 6);
  kmerCodeIndices.put('.', 7);
  kmerCodeIndices.put('1', 9);
  // Human-readable names per kmer-origin code; LinkedHashMap preserves this ordering
  // everywhere the codes are iterated (read groups, stats columns, logging).
  Map<Character, String> kmerCodeNames = new LinkedHashMap<Character, String>();
  kmerCodeNames.put('0', "ref0");
  kmerCodeNames.put('A', "repetitive");
  kmerCodeNames.put('B', "both");
  kmerCodeNames.put('C', "lowcoverage");
  kmerCodeNames.put('_', "lowconfidence");
  kmerCodeNames.put('.', "novel");
  kmerCodeNames.put('1', "ref1");
  // Allow user-supplied overrides of the default code names.
  if (KMER_CODE_NAMES != null) {
    for (Character c : kmerCodeNames.keySet()) {
      String cStr = String.valueOf(c);
      if (KMER_CODE_NAMES.containsKey(cStr)) {
        kmerCodeNames.put(c, KMER_CODE_NAMES.get(cStr));
      }
    }
  }
  for (Character c : kmerCodeNames.keySet()) {
    log.info(" {} {}: {}", c, kmerCodeIndices.get(c), kmerCodeNames.get(c));
  }

  log.info("Loading annotated contigs...");
  // contigName -> annotation row from the ANN table.
  Map<String, Map<String, String>> annotatedContigs = new HashMap<String, Map<String, String>>();
  int kmerSize = 0;
  if (ANN.length() > 0) {
    TableReader tr = new TableReader(ANN);
    for (Map<String, String> te : tr) {
      String contigName = te.get("contigName");
      if (kmerSize == 0) {
        // kmerOrigin has one char per kmer start position, so
        // kmerSize = seqLen - originLen + 1 (derived from the first row only).
        kmerSize = te.get("seq").length() - te.get("kmerOrigin").length() + 1;
      }
      annotatedContigs.put(contigName, te);
      // "NA" / "*:0-0" placements are normalized to "NA:0-0" before splitting
      // into {name, start, end} on ':' and '-'.
      String[] ref0ToCanonicalExact =
          (te.get("ref0ToCanonicalExact").equals("NA")
                      || te.get("ref0ToCanonicalExact").equals("*:0-0")
                  ? "NA:0-0"
                  : te.get("ref0ToCanonicalExact"))
              .split("[:-]");
      String[] ref1ToCanonicalExact =
          (te.get("ref1ToCanonicalExact").equals("NA")
                      || te.get("ref1ToCanonicalExact").equals("*:0-0")
                  ? "NA:0-0"
                  : te.get("ref1ToCanonicalExact"))
              .split("[:-]");
      // Two placement lines per contig (ref0/ref1) — presumably circos-style
      // "radius" annotations; confirm against the consumer of cout.
      cout.println(
          te.get("sampleName")
              + "_"
              + te.get("accession")
              + "_"
              + contigName
              + " "
              + ref0ToCanonicalExact[0]
              + " "
              + ref0ToCanonicalExact[1]
              + " "
              + ref0ToCanonicalExact[2]
              + " radius1=0.8r");
      cout.println(
          te.get("sampleName")
              + "_"
              + te.get("accession")
              + "_"
              + contigName
              + " "
              + ref1ToCanonicalExact[0]
              + " "
              + ref1ToCanonicalExact[1]
              + " "
              + ref1ToCanonicalExact[2]
              + " radius2=0.6r");
    }
  }
  log.info(" contigs: {}", annotatedContigs.size());
  log.info(" kmer size: {}", kmerSize);

  log.info("Computing kmer inheritance information...");
  // One read group per code name so per-kmer records can be colored/grouped by parentage.
  SAMFileHeader sfh = CONTIGS.getFileHeader();
  for (Character c : kmerCodeNames.keySet()) {
    SAMReadGroupRecord rgr = new SAMReadGroupRecord(kmerCodeNames.get(c));
    rgr.setSample(kmerCodeNames.get(c));
    sfh.addReadGroup(rgr);
  }
  SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
  sfwf.setCreateIndex(true);
  SAMFileWriter sfw = sfwf.makeBAMWriter(sfh, false, bout);
  TableWriter tw = new TableWriter(sout);
  Set<IGVEntry> igvEntries = new TreeSet<IGVEntry>();
  int numContigs = 0;
  for (SAMRecord contig : CONTIGS) {
    // Optionally restrict processing to an explicit contig-name whitelist.
    if (CONTIG_NAMES == null
        || CONTIG_NAMES.isEmpty()
        || CONTIG_NAMES.contains(contig.getReadName())) {
      Map<String, String> te = annotatedContigs.get(contig.getReadName());
      if (annotatedContigs.containsKey(contig.getReadName())) {
        String seq = contig.getReadString();

        // log.debug(" te: {}", te);

        // Map each kmer of the annotated sequence to its origin code character.
        String annSeq = te.get("seq");
        String kmerOrigin = te.get("kmerOrigin");
        Map<CortexKmer, Character> kmerCodes = new HashMap<CortexKmer, Character>();
        for (int i = 0; i < kmerOrigin.length(); i++) {
          CortexKmer kmer = new CortexKmer(annSeq.substring(i, i + kmerSize));
          Character code = kmerOrigin.charAt(i);
          kmerCodes.put(kmer, code);
        }
        // Per-contig tally of how many kmers fall under each code.
        Map<Character, Integer> kmerStats = new HashMap<Character, Integer>();
        for (Character c : kmerCodeNames.keySet()) {
          kmerStats.put(c, 0);
        }
        boolean changed = false;

        // We want to be able to examine soft-clipped regions as well.
        // Rewrite soft-clips (S) as matches (M) so clipped bases get alignment blocks.
        List<CigarElement> ces = new ArrayList<CigarElement>();
        for (CigarElement ce : contig.getCigar().getCigarElements()) {
          if (ce.getOperator().equals(CigarOperator.S)) {
            ces.add(new CigarElement(ce.getLength(), CigarOperator.M));
            changed = true;
          } else {
            ces.add(ce);
          }
        }
        if (changed) {
          // A leading soft-clip means the true start is earlier by its length.
          CigarElement firstCe = contig.getCigar().getCigarElements().get(0);
          if (firstCe.getOperator().equals(CigarOperator.S)) {
            contig.setAlignmentStart(contig.getAlignmentStart() - firstCe.getLength());
          }
          contig.setCigar(new Cigar(ces));
        }

        // Emit one synthetic kmer-sized SAM record per kmer position in each
        // alignment block, tagged with the read group of its parentage code.
        for (AlignmentBlock ab : contig.getAlignmentBlocks()) {
          for (int i = ab.getReadStart() - 1; i < ab.getReadStart() + ab.getLength(); i++) {
            if (i + kmerSize < seq.length()) {
              CortexKmer kmer = new CortexKmer(seq.substring(i, i + kmerSize));
              SAMRecord skmer = new SAMRecord(CONTIGS.getFileHeader());
              skmer.setReadBases(seq.substring(i, i + kmerSize).getBytes());
              List<CigarElement> cigarElements = new ArrayList<CigarElement>();
              cigarElements.add(new CigarElement(kmerSize, CigarOperator.M));
              Cigar cigar = new Cigar(cigarElements);
              skmer.setReadName(contig.getReadName() + "." + kmer.getKmerAsString());
              skmer.setReferenceName(contig.getReferenceName());
              skmer.setCigar(cigar);
              skmer.setReadPairedFlag(false);
              skmer.setDuplicateReadFlag(false);
              skmer.setMateNegativeStrandFlag(false);
              skmer.setAlignmentStart(ab.getReferenceStart() - ab.getReadStart() + 1 + i);
              // Placeholder RG/MAPQ; both are overwritten below once the code is known.
              skmer.setAttribute("RG", "none");
              skmer.setMappingQuality(0);
              // NOTE(review): if this kmer is absent from kmerCodes, c is null and the
              // kmerStats.get(c) update below will NPE — confirm the annotation always
              // covers every aligned kmer.
              Character c = kmerCodes.get(kmer);
              String codeName = kmerCodeNames.get(c);
              String parentReadGroupId = null;
              String sampleReadGroupId = null;
              // Resolve the read-group id for the code name, falling back to the
              // contig's own sample read group.
              for (SAMReadGroupRecord rgr : sfh.getReadGroups()) {
                if (rgr.getSample().equals(codeName)) {
                  parentReadGroupId = rgr.getReadGroupId();
                }
                if (rgr.getSample().equals(contig.getReadGroup().getSample())) {
                  sampleReadGroupId = rgr.getReadGroupId();
                }
              }
              skmer.setAttribute(
                  "RG", parentReadGroupId != null ? parentReadGroupId : sampleReadGroupId);
              skmer.setMappingQuality(99);
              sfw.addAlignment(skmer);
              kmerStats.put(c, kmerStats.get(c) + 1);

              // Record the kmer's parentage for the IGV-style table written at the end.
              IGVEntry igvEntry = new IGVEntry();
              igvEntry.chromosome = contig.getReferenceName();
              igvEntry.start = ab.getReferenceStart() - ab.getReadStart() + i;
              igvEntry.parentageName = kmerCodeNames.get(c);
              igvEntry.parentage = kmerCodeIndices.get(c);
              igvEntries.add(igvEntry);
            }
          }
        }

        // Primary alignments only: one BED-like line per contig, plus ~10% progress logs.
        if (!contig.isSecondaryOrSupplementary()) {
          beout.println(
              contig.getReferenceName()
                  + "\t"
                  + contig.getAlignmentStart()
                  + "\t"
                  + contig.getAlignmentEnd()
                  + "\t"
                  + contig.getReadName()
                  + "."
                  + contig.getReadGroup().getSample());
          if (annotatedContigs.size() > 10 && numContigs % (annotatedContigs.size() / 10) == 0) {
            log.info(" processed {}/{} contigs", numContigs, annotatedContigs.size());
          }
          numContigs++;
        }

        // Per-contig stats row: one column per kmer-code name.
        Map<String, String> stats = new LinkedHashMap<String, String>();
        stats.put("contigName", contig.getReadName());
        stats.put("sampleName", contig.getReadGroup().getSample());
        for (Character c : kmerCodeNames.keySet()) {
          stats.put(kmerCodeNames.get(c), String.valueOf(kmerStats.get(c)));
        }
        tw.addEntry(stats);
      }
    }
  }

  log.info("Writing kmer inheritance information...");
  out.printf("%s\t%s\t%s\t%s\t%s\n", "Chromosome", "Start", "End", "Feature", "Parentage");
  for (IGVEntry igvEntry : igvEntries) {
    out.printf(
        "%s\t%d\t%d\t%s\t%d\n",
        igvEntry.chromosome,
        igvEntry.start,
        igvEntry.start + 1,
        igvEntry.parentageName,
        igvEntry.parentage);
  }
  sfw.close();
}