/**
 * Copies every record of {@code BAM_INPUT} to both a SAM and a BAM temporary file and asserts
 * that no record raised an error while being read or written.
 *
 * @throws IOException if a temporary output file cannot be created
 */
@Test
public void testBamIntegers() throws IOException {
    final List<String> errorMessages = new ArrayList<String>();

    // Register the temp files for deletion up front so they are cleaned up even when the
    // assertion at the end (or anything before it) fails.
    final File bamOutput = File.createTempFile("test", ".bam");
    bamOutput.deleteOnExit();
    final File samOutput = File.createTempFile("test", ".sam");
    samOutput.deleteOnExit();

    final SamReader bamReader = SamReaderFactory.makeDefault().open(BAM_INPUT);
    try {
        final SAMFileWriter samWriter =
                new SAMFileWriterFactory().makeWriter(bamReader.getFileHeader(), true, samOutput, null);
        try {
            final SAMFileWriter bamWriter =
                    new SAMFileWriterFactory().makeWriter(bamReader.getFileHeader(), true, bamOutput, null);
            try {
                final SAMRecordIterator iterator = bamReader.iterator();
                while (iterator.hasNext()) {
                    try {
                        final SAMRecord rec = iterator.next();
                        samWriter.addAlignment(rec);
                        bamWriter.addAlignment(rec);
                    } catch (final Throwable e) {
                        // Deliberately broad: the test collects every per-record failure and
                        // reports them together through the assertion below.
                        System.out.println(e.getMessage());
                        errorMessages.add(e.getMessage());
                    }
                }
            } finally {
                bamWriter.close();
            }
        } finally {
            samWriter.close();
        }
    } finally {
        CloserUtil.close(bamReader);
    }

    Assert.assertEquals(errorMessages.size(), 0);
}
/**
 * Reads {@code roundtrip.sam}, writes it back out through a SAM writer, and asserts that the
 * written text is identical to the original file.
 *
 * @throws Exception if reading the input or writing the temporary output fails
 */
@Test(
        description =
                "Read and then write SAM to verify header attribute ordering does not change depending on JVM version")
public void samRoundTrip() throws Exception {
    final File input = new File(TEST_DATA_DIR, "roundtrip.sam");
    final File outputFile = File.createTempFile("roundtrip-out", ".sam");
    outputFile.delete();
    outputFile.deleteOnExit();

    // Every resource is released in a finally block so a failing write does not leak handles.
    final SamReader reader = SamReaderFactory.makeDefault().open(input);
    try {
        final FileOutputStream os = new FileOutputStream(outputFile);
        try {
            final SAMFileWriter writer =
                    new SAMFileWriterFactory().makeSAMWriter(reader.getFileHeader(), false, os);
            try {
                for (final SAMRecord rec : reader) {
                    writer.addAlignment(rec);
                }
            } finally {
                writer.close();
            }
        } finally {
            os.close();
        }
    } finally {
        reader.close();
    }

    final String originalsam;
    final InputStream originalStream = new FileInputStream(input);
    try {
        originalsam = IOUtil.readFully(originalStream);
    } finally {
        originalStream.close();
    }

    final String writtensam;
    final InputStream writtenStream = new FileInputStream(outputFile);
    try {
        writtensam = IOUtil.readFully(writtenStream);
    } finally {
        writtenStream.close();
    }

    Assert.assertEquals(writtensam, originalsam);
}
/**
 * Reads {@code roundtrip.sam}, clears the header reference on each record before writing it
 * through a SAM writer, and asserts that the written text is identical to the original file.
 *
 * @throws Exception if reading the input or writing the temporary output fails
 */
@Test(description = "Write SAM records with null SAMFileHeader")
public void samNullHeaderRoundTrip() throws Exception {
    final File input = new File(TEST_DATA_DIR, "roundtrip.sam");
    final File outputFile = File.createTempFile("nullheader-out", ".sam");
    outputFile.delete();
    outputFile.deleteOnExit();

    // Every resource is released in a finally block so a failing write does not leak handles.
    final SamReader reader = SamReaderFactory.makeDefault().open(input);
    try {
        final FileOutputStream os = new FileOutputStream(outputFile);
        try {
            final SAMFileWriter writer =
                    new SAMFileWriterFactory().makeSAMWriter(reader.getFileHeader(), false, os);
            try {
                for (final SAMRecord rec : reader) {
                    // The point of the test: the record no longer references a header.
                    rec.setHeader(null);
                    writer.addAlignment(rec);
                }
            } finally {
                writer.close();
            }
        } finally {
            os.close();
        }
    } finally {
        reader.close();
    }

    final String originalsam;
    final InputStream originalStream = new FileInputStream(input);
    try {
        originalsam = IOUtil.readFully(originalStream);
    } finally {
        originalStream.close();
    }

    final String writtensam;
    final InputStream writtenStream = new FileInputStream(outputFile);
    try {
        writtensam = IOUtil.readFully(writtenStream);
    } finally {
        writtenStream.close();
    }

    Assert.assertEquals(writtensam, originalsam);
}
@Test(dataProvider = "data") public void testProgramGroupAndReadGroupMerge(File inputFiles[], File expectedOutputFile) throws IOException { BufferedReader reader = new BufferedReader(new FileReader(expectedOutputFile)); String line; String expected_output = ""; while ((line = reader.readLine()) != null) { expected_output += line + "\n"; } final List<SAMFileReader> readers = new ArrayList<SAMFileReader>(); final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(); for (final File inFile : inputFiles) { IOUtil.assertFileIsReadable(inFile); final SAMFileReader in = new SAMFileReader(inFile); // We are now checking for zero-length reads, so suppress complaint about that. in.setValidationStringency(ValidationStringency.SILENT); readers.add(in); headers.add(in.getFileHeader()); } final MergingSamRecordIterator iterator; final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true); iterator = new MergingSamRecordIterator(headerMerger, readers, false); ByteArrayOutputStream baos = new ByteArrayOutputStream(); SAMFileWriter writer = new SAMFileWriterFactory().makeSAMWriter(headerMerger.getMergedHeader(), true, baos); while (iterator.hasNext()) { writer.addAlignment(iterator.next()); } writer.close(); String actual_output = StringUtil.bytesToString(baos.toByteArray()); List<String> actual = Arrays.asList(actual_output.split("\\n")); List<String> expected = Arrays.asList(expected_output.split("\\n")); for (int i = 0; i < expected.size(); i++) { if (expected.get(i).startsWith("@")) { Assert.assertTrue(headersEquivalent(actual.get(i), expected.get(i))); } else { List<String> expectedSamParts = Arrays.asList(expected.get(i).split("\\s*")); List<String> actualSamParts = Arrays.asList(actual.get(i).split("\\s*")); for (String exp : expectedSamParts) { Assert.assertTrue(actualSamParts.contains(exp)); } for (String act : actualSamParts) { Assert.assertTrue(expectedSamParts.contains(act)); } } } }
private void createSmallBam(final File outputFile) { final SAMFileWriterFactory factory = new SAMFileWriterFactory(); factory.setCreateIndex(true); factory.setCreateMd5File(true); final SAMFileHeader header = new SAMFileHeader(); // index only created if coordinate sorted header.setSortOrder(SAMFileHeader.SortOrder.coordinate); header.addSequence(new SAMSequenceRecord("chr1", 123)); final SAMFileWriter writer = factory.makeBAMWriter(header, false, outputFile); fillSmallBam(writer); writer.close(); }
private void createSmallBamToOutputStream(final OutputStream outputStream, boolean binary) { final SAMFileWriterFactory factory = new SAMFileWriterFactory(); factory.setCreateIndex(false); factory.setCreateMd5File(false); final SAMFileHeader header = new SAMFileHeader(); // index only created if coordinate sorted header.setSortOrder(SAMFileHeader.SortOrder.coordinate); header.addSequence(new SAMSequenceRecord("chr1", 123)); final SAMFileWriter writer = (binary ? factory.makeBAMWriter(header, false, outputStream) : factory.makeSAMWriter(header, false, outputStream)); fillSmallBam(writer); writer.close(); }
/**
 * Creates a temporary BAM (stored in {@code testBAM}) containing the given reads, written with
 * index creation enabled. The BAM and both candidate index file names are registered for
 * deletion on JVM exit.
 *
 * @param reads reads to write; the header of the first read is used for the file
 * @throws IOException if the temporary file cannot be created
 */
private void createBAM(final List<GATKSAMRecord> reads) throws IOException {
    testBAM = File.createTempFile("TraverseActiveRegionsUnitTest", ".bam");
    testBAM.deleteOnExit();

    final SAMFileWriterFactory indexingFactory = new SAMFileWriterFactory().setCreateIndex(true);
    final SAMFileWriter bamWriter =
            indexingFactory.makeBAMWriter(reads.get(0).getHeader(), true, testBAM);
    for (final GATKSAMRecord record : reads) {
        bamWriter.addAlignment(record);
    }
    bamWriter.close();

    // The index may be named either "<name>.bai" or "<name>.bam.bai"; schedule both for removal.
    final File indexReplacingExtension = new File(testBAM.getAbsolutePath().replace(".bam", ".bai"));
    indexReplacingExtension.deleteOnExit();
    final File indexAppendingExtension = new File(testBAM.getAbsolutePath() + ".bai");
    indexAppendingExtension.deleteOnExit();
}
protected int doWork() { IoUtil.assertFileIsReadable(INPUT); IoUtil.assertFileIsWritable(OUTPUT); final SAMFileReader in = new SAMFileReader(INPUT); // create the read group we'll be using final SAMReadGroupRecord rg = new SAMReadGroupRecord(RGID); rg.setLibrary(RGLB); rg.setPlatform(RGPL); rg.setSample(RGSM); rg.setPlatformUnit(RGPU); if (RGCN != null) rg.setSequencingCenter(RGCN); if (RGDS != null) rg.setDescription(RGDS); if (RGDT != null) rg.setRunDate(RGDT); log.info( String.format( "Created read group ID=%s PL=%s LB=%s SM=%s%n", rg.getId(), rg.getPlatform(), rg.getLibrary(), rg.getSample())); // create the new header and output file final SAMFileHeader inHeader = in.getFileHeader(); final SAMFileHeader outHeader = inHeader.clone(); outHeader.setReadGroups(Arrays.asList(rg)); if (SORT_ORDER != null) outHeader.setSortOrder(SORT_ORDER); final SAMFileWriter outWriter = new SAMFileWriterFactory() .makeSAMOrBAMWriter( outHeader, outHeader.getSortOrder() == inHeader.getSortOrder(), OUTPUT); final ProgressLogger progress = new ProgressLogger(log); for (final SAMRecord read : in) { read.setAttribute(SAMTag.RG.name(), RGID); outWriter.addAlignment(read); progress.record(read); } // cleanup in.close(); outWriter.close(); return 0; }
/** Closes every writer currently held in {@code writerMap}. */
public void close() {
    for (final SAMFileWriter openWriter : writerMap.values()) {
        openWriter.close();
    }
}
protected void PicardPreprocess( Context context, PreprocessingTools tools, SAMRecordIterator input, String output) throws InterruptedException, QualityException, IOException, URISyntaxException { outHeader = header.clone(); outHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); // tmp files String tmpOut1 = tmpFileBase + "-p1.bam"; String tmpOut2 = tmpFileBase + "-p2.bam"; String tmpOut3 = tmpFileBase + "-p3.sam"; String fCounts = tmpFileBase + "-features.count"; String tmpMetrics = tmpFileBase + "-p3-metrics.txt"; SAMFileWriterFactory factory = new SAMFileWriterFactory(); if (!inputIsBam) { outHeader.addReadGroup(bamrg); } SAMFileWriter writer = factory.makeBAMWriter(outHeader, true, new File(tmpOut1)); long startTime = System.currentTimeMillis(); int count = 0; SAMRecord sam; while (input.hasNext()) { sam = input.next(); writer.addAlignment(sam); count++; } int reads = input.getCount(); writer.close(); context.getCounter(HalvadeCounters.IN_PREP_READS).increment(reads); long estimatedTime = System.currentTimeMillis() - startTime; context.getCounter(HalvadeCounters.TIME_HADOOP_SAMTOBAM).increment(estimatedTime); Logger.DEBUG("time writing " + count + " records to disk: " + estimatedTime / 1000); Logger.DEBUG("clean sam"); context.setStatus("clean sam"); tools.runCleanSam(tmpOut1, tmpOut2); Logger.DEBUG("mark duplicates"); context.setStatus("mark duplicates"); tools.runMarkDuplicates(tmpOut2, tmpOut3, tmpMetrics); if (gff != null) { // tmpOut3 is sam for htseq count! 
Logger.DEBUG("featureCounts"); context.setStatus("featureCounts"); tools.runFeatureCounts(gff, tmpOut3, fCounts, threads); HalvadeFileUtils.uploadFileToHDFS( context, FileSystem.get(new URI(outputdir), context.getConfiguration()), fCounts, outputdir + context.getTaskAttemptID().toString() + ".count"); } if (!inputIsBam) { Logger.DEBUG("add read-group"); context.setStatus("add read-group"); tools.runAddOrReplaceReadGroups(tmpOut3, output, RGID, RGLB, RGPL, RGPU, RGSM); } else { context.setStatus("convert SAM to BAM"); Logger.DEBUG("convert SAM to BAM"); tools.callSAMToBAM(tmpOut3, output, threads); } Logger.DEBUG("build bam index"); context.setStatus("build bam index"); tools.runBuildBamIndex(output); estimatedTime = System.currentTimeMillis() - startTime; Logger.DEBUG("estimated time: " + estimatedTime / 1000); // remove all temporary files now! HalvadeFileUtils.removeLocalFile(keep, tmpMetrics, context, HalvadeCounters.FOUT_GATK_TMP); HalvadeFileUtils.removeLocalFile(keep, tmpOut1, context, HalvadeCounters.FOUT_GATK_TMP); HalvadeFileUtils.removeLocalFile(keep, tmpOut2, context, HalvadeCounters.FOUT_GATK_TMP); HalvadeFileUtils.removeLocalFile(keep, tmpOut3, context, HalvadeCounters.FOUT_GATK_TMP); HalvadeFileUtils.removeLocalFile(keep, fCounts); }
/**
 * Annotates contig alignments with per-kmer origin codes.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Build the code-to-index and code-to-name maps, applying any overrides from
 *       {@code KMER_CODE_NAMES}.</li>
 *   <li>Load the annotation table {@code ANN} (if non-empty), derive the kmer size from the
 *       first row, and print two lines per row to {@code cout}.</li>
 *   <li>Add one read group per code name to the header of {@code CONTIGS} and open a BAM writer
 *       on {@code bout} plus a table writer on {@code sout}.</li>
 *   <li>For each selected, annotated contig: emit one record per kmer to the BAM, collect an
 *       {@code IGVEntry} per kmer, print primary alignments to {@code beout}, and add a
 *       per-contig count row to the table.</li>
 *   <li>Print all collected entries to {@code out} and close the BAM writer.</li>
 * </ol>
 */
@Override
public void execute() {
    log.info("Initializing kmer code map...");

    // Numeric value reported for each single-character kmer code.
    Map<Character, Integer> kmerCodeIndices = new HashMap<Character, Integer>();
    kmerCodeIndices.put('0', 1);
    kmerCodeIndices.put('A', 3);
    kmerCodeIndices.put('B', 4);
    kmerCodeIndices.put('C', 5);
    kmerCodeIndices.put('_', 6);
    kmerCodeIndices.put('.', 7);
    kmerCodeIndices.put('1', 9);

    // Display name for each code; insertion order drives every later iteration over the codes.
    Map<Character, String> kmerCodeNames = new LinkedHashMap<Character, String>();
    kmerCodeNames.put('0', "ref0");
    kmerCodeNames.put('A', "repetitive");
    kmerCodeNames.put('B', "both");
    kmerCodeNames.put('C', "lowcoverage");
    kmerCodeNames.put('_', "lowconfidence");
    kmerCodeNames.put('.', "novel");
    kmerCodeNames.put('1', "ref1");

    // Replace default names with user-supplied ones where a matching key exists.
    if (KMER_CODE_NAMES != null) {
        for (Map.Entry<Character, String> nameEntry : kmerCodeNames.entrySet()) {
            String codeKey = String.valueOf(nameEntry.getKey());
            if (KMER_CODE_NAMES.containsKey(codeKey)) {
                nameEntry.setValue(KMER_CODE_NAMES.get(codeKey));
            }
        }
    }

    for (Map.Entry<Character, String> nameEntry : kmerCodeNames.entrySet()) {
        log.info(
                " {} {}: {}",
                nameEntry.getKey(),
                kmerCodeIndices.get(nameEntry.getKey()),
                nameEntry.getValue());
    }

    log.info("Loading annotated contigs...");
    Map<String, Map<String, String>> annotatedContigs = new HashMap<String, Map<String, String>>();
    int kmerSize = 0;

    if (ANN.length() > 0) {
        TableReader annotationTable = new TableReader(ANN);
        for (Map<String, String> row : annotationTable) {
            String contigName = row.get("contigName");

            // The kmer size is derived once, from the first row seen.
            if (kmerSize == 0) {
                kmerSize = row.get("seq").length() - row.get("kmerOrigin").length() + 1;
            }

            annotatedContigs.put(contigName, row);

            // "NA" and "*:0-0" are both normalised to "NA:0-0" before splitting into fields.
            String ref0Locus = row.get("ref0ToCanonicalExact");
            if (ref0Locus.equals("NA") || ref0Locus.equals("*:0-0")) {
                ref0Locus = "NA:0-0";
            }
            String[] ref0Fields = ref0Locus.split("[:-]");

            String ref1Locus = row.get("ref1ToCanonicalExact");
            if (ref1Locus.equals("NA") || ref1Locus.equals("*:0-0")) {
                ref1Locus = "NA:0-0";
            }
            String[] ref1Fields = ref1Locus.split("[:-]");

            String linePrefix =
                    row.get("sampleName") + "_" + row.get("accession") + "_" + contigName + " ";

            cout.println(
                    linePrefix + ref0Fields[0] + " " + ref0Fields[1] + " " + ref0Fields[2] + " radius1=0.8r");
            cout.println(
                    linePrefix + ref1Fields[0] + " " + ref1Fields[1] + " " + ref1Fields[2] + " radius2=0.6r");
        }
    }

    log.info(" contigs: {}", annotatedContigs.size());
    log.info(" kmer size: {}", kmerSize);

    log.info("Computing kmer inheritance information...");

    // One read group per code name, with the name used as both id and sample.
    SAMFileHeader sfh = CONTIGS.getFileHeader();
    for (String codeName : kmerCodeNames.values()) {
        SAMReadGroupRecord codeReadGroup = new SAMReadGroupRecord(codeName);
        codeReadGroup.setSample(codeName);
        sfh.addReadGroup(codeReadGroup);
    }

    SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
    sfwf.setCreateIndex(true);
    SAMFileWriter sfw = sfwf.makeBAMWriter(sfh, false, bout);

    TableWriter tw = new TableWriter(sout);

    Set<IGVEntry> igvEntries = new TreeSet<IGVEntry>();
    int numContigs = 0;

    for (SAMRecord contig : CONTIGS) {
        String contigReadName = contig.getReadName();

        // Skip contigs that were not requested (an absent or empty filter selects everything).
        boolean requested =
                CONTIG_NAMES == null || CONTIG_NAMES.isEmpty() || CONTIG_NAMES.contains(contigReadName);
        if (!requested) {
            continue;
        }

        // Skip contigs that have no annotation row.
        if (!annotatedContigs.containsKey(contigReadName)) {
            continue;
        }
        Map<String, String> annotation = annotatedContigs.get(contigReadName);

        String seq = contig.getReadString();

        // log.debug(" te: {}", te);

        String annSeq = annotation.get("seq");
        String kmerOrigin = annotation.get("kmerOrigin");

        // Map each kmer of the annotated sequence to the code at the same offset.
        Map<CortexKmer, Character> kmerCodes = new HashMap<CortexKmer, Character>();
        for (int offset = 0; offset < kmerOrigin.length(); offset++) {
            kmerCodes.put(
                    new CortexKmer(annSeq.substring(offset, offset + kmerSize)),
                    kmerOrigin.charAt(offset));
        }

        // Per-contig counters, one per known code, starting at zero.
        Map<Character, Integer> kmerStats = new HashMap<Character, Integer>();
        for (Character code : kmerCodeNames.keySet()) {
            kmerStats.put(code, 0);
        }

        // We want to be able to examine soft-clipped regions as well.
        boolean changed = false;
        List<CigarElement> rewrittenElements = new ArrayList<CigarElement>();
        for (CigarElement element : contig.getCigar().getCigarElements()) {
            boolean softClip = element.getOperator().equals(CigarOperator.S);
            if (softClip) {
                rewrittenElements.add(new CigarElement(element.getLength(), CigarOperator.M));
                changed = true;
            } else {
                rewrittenElements.add(element);
            }
        }

        if (changed) {
            // A leading soft clip becomes aligned bases, so the start moves left by its length.
            CigarElement leading = contig.getCigar().getCigarElements().get(0);
            if (leading.getOperator().equals(CigarOperator.S)) {
                contig.setAlignmentStart(contig.getAlignmentStart() - leading.getLength());
            }
            contig.setCigar(new Cigar(rewrittenElements));
        }

        for (AlignmentBlock ab : contig.getAlignmentBlocks()) {
            int blockEnd = ab.getReadStart() + ab.getLength();
            for (int i = ab.getReadStart() - 1; i < blockEnd; i++) {
                if (i + kmerSize >= seq.length()) {
                    continue;
                }

                String kmerBases = seq.substring(i, i + kmerSize);
                CortexKmer kmer = new CortexKmer(kmerBases);

                SAMRecord skmer = new SAMRecord(CONTIGS.getFileHeader());
                skmer.setReadBases(kmerBases.getBytes());

                List<CigarElement> kmerCigarElements = new ArrayList<CigarElement>();
                kmerCigarElements.add(new CigarElement(kmerSize, CigarOperator.M));
                Cigar kmerCigar = new Cigar(kmerCigarElements);

                skmer.setReadName(contigReadName + "." + kmer.getKmerAsString());
                skmer.setReferenceName(contig.getReferenceName());
                skmer.setCigar(kmerCigar);
                skmer.setReadPairedFlag(false);
                skmer.setDuplicateReadFlag(false);
                skmer.setMateNegativeStrandFlag(false);
                skmer.setAlignmentStart(ab.getReferenceStart() - ab.getReadStart() + 1 + i);
                // Placeholder values; both are overwritten below before the record is written.
                skmer.setAttribute("RG", "none");
                skmer.setMappingQuality(0);

                Character code = kmerCodes.get(kmer);
                String codeName = kmerCodeNames.get(code);

                // Prefer the read group named after the code; fall back to the contig's sample.
                String parentReadGroupId = null;
                String sampleReadGroupId = null;
                for (SAMReadGroupRecord candidate : sfh.getReadGroups()) {
                    if (candidate.getSample().equals(codeName)) {
                        parentReadGroupId = candidate.getReadGroupId();
                    }
                    if (candidate.getSample().equals(contig.getReadGroup().getSample())) {
                        sampleReadGroupId = candidate.getReadGroupId();
                    }
                }

                String chosenReadGroupId = parentReadGroupId;
                if (chosenReadGroupId == null) {
                    chosenReadGroupId = sampleReadGroupId;
                }
                skmer.setAttribute("RG", chosenReadGroupId);
                skmer.setMappingQuality(99);

                sfw.addAlignment(skmer);

                kmerStats.put(code, kmerStats.get(code) + 1);

                IGVEntry igvEntry = new IGVEntry();
                igvEntry.chromosome = contig.getReferenceName();
                igvEntry.start = ab.getReferenceStart() - ab.getReadStart() + i;
                igvEntry.parentageName = kmerCodeNames.get(code);
                igvEntry.parentage = kmerCodeIndices.get(code);
                igvEntries.add(igvEntry);
            }
        }

        // Only primary alignments are listed in the interval output and counted for progress.
        if (!contig.isSecondaryOrSupplementary()) {
            beout.println(
                    contig.getReferenceName()
                            + "\t"
                            + contig.getAlignmentStart()
                            + "\t"
                            + contig.getAlignmentEnd()
                            + "\t"
                            + contigReadName
                            + "."
                            + contig.getReadGroup().getSample());

            // Log roughly every tenth of the annotated contigs.
            if (annotatedContigs.size() > 10 && numContigs % (annotatedContigs.size() / 10) == 0) {
                log.info(" processed {}/{} contigs", numContigs, annotatedContigs.size());
            }

            numContigs++;
        }

        // One table row per processed alignment: names first, then one count column per code.
        Map<String, String> stats = new LinkedHashMap<String, String>();
        stats.put("contigName", contigReadName);
        stats.put("sampleName", contig.getReadGroup().getSample());
        for (Map.Entry<Character, String> nameEntry : kmerCodeNames.entrySet()) {
            stats.put(nameEntry.getValue(), String.valueOf(kmerStats.get(nameEntry.getKey())));
        }
        tw.addEntry(stats);
    }

    log.info("Writing kmer inheritance information...");
    out.printf("%s\t%s\t%s\t%s\t%s\n", "Chromosome", "Start", "End", "Feature", "Parentage");
    for (IGVEntry entry : igvEntries) {
        out.printf(
                "%s\t%d\t%d\t%s\t%d\n",
                entry.chromosome, entry.start, entry.start + 1, entry.parentageName, entry.parentage);
    }

    sfw.close();
}