private void createSmallBam(final File outputFile) { final SAMFileWriterFactory factory = new SAMFileWriterFactory(); factory.setCreateIndex(true); factory.setCreateMd5File(true); final SAMFileHeader header = new SAMFileHeader(); // index only created if coordinate sorted header.setSortOrder(SAMFileHeader.SortOrder.coordinate); header.addSequence(new SAMSequenceRecord("chr1", 123)); final SAMFileWriter writer = factory.makeBAMWriter(header, false, outputFile); fillSmallBam(writer); writer.close(); }
private void createSmallBamToOutputStream(final OutputStream outputStream, boolean binary) { final SAMFileWriterFactory factory = new SAMFileWriterFactory(); factory.setCreateIndex(false); factory.setCreateMd5File(false); final SAMFileHeader header = new SAMFileHeader(); // index only created if coordinate sorted header.setSortOrder(SAMFileHeader.SortOrder.coordinate); header.addSequence(new SAMSequenceRecord("chr1", 123)); final SAMFileWriter writer = (binary ? factory.makeBAMWriter(header, false, outputStream) : factory.makeSAMWriter(header, false, outputStream)); fillSmallBam(writer); writer.close(); }
/**
 * Builds an artificial SAM file reader around a caller-supplied header and a fixed set of reads.
 * The underlying stream handed to the superclass is an empty one.
 *
 * @param customHeader header that calls to getFileHeader() on this reader should return
 * @param reads records used as the backing data source
 */
public ArtificialSAMFileReader(SAMFileHeader customHeader, SAMRecord... reads) {
  super(createEmptyInputStream(), true);
  this.reads = Arrays.asList(reads);
  this.customHeader = customHeader;
  // The locus parser is derived from the sequence dictionary of the supplied header.
  this.genomeLocParser = new GenomeLocParser(customHeader.getSequenceDictionary());
}
/**
 * Looks up, in the given header, the reference index of the contig the read's mate is on.
 *
 * @param read read whose mate's reference index is wanted
 * @param header SAM header defining contig indices
 * @return {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX} when the mate is unmapped, otherwise
 *     whatever {@code header.getSequenceIndex} reports for the mate's contig
 */
public static int getMateReferenceIndex(final GATKRead read, final SAMFileHeader header) {
  return read.mateIsUnmapped()
      ? SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
      : header.getSequenceIndex(read.getMateContig());
}
/**
 * Returns the names of the sequences listed in the file header's sequence dictionary, in
 * dictionary order. The list is built on the first call and cached in {@code sequenceNames}.
 *
 * @return the cached list of sequence names (possibly empty), or {@code null} when no file header
 *     is available
 */
public List<String> getSequenceNames() {
  if (sequenceNames == null) {
    final SAMFileHeader header = getFileHeader();
    if (header == null) {
      return null;
    }
    // Fetch the dictionary entries once and reuse them for sizing and iteration.
    final List<SAMSequenceRecord> records = header.getSequenceDictionary().getSequences();
    sequenceNames = new ArrayList<String>(records.size());
    for (final SAMSequenceRecord rec : records) {
      sequenceNames.add(rec.getSequenceName());
    }
  }
  return sequenceNames;
}
/** * Check to ensure that the alignment makes sense based on the contents of the header. * * @param header The SAM file header. * @param read The read to verify. * @return true if alignment agrees with header, false otherwise. */ public static boolean alignmentAgreesWithHeader(final SAMFileHeader header, final GATKRead read) { final int referenceIndex = getReferenceIndex(read, header); // Read is aligned to nonexistent contig if (!read.isUnmapped() && referenceIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { return false; } final SAMSequenceRecord contigHeader = header.getSequence(referenceIndex); // Read is aligned to a point after the end of the contig return read.isUnmapped() || read.getStart() <= contigHeader.getSequenceLength(); }
/**
 * One-time fixture setup: opens the reference, builds a coordinate-sorted header carrying the
 * reference dictionary and a "test" read group, then creates {@code numReadsPerContig} records
 * for every entry of {@code contigs} and hands them to {@code createBAM}.
 *
 * @throws IOException declared for the fixture setup (e.g. opening the reference)
 */
@BeforeClass
private void init() throws IOException {
  reference = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
  dictionary = reference.getSequenceDictionary();

  header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test");
  header.setSequenceDictionary(dictionary);
  header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
  readGroup = new GATKSAMReadGroupRecord(header.getReadGroup("test"));

  genomeLocParser = new GenomeLocParser(dictionary);

  final List<GATKSAMRecord> artificialReads = new ArrayList<>();
  for (final String contig : contigs) {
    // Read numbers are 1-based and double as the third argument to buildSAMRecord.
    for (int readNumber = 1; readNumber <= numReadsPerContig; readNumber++) {
      final String readName = "read" + contig + "_" + readNumber;
      artificialReads.add(buildSAMRecord(readName, contig, readNumber));
    }
  }
  createBAM(artificialReads);
}
/**
 * Configures a freshly created SAM or BAM writer (sort order, optional in-RAM record limit,
 * header) and wraps it in an {@link AsyncSAMFileWriter} when asynchronous I/O is enabled.
 *
 * @param header entire header; its sortOrder property determines the writer's sort order
 * @param presorted if true, SAMRecords must be added to the SAMFileWriter in an order that agrees
 *     with header.sortOrder
 * @param binary whether a BAM (rather than a SAM) is being generated; not read by this method
 * @param writer SAM or BAM writer to initialize and maybe wrap
 * @return {@code writer} itself, or an asynchronous wrapper around it
 */
private SAMFileWriter initWriter(
    final SAMFileHeader header,
    final boolean presorted,
    final boolean binary,
    final SAMFileWriterImpl writer) {
  writer.setSortOrder(header.getSortOrder(), presorted);
  if (maxRecordsInRam != null) {
    writer.setMaxRecordsInRam(maxRecordsInRam);
  }
  writer.setHeader(header);

  if (!this.useAsyncIo) {
    return writer;
  }
  return new AsyncSAMFileWriter(writer, this.asyncOutputBufferSize);
}
/**
 * Applies sort order, the optional in-RAM record limit and the header to a BAM writer, then
 * enables index construction when requested and the writer is coordinate sorted.
 *
 * @param writer BAM writer to configure
 * @param header header to install; also supplies the sort order
 * @param presorted whether records will be added already in the header's sort order
 * @param createIndex whether an index should be built alongside the BAM
 */
private void initializeBAMWriter(
    final BAMFileWriter writer,
    final SAMFileHeader header,
    final boolean presorted,
    final boolean createIndex) {
  writer.setSortOrder(header.getSortOrder(), presorted);
  if (maxRecordsInRam != null) {
    writer.setMaxRecordsInRam(maxRecordsInRam);
  }
  writer.setHeader(header);

  if (!createIndex) {
    return;
  }
  // Index construction is only enabled for coordinate-sorted output.
  if (writer.getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
    writer.enableBamIndexConstruction();
  }
}
/**
 * Merges two headers whose sequence dictionaries overlap and checks that every input sequence is
 * present in the merged header and that each input's sequences keep their relative order.
 */
@Test
public void testSequenceDictionaryMerge() {
  final String sd1 = sq1 + sq2 + sq5;
  final String sd2 = sq2 + sq3 + sq4;
  final SAMFileReader reader1 =
      new SAMFileReader(new ByteArrayInputStream(StringUtil.stringToBytes(sd1)));
  final SAMFileReader reader2 =
      new SAMFileReader(new ByteArrayInputStream(StringUtil.stringToBytes(sd2)));
  final List<SAMFileHeader> inputHeaders =
      Arrays.asList(reader1.getFileHeader(), reader2.getFileHeader());
  final SamFileHeaderMerger merger =
      new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, inputHeaders, true);
  final SAMFileHeader mergedHeader = merger.getMergedHeader();

  for (final SAMFileHeader inputHeader : inputHeaders) {
    int prevTargetIndex = -1;
    for (final SAMSequenceRecord sequenceRecord :
        inputHeader.getSequenceDictionary().getSequences()) {
      final int targetIndex = mergedHeader.getSequenceIndex(sequenceRecord.getSequenceName());
      // Compare by value: assertNotSame would box both ints and compare object identity,
      // which only behaves as intended because of the small-value Integer cache.
      Assert.assertTrue(targetIndex != -1);
      // Indices in the merged dictionary must be strictly increasing for each input.
      Assert.assertTrue(prevTargetIndex < targetIndex);
      prevTargetIndex = targetIndex;
    }
  }
}
/**
 * Creates a SAMFileWriter for GATK tools from a pre-configured factory. If the factory has been
 * set to create an index, the header is assumed to be coordinate sorted. The writer is given a
 * clone of {@code header}.
 *
 * @param factory SAMFileWriterFactory used to build the writer
 * @param outputFile destination file; must not be null. A .cram extension makes a reference
 *     mandatory.
 * @param referenceFile reference source; must not be null when the output file has a .cram
 *     extension
 * @param header header to be used for the output writer; must not be null
 * @param preSorted if true, records must already be sorted to match the header sort order
 * @return the writer produced by the factory
 */
public static SAMFileWriter createCommonSAMWriterFromFactory(
    final SAMFileWriterFactory factory,
    final File outputFile,
    final File referenceFile,
    final SAMFileHeader header,
    final boolean preSorted) {
  Utils.nonNull(outputFile);
  Utils.nonNull(header);

  if (referenceFile == null) {
    final boolean cramRequested = outputFile.getName().endsWith(CramIO.CRAM_FILE_EXTENSION);
    if (cramRequested) {
      throw new UserException("A reference file is required for writing CRAM files");
    }
  }

  final SAMFileHeader headerCopy = header.clone();
  return factory.makeWriter(headerCopy, preSorted, outputFile, referenceFile);
}
protected int doWork() { IoUtil.assertFileIsReadable(INPUT); IoUtil.assertFileIsWritable(OUTPUT); final SAMFileReader in = new SAMFileReader(INPUT); // create the read group we'll be using final SAMReadGroupRecord rg = new SAMReadGroupRecord(RGID); rg.setLibrary(RGLB); rg.setPlatform(RGPL); rg.setSample(RGSM); rg.setPlatformUnit(RGPU); if (RGCN != null) rg.setSequencingCenter(RGCN); if (RGDS != null) rg.setDescription(RGDS); if (RGDT != null) rg.setRunDate(RGDT); log.info( String.format( "Created read group ID=%s PL=%s LB=%s SM=%s%n", rg.getId(), rg.getPlatform(), rg.getLibrary(), rg.getSample())); // create the new header and output file final SAMFileHeader inHeader = in.getFileHeader(); final SAMFileHeader outHeader = inHeader.clone(); outHeader.setReadGroups(Arrays.asList(rg)); if (SORT_ORDER != null) outHeader.setSortOrder(SORT_ORDER); final SAMFileWriter outWriter = new SAMFileWriterFactory() .makeSAMOrBAMWriter( outHeader, outHeader.getSortOrder() == inHeader.getSortOrder(), OUTPUT); final ProgressLogger progress = new ProgressLogger(log); for (final SAMRecord read : in) { read.setAttribute(SAMTag.RG.name(), RGID); outWriter.addAlignment(read); progress.record(read); } // cleanup in.close(); outWriter.close(); return 0; }
/**
 * Create a SAMTextWriter that is ready to receive SAMRecords. When MD5 creation is enabled the
 * output is routed through an {@link Md5CalculatingOutputStream} writing to
 * {@code <outputFile>.md5}; when asynchronous I/O is enabled the writer is wrapped in an
 * {@link AsyncSAMFileWriter}.
 *
 * @param header entire header. Sort order is determined by the sortOrder property of this arg.
 * @param presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees
 *     with header.sortOrder.
 * @param outputFile where to write the output.
 * @return the configured writer, possibly wrapped for asynchronous output
 * @throws RuntimeIOException if the output file cannot be opened; the underlying
 *     {@link IOException} is attached as the cause
 */
public SAMFileWriter makeSAMWriter(
    final SAMFileHeader header, final boolean presorted, final File outputFile) {
  try {
    final SAMTextWriter ret =
        this.createMd5File
            ? new SAMTextWriter(
                new Md5CalculatingOutputStream(
                    new FileOutputStream(outputFile, false),
                    new File(outputFile.getAbsolutePath() + ".md5")))
            : new SAMTextWriter(outputFile);
    ret.setSortOrder(header.getSortOrder(), presorted);
    if (maxRecordsInRam != null) {
      ret.setMaxRecordsInRam(maxRecordsInRam);
    }
    ret.setHeader(header);
    if (this.useAsyncIo) {
      return new AsyncSAMFileWriter(ret, this.asyncOutputBufferSize);
    }
    return ret;
  } catch (final IOException ioe) {
    // Keep the original exception as the cause so the real reason for the failure is not lost.
    throw new RuntimeIOException("Error opening file: " + outputFile.getAbsolutePath(), ioe);
  }
}
/**
 * Creates a SAMFileWriter for GATK tools, building the writer factory from the index and MD5
 * flags. An index is only requested from the factory when the header is coordinate sorted;
 * otherwise a warning is logged and index creation is skipped.
 *
 * @param outputFile destination file; must not be null. A .cram extension makes a reference
 *     mandatory.
 * @param referenceFile reference source; must not be null when the output file has a .cram
 *     extension
 * @param header header to be used for the output writer; must not be null
 * @param preSorted if true, the records must already be sorted to match the header sort order
 * @param createOutputBamIndex if true, an index will be created for .BAM and .CRAM files
 * @param createMD5 if true, an MD5 file will be created
 * @return SAMFileWriter
 */
public static SAMFileWriter createCommonSAMWriter(
    final File outputFile,
    final File referenceFile,
    final SAMFileHeader header,
    final boolean preSorted,
    boolean createOutputBamIndex,
    final boolean createMD5) {
  Utils.nonNull(outputFile);
  Utils.nonNull(header);

  // Indexing is honoured only for coordinate-sorted output.
  final boolean indexWillBeCreated =
      createOutputBamIndex && header.getSortOrder() == SAMFileHeader.SortOrder.coordinate;
  if (createOutputBamIndex && !indexWillBeCreated) {
    logger.warn(
        "Skipping index file creation for: "
            + outputFile.getAbsolutePath()
            + ". Index file creation requires reads in coordinate sorted order.");
  }

  final SAMFileWriterFactory factory =
      new SAMFileWriterFactory().setCreateIndex(indexWillBeCreated).setCreateMd5File(createMD5);
  return ReadUtils.createCommonSAMWriterFromFactory(
      factory, outputFile, referenceFile, header, preSorted);
}
/**
 * Chains three program records onto one header and checks after each step that the record count
 * grows and that every record's previous-program id points at the record chained before it.
 */
@Test
public void testChainProgramRecord() {
  final SAMFileHeader samHeader = new SAMFileHeader();

  // First record: nothing precedes it, so it has no previous program group.
  final SAMProgramRecord firstRecord = samHeader.createProgramRecord();
  SAMUtils.chainSAMProgramRecord(samHeader, firstRecord);
  Assert.assertEquals(samHeader.getProgramRecords().size(), 1);
  Assert.assertNull(firstRecord.getPreviousProgramGroupId());

  // Second record, also created through the header: chained to the first.
  final SAMProgramRecord secondRecord = samHeader.createProgramRecord();
  SAMUtils.chainSAMProgramRecord(samHeader, secondRecord);
  Assert.assertEquals(samHeader.getProgramRecords().size(), 2);
  Assert.assertNull(firstRecord.getPreviousProgramGroupId());
  Assert.assertEquals(
      secondRecord.getPreviousProgramGroupId(), firstRecord.getProgramGroupId());

  // Third record is built standalone, chained, and only then added to the header.
  final SAMProgramRecord thirdRecord = new SAMProgramRecord("3");
  SAMUtils.chainSAMProgramRecord(samHeader, thirdRecord);
  samHeader.addProgramRecord(thirdRecord);
  Assert.assertEquals(samHeader.getProgramRecords().size(), 3);
  Assert.assertNull(firstRecord.getPreviousProgramGroupId());
  Assert.assertEquals(
      secondRecord.getPreviousProgramGroupId(), firstRecord.getProgramGroupId());
  Assert.assertEquals(
      thirdRecord.getPreviousProgramGroupId(), secondRecord.getProgramGroupId());
}
/**
 * Creates the artificial header shared by the tests and a locus parser over its sequence
 * dictionary.
 */
@BeforeClass
public void beforeClass() {
  final SAMFileHeader artificialHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
  header = artificialHeader;
  genomeLocParser = new GenomeLocParser(artificialHeader.getSequenceDictionary());
}
/** * Prepare to index a BAM. * * @param output Index will be written here. output will be closed when finish() method is called. * @param fileHeader header for the corresponding bam file. */ public BAMIndexer(final OutputStream output, final SAMFileHeader fileHeader) { numReferences = fileHeader.getSequenceDictionary().size(); indexBuilder = new BAMIndexBuilder(fileHeader.getSequenceDictionary()); outputWriter = new BinaryBAMIndexWriter(numReferences, output); }
/**
 * HACK: null-tolerant way of copying a header. Really, SAMFileHeader should provide a copy
 * constructor or a factory method.
 *
 * @param header header to copy; may be null
 * @return the result of {@code header.clone()}, or null when {@code header} is null
 */
public static SAMFileHeader cloneSAMFileHeader(final SAMFileHeader header) {
  return header == null ? null : header.clone();
}
/**
 * Resolves the read's read group name against the header.
 *
 * @param read read whose read group to retrieve
 * @param header SAM header containing read groups
 * @return the header's {@link SAMReadGroupRecord} for the read's read group name, or null if the
 *     read has no read group
 */
public static SAMReadGroupRecord getSAMReadGroupRecord(
    final GATKRead read, final SAMFileHeader header) {
  final String groupName = read.getReadGroup();
  if (groupName == null) {
    return null;
  }
  return header.getReadGroup(groupName);
}
/**
 * Breaks each annotated contig alignment in CONTIGS into per-kmer records and reports where each
 * kmer came from.
 *
 * <p>Outputs produced by this method:
 *
 * <ul>
 *   <li>{@code cout}: two lines per annotation row, built from the row's ref0/ref1 canonical
 *       coordinates;
 *   <li>{@code bout}: a BAM with one record per kmer, assigned to a read group named after the
 *       kmer's code;
 *   <li>{@code beout}: one tab-separated interval line per contig alignment that is not
 *       secondary/supplementary;
 *   <li>{@code sout}: one table row per processed contig with per-code kmer counts;
 *   <li>{@code out}: a tab-separated table with one row per collected {@code IGVEntry}.
 * </ul>
 */
@Override
public void execute() {
  log.info("Initializing kmer code map...");

  // Numeric value reported in the "Parentage" column for each kmer code character.
  Map<Character, Integer> kmerCodeIndices = new HashMap<Character, Integer>();
  kmerCodeIndices.put('0', 1);
  kmerCodeIndices.put('A', 3);
  kmerCodeIndices.put('B', 4);
  kmerCodeIndices.put('C', 5);
  kmerCodeIndices.put('_', 6);
  kmerCodeIndices.put('.', 7);
  kmerCodeIndices.put('1', 9);

  // Default display name for each code; insertion order is kept and later drives the order of
  // read groups and of the per-code columns in the stats table.
  Map<Character, String> kmerCodeNames = new LinkedHashMap<Character, String>();
  kmerCodeNames.put('0', "ref0");
  kmerCodeNames.put('A', "repetitive");
  kmerCodeNames.put('B', "both");
  kmerCodeNames.put('C', "lowcoverage");
  kmerCodeNames.put('_', "lowconfidence");
  kmerCodeNames.put('.', "novel");
  kmerCodeNames.put('1', "ref1");

  // Optional overrides: KMER_CODE_NAMES is keyed by the code as a one-character string.
  if (KMER_CODE_NAMES != null) {
    for (Character c : kmerCodeNames.keySet()) {
      String cStr = String.valueOf(c);

      if (KMER_CODE_NAMES.containsKey(cStr)) {
        kmerCodeNames.put(c, KMER_CODE_NAMES.get(cStr));
      }
    }
  }

  for (Character c : kmerCodeNames.keySet()) {
    log.info(" {} {}: {}", c, kmerCodeIndices.get(c), kmerCodeNames.get(c));
  }

  log.info("Loading annotated contigs...");

  // Annotation rows keyed by contig name; a later row with the same name replaces an earlier one.
  Map<String, Map<String, String>> annotatedContigs = new HashMap<String, Map<String, String>>();
  int kmerSize = 0;

  if (ANN.length() > 0) {
    TableReader tr = new TableReader(ANN);
    for (Map<String, String> te : tr) {
      String contigName = te.get("contigName");

      // The kmer size is inferred once, from the first row: a sequence of length L carries
      // L - k + 1 kmer codes, so k = L - (number of codes) + 1.
      if (kmerSize == 0) {
        kmerSize = te.get("seq").length() - te.get("kmerOrigin").length() + 1;
      }

      annotatedContigs.put(contigName, te);

      // Values look like "name:start-end"; "NA" and "*:0-0" are normalised to "NA:0-0" so that
      // the split always yields the three parts indexed below.
      String[] ref0ToCanonicalExact =
          (te.get("ref0ToCanonicalExact").equals("NA")
                      || te.get("ref0ToCanonicalExact").equals("*:0-0")
                  ? "NA:0-0"
                  : te.get("ref0ToCanonicalExact"))
              .split("[:-]");
      String[] ref1ToCanonicalExact =
          (te.get("ref1ToCanonicalExact").equals("NA")
                      || te.get("ref1ToCanonicalExact").equals("*:0-0")
                  ? "NA:0-0"
                  : te.get("ref1ToCanonicalExact"))
              .split("[:-]");

      cout.println(
          te.get("sampleName")
              + "_"
              + te.get("accession")
              + "_"
              + contigName
              + " "
              + ref0ToCanonicalExact[0]
              + " "
              + ref0ToCanonicalExact[1]
              + " "
              + ref0ToCanonicalExact[2]
              + " radius1=0.8r");
      cout.println(
          te.get("sampleName")
              + "_"
              + te.get("accession")
              + "_"
              + contigName
              + " "
              + ref1ToCanonicalExact[0]
              + " "
              + ref1ToCanonicalExact[1]
              + " "
              + ref1ToCanonicalExact[2]
              + " radius2=0.6r");
    }
  }

  log.info(" contigs: {}", annotatedContigs.size());
  log.info(" kmer size: {}", kmerSize);

  log.info("Computing kmer inheritance information...");

  // One read group per kmer code name is added to the header obtained from CONTIGS.
  // NOTE(review): if getFileHeader() returns the reader's own header object rather than a copy,
  // this modifies the input reader's header in place — confirm that is intended.
  SAMFileHeader sfh = CONTIGS.getFileHeader();
  for (Character c : kmerCodeNames.keySet()) {
    SAMReadGroupRecord rgr = new SAMReadGroupRecord(kmerCodeNames.get(c));
    rgr.setSample(kmerCodeNames.get(c));
    sfh.addReadGroup(rgr);
  }

  // presorted=false: kmer records are added in contig iteration order, not coordinate order.
  SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
  sfwf.setCreateIndex(true);
  SAMFileWriter sfw = sfwf.makeBAMWriter(sfh, false, bout);

  // NOTE(review): tw is not closed or flushed in this method — verify TableWriter needs neither.
  TableWriter tw = new TableWriter(sout);

  // TreeSet: entries follow IGVEntry's ordering, and entries that compare equal are collapsed.
  Set<IGVEntry> igvEntries = new TreeSet<IGVEntry>();
  int numContigs = 0;
  for (SAMRecord contig : CONTIGS) {
    // An absent or empty CONTIG_NAMES means "process every contig".
    if (CONTIG_NAMES == null
        || CONTIG_NAMES.isEmpty()
        || CONTIG_NAMES.contains(contig.getReadName())) {
      Map<String, String> te = annotatedContigs.get(contig.getReadName());

      // Contigs without an annotation row are skipped entirely.
      if (annotatedContigs.containsKey(contig.getReadName())) {
        String seq = contig.getReadString();

        // log.debug(" te: {}", te);

        String annSeq = te.get("seq");
        String kmerOrigin = te.get("kmerOrigin");

        // kmerOrigin holds one code character per kmer start position in the annotated sequence;
        // a kmer seen more than once keeps the code of its last occurrence.
        Map<CortexKmer, Character> kmerCodes = new HashMap<CortexKmer, Character>();
        for (int i = 0; i < kmerOrigin.length(); i++) {
          CortexKmer kmer = new CortexKmer(annSeq.substring(i, i + kmerSize));
          Character code = kmerOrigin.charAt(i);

          kmerCodes.put(kmer, code);
        }

        // Per-code kmer counters for this contig, all starting at zero.
        Map<Character, Integer> kmerStats = new HashMap<Character, Integer>();
        for (Character c : kmerCodeNames.keySet()) {
          kmerStats.put(c, 0);
        }

        boolean changed = false;

        // We want to be able to examine soft-clipped regions as well.
        // To do so, every soft clip (S) is rewritten as a match (M) of the same length.
        List<CigarElement> ces = new ArrayList<CigarElement>();
        for (CigarElement ce : contig.getCigar().getCigarElements()) {
          if (ce.getOperator().equals(CigarOperator.S)) {
            ces.add(new CigarElement(ce.getLength(), CigarOperator.M));
            changed = true;
          } else {
            ces.add(ce);
          }
        }

        if (changed) {
          // A leading soft clip that becomes a match moves the alignment start left by its length.
          // The contig record itself is modified, so later reads of its start/end see the change.
          CigarElement firstCe = contig.getCigar().getCigarElements().get(0);

          if (firstCe.getOperator().equals(CigarOperator.S)) {
            contig.setAlignmentStart(contig.getAlignmentStart() - firstCe.getLength());
          }

          contig.setCigar(new Cigar(ces));
        }

        for (AlignmentBlock ab : contig.getAlignmentBlocks()) {
          // i is a 0-based offset into seq. The upper bound is one more than the block's last
          // 0-based offset, and the guard below uses a strict '<'.
          // NOTE(review): the strict '<' skips the kmer that ends exactly at the last base of
          // seq — confirm whether '<=' was intended.
          for (int i = ab.getReadStart() - 1; i < ab.getReadStart() + ab.getLength(); i++) {
            if (i + kmerSize < seq.length()) {
              CortexKmer kmer = new CortexKmer(seq.substring(i, i + kmerSize));

              // Build a synthetic record for this kmer: kmerSize matched bases placed at the
              // reference position corresponding to read offset i within the block.
              SAMRecord skmer = new SAMRecord(CONTIGS.getFileHeader());
              // NOTE(review): getBytes() uses the platform default charset.
              skmer.setReadBases(seq.substring(i, i + kmerSize).getBytes());

              List<CigarElement> cigarElements = new ArrayList<CigarElement>();
              cigarElements.add(new CigarElement(kmerSize, CigarOperator.M));
              Cigar cigar = new Cigar(cigarElements);

              skmer.setReadName(contig.getReadName() + "." + kmer.getKmerAsString());
              skmer.setReferenceName(contig.getReferenceName());
              skmer.setCigar(cigar);
              skmer.setReadPairedFlag(false);
              skmer.setDuplicateReadFlag(false);
              skmer.setMateNegativeStrandFlag(false);
              skmer.setAlignmentStart(ab.getReferenceStart() - ab.getReadStart() + 1 + i);
              // Placeholder values; both are overwritten further down before the record is
              // written.
              skmer.setAttribute("RG", "none");
              skmer.setMappingQuality(0);

              // NOTE(review): c is null when this kmer is absent from the annotation's kmer
              // table; in that case codeName is null and the kmerStats update below would throw
              // a NullPointerException on unboxing — confirm this cannot happen for valid input.
              Character c = kmerCodes.get(kmer);
              String codeName = kmerCodeNames.get(c);

              // Prefer the read group whose sample equals the code name; otherwise use the read
              // group whose sample equals the contig's own sample.
              String parentReadGroupId = null;
              String sampleReadGroupId = null;
              for (SAMReadGroupRecord rgr : sfh.getReadGroups()) {
                if (rgr.getSample().equals(codeName)) {
                  parentReadGroupId = rgr.getReadGroupId();
                }

                if (rgr.getSample().equals(contig.getReadGroup().getSample())) {
                  sampleReadGroupId = rgr.getReadGroupId();
                }
              }

              skmer.setAttribute(
                  "RG", parentReadGroupId != null ? parentReadGroupId : sampleReadGroupId);
              skmer.setMappingQuality(99);

              sfw.addAlignment(skmer);

              kmerStats.put(c, kmerStats.get(c) + 1);

              // The entry's start is one less than the record's alignment start set above; the
              // end column is emitted later as start + 1.
              IGVEntry igvEntry = new IGVEntry();
              igvEntry.chromosome = contig.getReferenceName();
              igvEntry.start = ab.getReferenceStart() - ab.getReadStart() + i;
              igvEntry.parentageName = kmerCodeNames.get(c);
              igvEntry.parentage = kmerCodeIndices.get(c);
              igvEntries.add(igvEntry);
            }
          }
        }

        // Interval output and progress logging only count alignments that are not
        // secondary/supplementary.
        if (!contig.isSecondaryOrSupplementary()) {
          beout.println(
              contig.getReferenceName()
                  + "\t"
                  + contig.getAlignmentStart()
                  + "\t"
                  + contig.getAlignmentEnd()
                  + "\t"
                  + contig.getReadName()
                  + "."
                  + contig.getReadGroup().getSample());

          // Log roughly every tenth of the annotated contigs; the size() > 10 check also keeps
          // the modulus divisor from being zero.
          if (annotatedContigs.size() > 10 && numContigs % (annotatedContigs.size() / 10) == 0) {
            log.info(" processed {}/{} contigs", numContigs, annotatedContigs.size());
          }
          numContigs++;
        }

        // One stats row per processed alignment: contig name, sample name, then one count column
        // per code name in kmerCodeNames order.
        Map<String, String> stats = new LinkedHashMap<String, String>();
        stats.put("contigName", contig.getReadName());
        stats.put("sampleName", contig.getReadGroup().getSample());
        for (Character c : kmerCodeNames.keySet()) {
          stats.put(kmerCodeNames.get(c), String.valueOf(kmerStats.get(c)));
        }
        tw.addEntry(stats);
      }
    }
  }

  log.info("Writing kmer inheritance information...");

  // Header row followed by one single-base interval (start, start + 1) per collected entry.
  out.printf("%s\t%s\t%s\t%s\t%s\n", "Chromosome", "Start", "End", "Feature", "Parentage");
  for (IGVEntry igvEntry : igvEntries) {
    out.printf(
        "%s\t%d\t%d\t%s\t%d\n",
        igvEntry.chromosome,
        igvEntry.start,
        igvEntry.start + 1,
        igvEntry.parentageName,
        igvEntry.parentage);
  }

  sfw.close();
}