@BeforeTest
void setupBuilder() throws IOException {
  tempSamFileChrM_O = File.createTempFile("CollectGcBias", ".bam", TEST_DIR);
  tempSamFileAllChr = File.createTempFile("CollectGcBias", ".bam", TEST_DIR);
  tempSamFileChrM_O.deleteOnExit();
  tempSamFileAllChr.deleteOnExit();

  final File tempSamFileUnsorted = File.createTempFile("CollectGcBias", ".bam", TEST_DIR);
  tempSamFileUnsorted.deleteOnExit();

  final SAMFileHeader header = new SAMFileHeader();
  try {
    header.setSequenceDictionary(SAMSequenceDictionaryExtractor.extractDictionary(dict));
    header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
  } catch (final SAMException e) {
    e.printStackTrace();
  }

  // Build different levels to put into the same bam file for testing multi-level collection.
  setupTest1(1, readGroupId1, readGroupRecord1, sample1, library1, header, setBuilder1); // Sample 1, Library 1, RG 1
  setupTest1(2, readGroupId2, readGroupRecord2, sample1, library2, header, setBuilder2); // Sample 1, Library 2, RG 2
  setupTest1(3, readGroupId3, readGroupRecord3, sample2, library3, header, setBuilder3); // Sample 2, Library 3, RG 3

  // Build one last read group for checking that the window count stays the same
  // whether or not all contigs are used.
  setupTest2(1, readGroupId1, readGroupRecord1, sample1, library1, header, setBuilder4);

  final List<SAMRecordSetBuilder> test1Builders = new ArrayList<SAMRecordSetBuilder>();
  test1Builders.add(setBuilder1);
  test1Builders.add(setBuilder2);
  test1Builders.add(setBuilder3);

  final List<SAMRecordSetBuilder> test2Builders = new ArrayList<SAMRecordSetBuilder>();
  test2Builders.add(setBuilder4);

  tempSamFileChrM_O = build(test1Builders, tempSamFileUnsorted, header);
  tempSamFileAllChr = build(test2Builders, tempSamFileUnsorted, header);
}
@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
  // This method should only be called once (see the Hadoop API). However,
  // there seems to be disagreement between implementations that call
  // initialize() and Hadoop-BAM's own code that relies on
  // {@link BAMInputFormat} to call initialize() when the reader is
  // created. Therefore we add this check for the time being.
  if (isInitialized) close();
  isInitialized = true;

  final Configuration conf = ctx.getConfiguration();
  final FileVirtualSplit split = (FileVirtualSplit) spl;
  final Path file = split.getPath();
  final FileSystem fs = file.getFileSystem(conf);

  this.stringency = SAMHeaderReader.getValidationStringency(conf);

  final FSDataInputStream in = fs.open(file);
  final SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(in, conf);
  codec = new BAMRecordCodec(header);

  in.seek(0);
  bci = new BlockCompressedInputStream(
      new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file));

  virtualStart = split.getStartVirtualOffset();
  fileStart = virtualStart >>> 16;
  virtualEnd = split.getEndVirtualOffset();

  bci.seek(virtualStart);
  codec.setInputStream(bci);

  if (BAMInputFormat.DEBUG_BAM_SPLITTER) {
    final long recordStart = virtualStart & 0xffff;
    System.err.println(
        "XXX initialized BAMRecordReader byte offset: " + fileStart
            + " record offset: " + recordStart);
  }

  keepReadPairsTogether =
      SortOrder.queryname.equals(header.getSortOrder())
          && conf.getBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, false);
  readPair = false;
  lastOfPair = false;

  intervals = BAMInputFormat.getIntervals(conf);
  if (intervals != null) {
    overlapDetector = new OverlapDetector<>(0, 0);
    overlapDetector.addAll(intervals, intervals);
  }
}
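// Hedged aside on the bit twiddling above: a BGZF "virtual" offset packs the file
// offset of a compressed block into the upper 48 bits and the offset within the
// uncompressed block into the lower 16 bits, which is exactly what the >>> 16 and
// & 0xffff extract. A tiny illustrative sketch (helper names are hypothetical):
static long blockFileOffset(final long virtualOffset) {
  return virtualOffset >>> 16; // byte offset of the compressed BGZF block in the file
}

static int withinBlockOffset(final long virtualOffset) {
  return (int) (virtualOffset & 0xffff); // offset inside the uncompressed block data
}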
@Override
protected void setup(final SAMFileHeader header, final File samFile) {
  final String outext = (null != FILE_EXTENSION) ? FILE_EXTENSION : ""; // Add a file extension if desired

  preAdapterSummaryOut = new File(OUTPUT + SequencingArtifactMetrics.PRE_ADAPTER_SUMMARY_EXT + outext);
  preAdapterDetailsOut = new File(OUTPUT + SequencingArtifactMetrics.PRE_ADAPTER_DETAILS_EXT + outext);
  baitBiasSummaryOut = new File(OUTPUT + SequencingArtifactMetrics.BAIT_BIAS_SUMMARY_EXT + outext);
  baitBiasDetailsOut = new File(OUTPUT + SequencingArtifactMetrics.BAIT_BIAS_DETAILS_EXT + outext);

  IOUtil.assertFileIsWritable(preAdapterSummaryOut);
  IOUtil.assertFileIsWritable(preAdapterDetailsOut);
  IOUtil.assertFileIsWritable(baitBiasSummaryOut);
  IOUtil.assertFileIsWritable(baitBiasDetailsOut);

  for (final SAMReadGroupRecord rec : header.getReadGroups()) {
    samples.add(getOrElse(rec.getSample(), UNKNOWN_SAMPLE));
    libraries.add(getOrElse(rec.getLibrary(), UNKNOWN_LIBRARY));
  }

  if (INTERVALS != null) {
    IOUtil.assertFileIsReadable(INTERVALS);
    intervalMask = new IntervalListReferenceSequenceMask(IntervalList.fromFile(INTERVALS).uniqued());
  }

  if (DB_SNP != null) {
    IOUtil.assertFileIsReadable(DB_SNP);
    dbSnpMask = new DbSnpBitSetUtil(DB_SNP, header.getSequenceDictionary());
  }

  // Set record-level filters.
  final List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
  filters.add(new FailsVendorReadQualityFilter());
  filters.add(new NotPrimaryAlignmentFilter());
  filters.add(new DuplicateReadFilter());
  filters.add(new AlignedFilter(true)); // discard unmapped reads
  filters.add(new MappingQualityFilter(MINIMUM_MAPPING_QUALITY));
  if (!INCLUDE_UNPAIRED) {
    final int effectiveMaxInsertSize =
        (MAXIMUM_INSERT_SIZE == 0) ? Integer.MAX_VALUE : MAXIMUM_INSERT_SIZE;
    filters.add(new InsertSizeFilter(MINIMUM_INSERT_SIZE, effectiveMaxInsertSize));
  }
  recordFilter = new AggregateFilter(filters);

  // Set up the artifact counters.
  final String sampleAlias = StringUtil.join(",", new ArrayList<String>(samples));
  for (final String library : libraries) {
    artifactCounters.put(
        library, new ArtifactCounter(sampleAlias, library, CONTEXT_SIZE, TANDEM_READS));
  }
}
/** Ensure that basic read group splitting works. */
@Test
public void testSplitByReadGroup() {
  SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1");
  SAMReadGroupRecord readGroupTwo = new SAMReadGroupRecord("rg2");

  SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
  header.addReadGroup(readGroupOne);
  header.addReadGroup(readGroupTwo);

  GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, 10);
  read1.setAttribute("RG", readGroupOne.getId());
  GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 1, 10);
  read2.setAttribute("RG", readGroupTwo.getId());
  GATKSAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header, "read3", 0, 1, 10);
  read3.setAttribute("RG", readGroupOne.getId());
  GATKSAMRecord read4 = ArtificialSAMUtils.createArtificialRead(header, "read4", 0, 1, 10);
  read4.setAttribute("RG", readGroupTwo.getId());
  GATKSAMRecord read5 = ArtificialSAMUtils.createArtificialRead(header, "read5", 0, 1, 10);
  read5.setAttribute("RG", readGroupTwo.getId());
  GATKSAMRecord read6 = ArtificialSAMUtils.createArtificialRead(header, "read6", 0, 1, 10);
  read6.setAttribute("RG", readGroupOne.getId());
  GATKSAMRecord read7 = ArtificialSAMUtils.createArtificialRead(header, "read7", 0, 1, 10);
  read7.setAttribute("RG", readGroupOne.getId());

  ReadBackedPileup pileup =
      new ReadBackedPileupImpl(
          null,
          Arrays.asList(read1, read2, read3, read4, read5, read6, read7),
          Arrays.asList(1, 1, 1, 1, 1, 1, 1));

  ReadBackedPileup rg1Pileup = pileup.getPileupForReadGroup("rg1");
  List<GATKSAMRecord> rg1Reads = rg1Pileup.getReads();
  Assert.assertEquals(rg1Reads.size(), 4, "Wrong number of reads in read group rg1");
  Assert.assertEquals(rg1Reads.get(0), read1, "Read " + read1.getReadName() + " should be in rg1 but isn't");
  Assert.assertEquals(rg1Reads.get(1), read3, "Read " + read3.getReadName() + " should be in rg1 but isn't");
  Assert.assertEquals(rg1Reads.get(2), read6, "Read " + read6.getReadName() + " should be in rg1 but isn't");
  Assert.assertEquals(rg1Reads.get(3), read7, "Read " + read7.getReadName() + " should be in rg1 but isn't");

  ReadBackedPileup rg2Pileup = pileup.getPileupForReadGroup("rg2");
  List<GATKSAMRecord> rg2Reads = rg2Pileup.getReads();
  Assert.assertEquals(rg2Reads.size(), 3, "Wrong number of reads in read group rg2");
  Assert.assertEquals(rg2Reads.get(0), read2, "Read " + read2.getReadName() + " should be in rg2 but isn't");
  Assert.assertEquals(rg2Reads.get(1), read4, "Read " + read4.getReadName() + " should be in rg2 but isn't");
  Assert.assertEquals(rg2Reads.get(2), read5, "Read " + read5.getReadName() + " should be in rg2 but isn't");
}
/** Ensure that splitting read groups still works when dealing with a sample-split pileup. */
@Test
public void testSplitBySample() {
  SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1");
  readGroupOne.setSample("sample1");
  SAMReadGroupRecord readGroupTwo = new SAMReadGroupRecord("rg2");
  readGroupTwo.setSample("sample2");

  SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
  header.addReadGroup(readGroupOne);
  header.addReadGroup(readGroupTwo);

  GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, 10);
  read1.setAttribute("RG", readGroupOne.getId());
  GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 1, 10);
  read2.setAttribute("RG", readGroupTwo.getId());
  GATKSAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header, "read3", 0, 1, 10);
  read3.setAttribute("RG", readGroupOne.getId());
  GATKSAMRecord read4 = ArtificialSAMUtils.createArtificialRead(header, "read4", 0, 1, 10);
  read4.setAttribute("RG", readGroupTwo.getId());

  ReadBackedPileupImpl sample1Pileup =
      new ReadBackedPileupImpl(null, Arrays.asList(read1, read3), Arrays.asList(1, 1));
  ReadBackedPileupImpl sample2Pileup =
      new ReadBackedPileupImpl(null, Arrays.asList(read2, read4), Arrays.asList(1, 1));

  Map<String, ReadBackedPileupImpl> sampleToPileupMap = new HashMap<String, ReadBackedPileupImpl>();
  sampleToPileupMap.put(readGroupOne.getSample(), sample1Pileup);
  sampleToPileupMap.put(readGroupTwo.getSample(), sample2Pileup);

  ReadBackedPileup compositePileup = new ReadBackedPileupImpl(null, sampleToPileupMap);

  ReadBackedPileup rg1Pileup = compositePileup.getPileupForReadGroup("rg1");
  List<GATKSAMRecord> rg1Reads = rg1Pileup.getReads();
  Assert.assertEquals(rg1Reads.size(), 2, "Wrong number of reads in read group rg1");
  Assert.assertEquals(rg1Reads.get(0), read1, "Read " + read1.getReadName() + " should be in rg1 but isn't");
  Assert.assertEquals(rg1Reads.get(1), read3, "Read " + read3.getReadName() + " should be in rg1 but isn't");

  ReadBackedPileup rg2Pileup = compositePileup.getPileupForReadGroup("rg2");
  List<GATKSAMRecord> rg2Reads = rg2Pileup.getReads();
  Assert.assertEquals(rg2Reads.size(), 2, "Wrong number of reads in read group rg2");
  Assert.assertEquals(rg2Reads.get(0), read2, "Read " + read2.getReadName() + " should be in rg2 but isn't");
  Assert.assertEquals(rg2Reads.get(1), read4, "Read " + read4.getReadName() + " should be in rg2 but isn't");
}
/**
 * Returns the reference index in the given header of the contig of the read's mate,
 * or {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX} if the read's mate is unmapped.
 *
 * @param read read whose mate's reference index to look up
 * @param header SAM header defining contig indices
 * @return the reference index in the given header of the contig of the read's mate,
 *     or {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX} if the read's mate is unmapped
 */
public static int getMateReferenceIndex(final GATKRead read, final SAMFileHeader header) {
  if (read.mateIsUnmapped()) {
    return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
  }
  return header.getSequenceIndex(read.getMateContig());
}
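// Illustrative usage sketch (not from the original sources): shows how a caller can
// branch on the sentinel returned above. Assumes the helper lives alongside this code
// (e.g. in GATK's ReadUtils); the method name mateIsOnSameContig is hypothetical.
public static boolean mateIsOnSameContig(final GATKRead read, final SAMFileHeader header) {
  final int mateIndex = getMateReferenceIndex(read, header);
  // NO_ALIGNMENT_REFERENCE_INDEX flags an unmapped mate, so test it before comparing.
  return mateIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
      && !read.isUnmapped()
      && mateIndex == header.getSequenceIndex(read.getContig());
}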
private void saveResults(
    final MetricsFile<?, Integer> metrics,
    final SAMFileHeader readsHeader,
    final String inputFileName) {
  MetricsUtils.saveMetrics(metrics, out, getAuthHolder());

  if (metrics.getAllHistograms().isEmpty()) {
    logger.warn("No valid bases found in input file.");
  } else if (chartOutput != null) {
    // Now run R to generate a chart. If we're working with a single library,
    // assign that library's name as a suffix to the plot title.
    final List<SAMReadGroupRecord> readGroups = readsHeader.getReadGroups();

    // A subtitle for the plot, usually corresponding to a library.
    String plotSubtitle = "";
    if (readGroups.size() == 1) {
      plotSubtitle = StringUtil.asEmptyIfNull(readGroups.get(0).getLibrary());
    }

    final RScriptExecutor executor = new RScriptExecutor();
    executor.addScript(new Resource(MeanQualityByCycle.R_SCRIPT, MeanQualityByCycle.class));
    executor.addArgs(out, chartOutput.getAbsolutePath(), inputFileName, plotSubtitle);
    executor.exec();
  }
}
/**
 * Construct an artificial SAM file reader with the given SAM file header.
 *
 * @param customHeader header that should be returned by calls to getFileHeader() on this reader
 * @param reads reads to use as the backing data source
 */
public ArtificialSAMFileReader(SAMFileHeader customHeader, SAMRecord... reads) {
  super(createEmptyInputStream(), true);
  this.customHeader = customHeader;
  this.genomeLocParser = new GenomeLocParser(customHeader.getSequenceDictionary());
  this.reads = Arrays.asList(reads);
}
public List<String> getSequenceNames() {
  if (sequenceNames == null) {
    SAMFileHeader header = getFileHeader();
    if (header == null) {
      return null;
    }
    sequenceNames = new ArrayList<String>();
    for (SAMSequenceRecord rec : header.getSequenceDictionary().getSequences()) {
      sequenceNames.add(rec.getSequenceName());
    }
  }
  return sequenceNames;
}
@Test
public void testAddCommentsToBam() throws Exception {
  final File outputFile =
      File.createTempFile("addCommentsToBamTest.", BamFileIoUtils.BAM_FILE_EXTENSION);
  runIt(BAM_FILE, outputFile, commentList);

  final SAMFileHeader newHeader = SamReaderFactory.makeDefault().getFileHeader(outputFile);

  // The original comments are massaged when they're added to the header. Perform the
  // same massaging here, and then compare the lists.
  final List<String> massagedComments = new LinkedList<>();
  for (final String comment : commentList) {
    massagedComments.add(SAMTextHeaderCodec.COMMENT_PREFIX + comment);
  }

  Assert.assertEquals(newHeader.getComments(), massagedComments);
}
@Test
public void realignAtContigBorderTest() {
  final int contigEnd = header.getSequence(0).getSequenceLength();
  final GATKSAMRecord read =
      ArtificialSAMUtils.createArtificialRead(header, "goodRead", 0, contigEnd - 1, 2);

  read.setCigarString("2M");
  Assert.assertEquals(IndelRealigner.realignmentProducesBadAlignment(read, contigEnd), false);

  read.setCigarString("1M1D1M");
  Assert.assertEquals(IndelRealigner.realignmentProducesBadAlignment(read, contigEnd), true);
}
/**
 * Check to ensure that the alignment makes sense based on the contents of the header.
 *
 * @param header the SAM file header
 * @param read the read to verify
 * @return true if the alignment agrees with the header, false otherwise
 */
public static boolean alignmentAgreesWithHeader(final SAMFileHeader header, final GATKRead read) {
  final int referenceIndex = getReferenceIndex(read, header);

  // Read is aligned to a nonexistent contig.
  if (!read.isUnmapped() && referenceIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
    return false;
  }

  final SAMSequenceRecord contigHeader = header.getSequence(referenceIndex);
  // Read is aligned to a point after the end of the contig.
  return read.isUnmapped() || read.getStart() <= contigHeader.getSequenceLength();
}
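// Hypothetical helper built on the check above, sketched to show a typical
// validation pass; the method name and exception message are illustrative, though
// UserException itself appears elsewhere in this code.
public static void assertAlignmentsAgreeWithHeader(
    final Iterable<GATKRead> reads, final SAMFileHeader header) {
  for (final GATKRead read : reads) {
    if (!alignmentAgreesWithHeader(header, read)) {
      throw new UserException(
          "Read " + read.getName() + " disagrees with the header (bad contig or position)");
    }
  }
}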
@Test
public void testGetPileupForSample() {
  String sample1 = "sample1";
  String sample2 = "sample2";

  SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1");
  readGroupOne.setSample(sample1);
  SAMReadGroupRecord readGroupTwo = new SAMReadGroupRecord("rg2");
  readGroupTwo.setSample(sample2);

  SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
  header.addReadGroup(readGroupOne);
  header.addReadGroup(readGroupTwo);

  GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, 10);
  read1.setAttribute("RG", readGroupOne.getId());
  GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 1, 10);
  read2.setAttribute("RG", readGroupTwo.getId());

  Map<String, ReadBackedPileupImpl> sampleToPileupMap = new HashMap<String, ReadBackedPileupImpl>();
  sampleToPileupMap.put(sample1, new ReadBackedPileupImpl(null, Collections.singletonList(read1), 0));
  sampleToPileupMap.put(sample2, new ReadBackedPileupImpl(null, Collections.singletonList(read2), 0));

  ReadBackedPileup pileup = new ReadBackedPileupImpl(null, sampleToPileupMap);

  ReadBackedPileup sample2Pileup = pileup.getPileupForSample(sample2);
  Assert.assertEquals(sample2Pileup.getNumberOfElements(), 1, "Sample 2 pileup has wrong number of elements");
  Assert.assertEquals(sample2Pileup.getReads().get(0), read2, "Sample 2 pileup has incorrect read");

  ReadBackedPileup missingSamplePileup = pileup.getPileupForSample("missing");
  Assert.assertNull(missingSamplePileup, "Pileup for sample 'missing' should be null but isn't");

  missingSamplePileup = pileup.getPileupForSample("not here");
  Assert.assertNull(missingSamplePileup, "Pileup for sample 'not here' should be null but isn't");
}
@Override
protected void setupPipeline(Pipeline pipeline) {
  // Load the reads.
  final ReadsDataflowSource readsDataflowSource = new ReadsDataflowSource(bam, pipeline);
  final SAMFileHeader readsHeader = readsDataflowSource.getHeader();
  final List<SimpleInterval> intervals =
      intervalArgumentCollection.intervalsSpecified()
          ? intervalArgumentCollection.getIntervals(readsHeader.getSequenceDictionary())
          : IntervalUtils.getAllIntervalsForReference(readsHeader.getSequenceDictionary());

  final PCollectionView<SAMFileHeader> headerSingleton =
      ReadsDataflowSource.getHeaderView(pipeline, readsHeader);
  final PCollection<GATKRead> initialReads = readsDataflowSource.getReadPCollection(intervals);

  // Apply MarkDuplicates to produce updated GATKReads.
  final PCollection<GATKRead> markedReads = initialReads.apply(new MarkDuplicates(headerSingleton));

  // Load the Variants and the Reference and join them to reads.
  final VariantsDataflowSource variantsDataflowSource =
      new VariantsDataflowSource(baseRecalibrationKnownVariants, pipeline);

  Map<String, String> referenceNameToIdTable =
      RefAPISource.buildReferenceNameToIdTable(pipeline.getOptions(), referenceName);
  RefAPIMetadata refAPIMetadata = new RefAPIMetadata(referenceName, referenceNameToIdTable);

  final PCollection<KV<GATKRead, ReadContextData>> readsWithContext =
      AddContextDataToRead.add(markedReads, refAPIMetadata, variantsDataflowSource);

  // Apply BQSR.
  final PCollection<RecalibrationTables> recalibrationReports =
      readsWithContext.apply(new BaseRecalibratorStub(headerSingleton));
  final PCollectionView<RecalibrationTables> mergedRecalibrationReport =
      recalibrationReports.apply(View.<RecalibrationTables>asSingleton());
  final PCollection<GATKRead> finalReads =
      markedReads.apply(new ApplyBQSRStub(headerSingleton, mergedRecalibrationReport));

  SmallBamWriter.writeToFile(pipeline, finalReads, readsHeader, output);
}
@Override
protected Object doWork() {
  IOUtil.assertFileIsReadable(INPUT);
  IOUtil.assertFileIsWritable(OUTPUT);

  if (INPUT.getAbsolutePath().endsWith(".sam")) {
    throw new UserException("SAM files are not supported");
  }

  final SAMFileHeader samFileHeader =
      SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).getFileHeader(INPUT);
  for (final String comment : COMMENT) {
    if (comment.contains("\n")) {
      throw new UserException("Comments cannot contain a newline");
    }
    samFileHeader.addComment(comment);
  }

  BamFileIoUtils.reheaderBamFile(samFileHeader, INPUT, OUTPUT, CREATE_MD5_FILE, CREATE_INDEX);
  return null;
}
/**
 * Create a common SAMFileWriter from a factory for use with GATK tools. Assumes that if the
 * factory has been set to create an index, the header has been set to coordinate sorted.
 *
 * @param factory SAMFileWriterFactory to use
 * @param outputFile if this file has a .cram extension then a reference is required; cannot be null
 * @param referenceFile the reference source to use; cannot be null if the output file has a .cram extension
 * @param header header to be used for the output writer
 * @param preSorted if true then records must already be sorted to match the header sort order
 * @return SAMFileWriter
 */
public static SAMFileWriter createCommonSAMWriterFromFactory(
    final SAMFileWriterFactory factory,
    final File outputFile,
    final File referenceFile,
    final SAMFileHeader header,
    final boolean preSorted) {
  Utils.nonNull(outputFile);
  Utils.nonNull(header);

  if (null == referenceFile && outputFile.getName().endsWith(CramIO.CRAM_FILE_EXTENSION)) {
    throw new UserException("A reference file is required for writing CRAM files");
  }

  return factory.makeWriter(header.clone(), preSorted, outputFile, referenceFile);
}
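// A minimal calling sketch for the factory variant above; the file names are
// placeholders and the surrounding method is hypothetical. The try-with-resources
// works because htsjdk's SAMFileWriter implements Closeable; the factory sets
// createIndex, which this method assumes implies a coordinate-sorted header.
public static void writeCramSketch(final List<SAMRecord> records, final SAMFileHeader header) {
  final SAMFileWriterFactory factory =
      new SAMFileWriterFactory().setCreateIndex(true).setCreateMd5File(false);
  try (final SAMFileWriter writer =
      createCommonSAMWriterFromFactory(
          factory, new File("out.cram"), new File("ref.fasta"), header, true)) {
    records.forEach(writer::addAlignment);
  }
}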
public void onTraversalDone(EventPair sum) {
  if (sum.left != null && sum.left.isReportableEvent()) sum.intervals.add(sum.left.getLoc());
  if (sum.right != null && sum.right.isReportableEvent()) sum.intervals.add(sum.right.getLoc());

  if (FilenameUtils.getExtension(out.getName()).equals("interval_list")) {
    final SAMFileHeader masterSequenceDictionaryHeader = new SAMFileHeader();
    masterSequenceDictionaryHeader.setSequenceDictionary(getToolkit().getMasterSequenceDictionary());
    final IntervalList intervalList = new IntervalList(masterSequenceDictionaryHeader);
    for (GenomeLoc loc : sum.intervals) {
      intervalList.add(new Interval(loc.getContig(), loc.getStart(), loc.getStop()));
    }
    intervalList.write(out);
  } else {
    try (BufferedWriter bufferedWriter = IOUtil.openFileForBufferedWriting(out)) {
      for (GenomeLoc loc : sum.intervals) {
        bufferedWriter.write(loc.toString());
        bufferedWriter.newLine();
      }
    } catch (final IOException e) {
      throw new GATKException("Error writing out intervals to file: " + out.getAbsolutePath(), e);
    }
  }
}
@Override
public boolean equals(Object o) {
  if (this == o) {
    return true;
  }
  if (!(o instanceof Pulldown)) {
    return false;
  }
  if (!super.equals(o)) {
    return false;
  }
  final Pulldown pulldown = (Pulldown) o;
  return header.equals(pulldown.header);
}
/////////////////////////////////////////////////////////////////////////////
// Used to generate the SAM record sets with SAMRecordSetBuilder.addPair().
// testNumber 1: runGcBiasMultiLevelTest, generates records aligning to chrM and chrO.
// testNumber 2: runWindowsComparisonTest, generates records aligning to chrM, chrN, and chrO.
/////////////////////////////////////////////////////////////////////////////
public void setupTest1(
    final int ID,
    final String readGroupId,
    final SAMReadGroupRecord readGroupRecord,
    final String sample,
    final String library,
    final SAMFileHeader header,
    final SAMRecordSetBuilder setBuilder)
    throws IOException {
  final String separator = ":";
  final int contig1 = 0;
  final int contig2 = 1;

  readGroupRecord.setSample(sample);
  readGroupRecord.setPlatform(platform);
  readGroupRecord.setLibrary(library);
  readGroupRecord.setPlatformUnit(readGroupId);
  header.addReadGroup(readGroupRecord);

  setBuilder.setReadGroup(readGroupRecord);
  setBuilder.setUseNmFlag(true);
  setBuilder.setHeader(header);

  final int max = 800;
  final int min = 1;
  final Random rg = new Random(5);

  // Add records that align to chrM and chrO but not chrN.
  for (int i = 0; i < NUM_READS; i++) {
    final int start = rg.nextInt(max) + min;
    final String newReadName = READ_NAME + separator + ID + separator + i;
    if (i != NUM_READS - 1) {
      setBuilder.addPair(newReadName, contig1, start + ID, start + ID + LENGTH);
    } else {
      setBuilder.addPair(newReadName, contig2, start + ID, start + ID + LENGTH);
    }
  }
}
/**
 * Create a common SAMFileWriter for use with GATK tools.
 *
 * @param outputFile if this file has a .cram extension then a reference is required; cannot be null
 * @param referenceFile the reference source to use; cannot be null if the output file has a .cram extension
 * @param header header to be used for the output writer
 * @param preSorted if true then the records must already be sorted to match the header sort order
 * @param createOutputBamIndex if true an index will be created for .BAM and .CRAM files
 * @param createMD5 if true an MD5 file will be created
 * @return SAMFileWriter
 */
public static SAMFileWriter createCommonSAMWriter(
    final File outputFile,
    final File referenceFile,
    final SAMFileHeader header,
    final boolean preSorted,
    boolean createOutputBamIndex,
    final boolean createMD5) {
  Utils.nonNull(outputFile);
  Utils.nonNull(header);

  if (createOutputBamIndex && header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
    logger.warn(
        "Skipping index file creation for: " + outputFile.getAbsolutePath()
            + ". Index file creation requires reads in coordinate sorted order.");
    createOutputBamIndex = false;
  }

  final SAMFileWriterFactory factory =
      new SAMFileWriterFactory().setCreateIndex(createOutputBamIndex).setCreateMd5File(createMD5);
  return ReadUtils.createCommonSAMWriterFromFactory(
      factory, outputFile, referenceFile, header, preSorted);
}
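// Companion sketch for the higher-level entry point above, with placeholder paths;
// the wrapper method is hypothetical. Passing null for the reference is fine for
// .bam output, and index creation is silently downgraded (with a warning) when the
// header is not coordinate sorted, as the method's own logging describes.
public static SAMFileWriter openBamWriterSketch(final SAMFileHeader header) {
  return createCommonSAMWriter(
      new File("out.bam"),
      null /* referenceFile: only required for .cram output */,
      header,
      true /* preSorted */,
      true /* createOutputBamIndex */,
      false /* createMD5 */);
}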
/**
 * Returns a {@link SAMReadGroupRecord} object corresponding to the provided read's read group.
 *
 * @param read read whose read group to retrieve
 * @param header SAM header containing read groups
 * @return a {@link SAMReadGroupRecord} object corresponding to the provided read's read group,
 *     or null if the read has no read group
 */
public static SAMReadGroupRecord getSAMReadGroupRecord(
    final GATKRead read, final SAMFileHeader header) {
  final String readGroupName = read.getReadGroup();
  return readGroupName != null ? header.getReadGroup(readGroupName) : null;
}
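// Hedged usage sketch: null-safe sample lookup through the helper above. The method
// name and the "unknown" fallback are illustrative, echoing the UNKNOWN_SAMPLE
// default used in the setup() method earlier in this section.
public static String getSampleOrUnknown(final GATKRead read, final SAMFileHeader header) {
  final SAMReadGroupRecord rg = getSAMReadGroupRecord(read, header);
  // Both the read group itself and its SM field may be absent, so guard twice.
  return (rg == null || rg.getSample() == null) ? "unknown" : rg.getSample();
}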
public static List<String> getReadGroupIDs(final SAMFileHeader header) {
  return header.getReadGroups().stream().map(rg -> getID(rg)).collect(Collectors.toList());
}
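// A sibling sketch with the same stream shape as getReadGroupIDs, collecting sample
// names instead; getSample() is a real SAMReadGroupRecord accessor, while the method
// name here is illustrative.
public static List<String> getReadGroupSamples(final SAMFileHeader header) {
  return header.getReadGroups().stream()
      .map(SAMReadGroupRecord::getSample)
      .collect(Collectors.toList());
}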
/** Combines multiple SAM/BAM files into one. */
@Override
protected int doWork() {
  boolean matchedSortOrders = true;

  // Read the interval list if it is defined.
  final List<Interval> intervalList =
      (INTERVALS == null ? null : IntervalList.fromFile(INTERVALS).uniqued().getIntervals());

  // Map of reader->iterator, used only if INTERVALS is defined.
  final Map<SamReader, CloseableIterator<SAMRecord>> samReaderToIterator =
      new HashMap<SamReader, CloseableIterator<SAMRecord>>(INPUT.size());

  // Open the files for reading and writing.
  final List<SamReader> readers = new ArrayList<SamReader>();
  final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();
  {
    SAMSequenceDictionary dict = null; // Used to try and reduce redundant SDs in memory

    for (final File inFile : INPUT) {
      IOUtil.assertFileIsReadable(inFile);
      final SamReader in =
          SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(inFile);
      if (INTERVALS != null) {
        if (!in.hasIndex())
          throw new PicardException(
              "Merging with intervals requires an index, but BAM file is not indexed: " + inFile);
        final CloseableIterator<SAMRecord> samIterator =
            new SamRecordIntervalIteratorFactory()
                .makeSamRecordIntervalIterator(in, intervalList, true);
        samReaderToIterator.put(in, samIterator);
      }

      readers.add(in);
      headers.add(in.getFileHeader());

      // A slightly hackish attempt to keep memory consumption down when merging multiple
      // files with large sequence dictionaries (10,000s of sequences). If the dictionaries
      // are identical, then replace the duplicate copies with a single dictionary to
      // reduce the memory footprint.
      if (dict == null) {
        dict = in.getFileHeader().getSequenceDictionary();
      } else if (dict.equals(in.getFileHeader().getSequenceDictionary())) {
        in.getFileHeader().setSequenceDictionary(dict);
      }

      matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER;
    }
  }

  // If all the input sort orders match the output sort order then just merge them and
  // write on the fly; otherwise set up to merge and sort before writing out the final file.
  IOUtil.assertFileIsWritable(OUTPUT);
  final boolean presorted;
  final SAMFileHeader.SortOrder headerMergerSortOrder;
  final boolean mergingSamRecordIteratorAssumeSorted;

  if (matchedSortOrders
      || SORT_ORDER == SAMFileHeader.SortOrder.unsorted
      || ASSUME_SORTED
      || INTERVALS != null) {
    log.info("Input files are in same order as output so sorting to temp directory is not needed.");
    headerMergerSortOrder = SORT_ORDER;
    mergingSamRecordIteratorAssumeSorted = ASSUME_SORTED;
    presorted = true;
  } else {
    log.info("Sorting input files using temp directory " + TMP_DIR);
    headerMergerSortOrder = SAMFileHeader.SortOrder.unsorted;
    mergingSamRecordIteratorAssumeSorted = false;
    presorted = false;
  }

  final SamFileHeaderMerger headerMerger =
      new SamFileHeaderMerger(headerMergerSortOrder, headers, MERGE_SEQUENCE_DICTIONARIES);
  final MergingSamRecordIterator iterator;
  if (intervalList == null) {
    // No interval defined; get an iterator for the whole bam.
    iterator =
        new MergingSamRecordIterator(headerMerger, readers, mergingSamRecordIteratorAssumeSorted);
  } else {
    // Show warning related to https://github.com/broadinstitute/picard/pull/314/files
    log.info(
        "Warning: merged bams from different interval lists may contain the same read in both files");
    iterator = new MergingSamRecordIterator(headerMerger, samReaderToIterator, true);
  }

  final SAMFileHeader header = headerMerger.getMergedHeader();
  for (final String comment : COMMENT) {
    header.addComment(comment);
  }
  header.setSortOrder(SORT_ORDER);

  final SAMFileWriterFactory samFileWriterFactory = new SAMFileWriterFactory();
  if (USE_THREADING) {
    samFileWriterFactory.setUseAsyncIo(true);
  }
  final SAMFileWriter out = samFileWriterFactory.makeSAMOrBAMWriter(header, presorted, OUTPUT);

  // Lastly, loop through and write out the records.
  final ProgressLogger progress = new ProgressLogger(log, PROGRESS_INTERVAL);
  while (iterator.hasNext()) {
    final SAMRecord record = iterator.next();
    out.addAlignment(record);
    progress.record(record);
  }

  log.info("Finished reading inputs.");
  for (final CloseableIterator<SAMRecord> iter : samReaderToIterator.values())
    CloserUtil.close(iter);
  CloserUtil.close(readers);
  out.close();
  return 0;
}
@BeforeClass
public void beforeClass() {
  header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
  genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
  loc = genomeLocParser.createGenomeLoc("chr1", 1);
}
@Override
public int hashCode() {
  int result = super.hashCode();
  result = 31 * result + header.hashCode();
  return result;
}
@BeforeClass
public void init() {
  header = ArtificialReadUtils.createArtificialSamHeader(1, 1, 1000000);
  genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
}
private SAMFileHeader makeHeader() {
  final SAMFileHeader header = new SAMFileHeader();
  // A fresh SAMFileHeader carries an empty, mutable sequence dictionary,
  // so adding a sequence here mutates the header in place.
  final SAMSequenceDictionary dict = header.getSequenceDictionary();
  dict.addSequence(new SAMSequenceRecord("chr20", 62435964));
  return header;
}
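// An equivalent sketch of the builder above that constructs the dictionary first and
// installs it explicitly; functionally the same, but it avoids relying on the fresh
// header's dictionary being mutable. The method name is illustrative.
private SAMFileHeader makeHeaderWithExplicitDictionary() {
  final SAMSequenceDictionary dict =
      new SAMSequenceDictionary(
          Collections.singletonList(new SAMSequenceRecord("chr20", 62435964)));
  final SAMFileHeader header = new SAMFileHeader();
  header.setSequenceDictionary(dict);
  return header;
}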
/**
 * Test that PG header records are created & chained appropriately (or not created), and that the
 * PG record chains are as expected. MarkDuplicates is used both to merge and to mark dupes in
 * this case.
 *
 * @param suppressPg if true, do not create a PG header record
 * @param expectedPnVnByReadName for each read, info about the expected chain of PG records
 */
@Test(dataProvider = "pgRecordChainingTest")
public void pgRecordChainingTest(
    final boolean suppressPg, final Map<String, List<ExpectedPnAndVn>> expectedPnVnByReadName) {
  final File outputDir = IOUtil.createTempDir(TEST_BASE_NAME + ".", ".tmp");
  outputDir.deleteOnExit();
  try {
    // Run MarkDuplicates, merging the 3 input files, and either enabling or suppressing
    // PG header record creation according to suppressPg.
    final MarkDuplicates markDuplicates = new MarkDuplicates();
    final ArrayList<String> args = new ArrayList<String>();
    for (int i = 1; i <= 3; ++i) {
      args.add("INPUT=" + new File(TEST_DATA_DIR, "merge" + i + ".sam").getAbsolutePath());
    }
    final File outputSam = new File(outputDir, TEST_BASE_NAME + ".sam");
    args.add("OUTPUT=" + outputSam.getAbsolutePath());
    args.add(
        "METRICS_FILE=" + new File(outputDir, TEST_BASE_NAME + ".duplicate_metrics").getAbsolutePath());
    if (suppressPg) args.add("PROGRAM_RECORD_ID=null");

    // I generally prefer to call doWork rather than invoking the argument parser, but it is
    // necessary in this case to initialize the command line.
    // Note that for the unit test, the version won't come through because it is obtained
    // through the jar manifest, and the unit test doesn't run code from a jar.
    Assert.assertEquals(markDuplicates.instanceMain(args.toArray(new String[args.size()])), 0);

    // Read the MarkDuplicates output file, and get the PG ID for each read. In this
    // particular test, the PG ID should be the same for both ends of a pair.
    final SamReader reader = SamReaderFactory.makeDefault().open(outputSam);
    final Map<String, String> pgIdForReadName = new HashMap<String, String>();
    for (final SAMRecord rec : reader) {
      final String existingPgId = pgIdForReadName.get(rec.getReadName());
      final String thisPgId = rec.getStringAttribute(SAMTag.PG.name());
      if (existingPgId != null) {
        Assert.assertEquals(thisPgId, existingPgId);
      } else {
        pgIdForReadName.put(rec.getReadName(), thisPgId);
      }
    }
    final SAMFileHeader header = reader.getFileHeader();
    CloserUtil.close(reader);

    // Confirm that for each read name, the chain of PG records contains exactly the number
    // that is expected, and that the values in the PG chain are as expected.
    for (final Map.Entry<String, List<ExpectedPnAndVn>> entry : expectedPnVnByReadName.entrySet()) {
      final String readName = entry.getKey();
      final List<ExpectedPnAndVn> expectedList = entry.getValue();
      String pgId = pgIdForReadName.get(readName);
      for (final ExpectedPnAndVn expected : expectedList) {
        final SAMProgramRecord programRecord = header.getProgramRecord(pgId);
        if (expected.expectedPn != null)
          Assert.assertEquals(programRecord.getProgramName(), expected.expectedPn);
        if (expected.expectedVn != null)
          Assert.assertEquals(programRecord.getProgramVersion(), expected.expectedVn);
        pgId = programRecord.getPreviousProgramGroupId();
      }
      Assert.assertNull(pgId);
    }
  } finally {
    TestUtil.recursiveDelete(outputDir);
  }
}
@Override
public int doWork(String[] args) {
  boolean compressed = false;
  int maxRecordsInRAM = 100000;
  long count = -1L;
  File fileout = null;

  com.github.lindenb.jvarkit.util.cli.GetOpt opt =
      new com.github.lindenb.jvarkit.util.cli.GetOpt();
  int c;
  while ((c = opt.getopt(args, getGetOptDefault() + "o:n:N:T:b")) != -1) {
    switch (c) {
      case 'b':
        compressed = true;
        break;
      case 'N':
        maxRecordsInRAM = Integer.parseInt(opt.getOptArg());
        break;
      case 'n':
        count = Long.parseLong(opt.getOptArg());
        break;
      case 'o':
        fileout = new File(opt.getOptArg());
        break;
      case 'T':
        this.addTmpDirectory(new File(opt.getOptArg()));
        break;
      default:
        {
          switch (handleOtherOptions(c, opt, null)) {
            case EXIT_FAILURE:
              return -1;
            case EXIT_SUCCESS:
              return 0;
            default:
              break;
          }
        }
    }
  }

  if (count < -1L) { // -1 == infinite
    error("Bad count:" + count);
    return -1;
  }

  SamReader samReader = null;
  SAMRecordIterator iter = null;
  SAMFileWriter samWriter = null;
  Random random = new Random();
  CloseableIterator<RandSamRecord> iter2 = null;
  try {
    SamFileReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT);
    if (opt.getOptInd() == args.length) {
      info("Reading from stdin");
      samReader = SamFileReaderFactory.mewInstance().openStdin();
    } else if (opt.getOptInd() + 1 == args.length) {
      File filename = new File(args[opt.getOptInd()]);
      info("Reading from " + filename);
      samReader = SamFileReaderFactory.mewInstance().open(filename);
    } else {
      error("Illegal number of arguments.");
      return -1;
    }

    SAMFileHeader header = samReader.getFileHeader();
    header = header.clone();
    header.setSortOrder(SortOrder.unsorted);
    header.addComment("Processed with " + getProgramName() + " : " + getProgramCommandLine());

    SAMFileWriterFactory sfw = new SAMFileWriterFactory();
    sfw.setCreateIndex(false);
    sfw.setCreateMd5File(false);

    if (fileout == null) {
      if (compressed) {
        samWriter = sfw.makeBAMWriter(header, true, System.out);
      } else {
        samWriter = sfw.makeSAMWriter(header, true, System.out);
      }
    } else {
      samWriter = sfw.makeSAMOrBAMWriter(header, true, fileout);
      this.addTmpDirectory(fileout);
    }

    iter = samReader.iterator();
    SAMSequenceDictionaryProgress progress =
        new SAMSequenceDictionaryProgress(samReader.getFileHeader().getSequenceDictionary());

    SortingCollection<RandSamRecord> sorter =
        SortingCollection.newInstance(
            RandSamRecord.class,
            new RandSamRecordCodec(header),
            new RandSamRecordComparator(),
            maxRecordsInRAM,
            getTmpDirectories());
    sorter.setDestructiveIteration(true);

    while (iter.hasNext()) {
      RandSamRecord r = new RandSamRecord();
      r.rand_index = random.nextInt();
      r.samRecord = progress.watch(iter.next());
      sorter.add(r);
    }
    iter.close();
    iter = null;

    sorter.doneAdding();
    iter2 = sorter.iterator();
    if (count == -1) {
      while (iter2.hasNext()) {
        samWriter.addAlignment(iter2.next().samRecord);
      }
    } else {
      while (iter2.hasNext() && count > 0) {
        samWriter.addAlignment(iter2.next().samRecord);
        count--;
      }
    }
    iter2.close();
    iter2 = null;
    sorter.cleanup();
    progress.finish();
  } catch (Exception e) {
    error(e);
    return -1;
  } finally {
    CloserUtil.close(iter);
    CloserUtil.close(iter2);
    CloserUtil.close(samReader);
    CloserUtil.close(samWriter);
  }
  return 0;
}
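// The shuffle above is the classic decorate-sort-undecorate trick: tag each record
// with a random integer, sort on the tag with a disk-backed SortingCollection, then
// emit records in tag order. A minimal in-memory sketch of the same idea (assumes
// java.util.AbstractMap, java.util.Map, java.util.stream.Collectors; the method
// name is illustrative):
static <T> List<T> shuffleBySort(final List<T> items, final Random random) {
  return items.stream()
      .map(item -> new AbstractMap.SimpleEntry<>(random.nextInt(), item)) // decorate
      .sorted(Map.Entry.comparingByKey()) // sort on the random tag
      .map(Map.Entry::getValue) // undecorate
      .collect(Collectors.toList());
}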
/**
 * HACK: This is used to make a copy of a header. Really, SAMFileHeader should provide a copy
 * constructor or a factory method.
 */
public static SAMFileHeader cloneSAMFileHeader(final SAMFileHeader header) {
  if (header == null) return null;
  return header.clone();
}
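// Usage sketch for the copy helper above: clone defensively before mutating so the
// source header stays untouched. The wrapper name is hypothetical; the null check
// mirrors the helper's own null tolerance.
public static SAMFileHeader querynameSortedCopy(final SAMFileHeader original) {
  final SAMFileHeader copy = cloneSAMFileHeader(original);
  if (copy != null) {
    copy.setSortOrder(SAMFileHeader.SortOrder.queryname);
  }
  return copy;
}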