/**
 * HACK TO CREATE A GATKSAMRECORD BASED ONLY ON A SAMRECORD, FOR TESTING PURPOSES ONLY
 *
 * @param read the SAMRecord to copy
 */
public GATKSAMRecord(final SAMRecord read) {
  super(read.getHeader());
  super.setReferenceIndex(read.getReferenceIndex());
  super.setAlignmentStart(read.getAlignmentStart());
  super.setReadName(read.getReadName());
  super.setMappingQuality(read.getMappingQuality());
  // indexing bin done below
  super.setCigar(read.getCigar());
  super.setFlags(read.getFlags());
  super.setMateReferenceIndex(read.getMateReferenceIndex());
  super.setMateAlignmentStart(read.getMateAlignmentStart());
  super.setInferredInsertSize(read.getInferredInsertSize());
  SAMReadGroupRecord samRG = read.getReadGroup();
  SAMBinaryTagAndValue samAttr = GATKBin.getReadBinaryAttributes(read);
  if (samAttr == null) {
    clearAttributes();
  } else {
    setAttributes(samAttr);
  }
  if (samRG != null) {
    GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG);
    setReadGroup(rg);
  }
  super.setFileSource(read.getFileSource());
  super.setReadName(read.getReadName());
  super.setCigarString(read.getCigarString());
  super.setReadBases(read.getReadBases());
  super.setBaseQualities(read.getBaseQualities());
  // From SAMRecord constructor: do this after the above because setCigarString will clear it.
  GATKBin.setReadIndexingBin(this, GATKBin.getReadIndexingBin(read));
}
@Test(dataProvider = "loadReadsADAM", groups = "spark") public void readsSinkADAMTest(String inputBam, String outputDirectoryName) throws IOException { // Since the test requires that we not create the actual output directory in advance, // we instead create its parent directory and mark it for deletion on exit. This protects // us from naming collisions across multiple instances of the test suite. final File outputParentDirectory = createTempDir(outputDirectoryName + "_parent"); final File outputDirectory = new File(outputParentDirectory, outputDirectoryName); JavaSparkContext ctx = SparkContextFactory.getTestSparkContext(); ReadsSparkSource readSource = new ReadsSparkSource(ctx); JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBam, null); SAMFileHeader header = ReadsSparkSource.getHeader(ctx, inputBam, null); ReadsSparkSink.writeReads( ctx, outputDirectory.getAbsolutePath(), rddParallelReads, header, ReadsWriteFormat.ADAM); JavaRDD<GATKRead> rddParallelReads2 = readSource.getADAMReads(outputDirectory.getAbsolutePath(), null, header); Assert.assertEquals(rddParallelReads.count(), rddParallelReads2.count()); // Test the round trip List<GATKRead> samList = rddParallelReads.collect(); List<GATKRead> adamList = rddParallelReads2.collect(); Comparator<GATKRead> comparator = new ReadCoordinateComparator(header); samList.sort(comparator); adamList.sort(comparator); for (int i = 0; i < samList.size(); i++) { SAMRecord expected = samList.get(i).convertToSAMRecord(header); SAMRecord observed = adamList.get(i).convertToSAMRecord(header); // manually test equality of some fields, as there are issues with roundtrip BAM -> ADAM -> // BAM // see https://github.com/bigdatagenomics/adam/issues/823 Assert.assertEquals(observed.getReadName(), expected.getReadName(), "readname"); Assert.assertEquals( observed.getAlignmentStart(), expected.getAlignmentStart(), "getAlignmentStart"); Assert.assertEquals( observed.getAlignmentEnd(), expected.getAlignmentEnd(), "getAlignmentEnd"); Assert.assertEquals(observed.getFlags(), expected.getFlags(), "getFlags"); Assert.assertEquals( observed.getMappingQuality(), expected.getMappingQuality(), "getMappingQuality"); Assert.assertEquals( observed.getMateAlignmentStart(), expected.getMateAlignmentStart(), "getMateAlignmentStart"); Assert.assertEquals(observed.getCigar(), expected.getCigar(), "getCigar"); } }
/**
 * Returns a new qual array for read that includes the BAQ adjustment. Does not support
 * on-the-fly BAQ calculation.
 *
 * @param read the SAMRecord to operate on
 * @param overwriteOriginalQuals if true, we replace the original quality scores in the read with
 *     their BAQ'd version
 * @param useRawQualsIfNoBAQTag if true and there's no BAQ annotation, we just use the raw quality
 *     scores; throws IllegalStateException if false and no BAQ tag is present
 * @return the BAQ-adjusted quality array (the raw qualities if no adjustment was applied)
 */
public static byte[] calcBAQFromTag(
    SAMRecord read, boolean overwriteOriginalQuals, boolean useRawQualsIfNoBAQTag) {
  byte[] rawQuals = read.getBaseQualities();
  byte[] newQuals = rawQuals;
  byte[] baq = getBAQTag(read);

  if (baq != null) {
    // Offset to base alignment quality (BAQ), of the same length as the read sequence.
    // At the i-th read base, BAQi = Qi - (BQi - 64) where Qi is the i-th base quality.
    newQuals = overwriteOriginalQuals ? rawQuals : new byte[rawQuals.length];
    for (int i = 0; i < rawQuals.length; i++) {
      int rawQual = (int) rawQuals[i];
      int baq_delta = (int) baq[i] - 64;
      int newval = rawQual - baq_delta;
      if (newval < 0)
        throw new UserException.MalformedBAM(
            read, "BAQ tag error: the BAQ value is larger than the base quality");
      newQuals[i] = (byte) newval;
    }
  } else if (!useRawQualsIfNoBAQTag) {
    throw new IllegalStateException(
        "Required BAQ tag to be present, but none was on read " + read.getReadName());
  }

  return newQuals;
}
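A tiny, self-contained sketch of the per-base arithmetic used in the loop above; the method name and the example values in the comments are illustrative, not part of the original class.

/** Illustrative only: applies one BAQ tag byte (delta encoded as value minus 64) to a raw quality. */
public static byte applyBaqDelta(final byte rawQual, final byte baqTagByte) {
  final int delta = baqTagByte - 64;    // e.g. a tag byte of 68 encodes a delta of 4
  final int adjusted = rawQual - delta; // e.g. Q30 - 4 = Q26
  if (adjusted < 0) { // mirrors the MalformedBAM check: the delta may not exceed the raw quality
    throw new IllegalStateException("BAQ delta " + delta + " exceeds raw quality " + rawQual);
  }
  return (byte) adjusted;
}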
/** Note: this is the only getKey function that handles unmapped reads specially! */
public static long getKey(final SAMRecord rec) {
  final int refIdx = rec.getReferenceIndex();
  final int start = rec.getAlignmentStart();

  if (!(rec.getReadUnmappedFlag() || refIdx < 0 || start < 0)) return getKey(refIdx, start);

  // Put unmapped reads at the end, but don't give them all the exact same
  // key so that they can be distributed to different reducers.
  //
  // A random number would probably be best, but to ensure that the same
  // record always gets the same key we use a fast hash instead.
  //
  // We avoid using hashCode(), because it's not guaranteed to have the
  // same value across different processes.
  int hash = 0;
  byte[] var;
  if ((var = rec.getVariableBinaryRepresentation()) != null) {
    // Undecoded BAM record: just hash its raw data.
    hash = (int) MurmurHash3.murmurhash3(var, hash);
  } else {
    // Decoded BAM record or any SAM record: hash a few representative
    // fields together.
    hash = (int) MurmurHash3.murmurhash3(rec.getReadName(), hash);
    hash = (int) MurmurHash3.murmurhash3(rec.getReadBases(), hash);
    hash = (int) MurmurHash3.murmurhash3(rec.getBaseQualities(), hash);
    hash = (int) MurmurHash3.murmurhash3(rec.getCigarString(), hash);
  }

  return getKey0(Integer.MAX_VALUE, hash);
}
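For orientation, a 64-bit coordinate key of this kind is commonly built by packing the reference index into the high 32 bits and the alignment position into the low 32 bits, which is why unmapped reads are sent to Integer.MAX_VALUE with a hash as the low word. The sketch below assumes getKey/getKey0 follow that layout; the method name is hypothetical and the exact bit layout used here is not confirmed by the source.

/** Illustrative only: packs (reference index, position) into one sortable 64-bit key, assuming a
 *  high-word/low-word layout; the real getKey/getKey0 in this codebase may differ. */
public static long packCoordinateKey(final int refIdx, final int alignmentStart) {
  // High word orders by contig, low word orders by position within the contig.
  return (((long) refIdx) << 32) | (alignmentStart & 0xFFFFFFFFL);
}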
private String getIdentifier(SAMRecord read) {
  String id = read.getReadName();
  if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
    id += "_2";
  }
  return id;
}
@Override
public SAMRecordPair getNextReadPair() {
  // Insert the first read into the buffer keyed by query name, then its mate when it arrives.
  // Once an entry holds both mates, return the completed pair; otherwise keep reading.
  // Returning pairs as they complete keeps this fast, and removing each entry after it is
  // returned ensures the buffer doesn't leak memory.
  if (iterator.hasNext()) {
    while (iterator.hasNext()) {
      SAMRecord record = iterator.next();
      countRead(record);
      // skip if the read is unmapped, not properly paired, or its mate is unmapped
      if (record.getReadUnmappedFlag()
          || !record.getProperPairFlag()
          || record.getMateUnmappedFlag()) {
        continue;
      }
      String query = record.getReadName();
      // check if the read's mate has been seen already
      if (readBuffer.containsKey(query)) {
        // if it has, complete and return the pair
        SAMRecordPair pair = readBuffer.get(query);
        pair.addPair(record);
        if (pair.bothPairsAligned() && pair.isValidPair()) {
          // prevent a memory leak by deleting keys that are no longer needed
          readBuffer.remove(query);
          return pair;
        } else {
          throw new RuntimeException(query + " is not properly mated");
        }
      } else {
        // otherwise create an entry and store it by its query name
        SAMRecordPair pair = new SAMRecordPair();
        pair.addPair(record);
        readBuffer.put(query, pair);
      }
    }
  } else {
    if (readBuffer.size() > 0) {
      for (String key : readBuffer.keySet()) {
        logger.info("No mate found for " + key);
      }
      throw new RuntimeException(
          "No mates found for some reads; please make sure all reads are properly paired");
    }
  }
  return null;
}
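The buffering idiom the comments describe, reduced to a self-contained sketch: a plain HashMap of htsjdk SAMRecords stands in for the SAMRecordPair/readBuffer types above, and the class and method names here are illustrative only.

import htsjdk.samtools.SAMRecord;
import java.util.HashMap;
import java.util.Map;

/** Minimal illustration of pairing mates by query name while streaming records. */
final class PairByNameSketch {
  private final Map<String, SAMRecord> pending = new HashMap<>();

  /** Returns the previously buffered mate of {@code record}, or buffers it and returns null. */
  SAMRecord offer(final SAMRecord record) {
    final SAMRecord mate = pending.remove(record.getReadName());
    if (mate != null) {
      return mate; // both ends seen: the entry is already removed, so the buffer cannot leak
    }
    pending.put(record.getReadName(), record); // first end seen: wait for its mate
    return null;
  }
}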
// we need to pad ref by at least the bandwidth / 2 on either side
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) {
  // todo -- need to handle the case where the cigar sum of lengths doesn't cover the whole read
  Pair<Integer, Integer> queryRange = calculateQueryRange(read);
  if (queryRange == null) return null; // read has Ns, or is completely clipped away

  int queryStart = queryRange.getFirst();
  int queryEnd = queryRange.getSecond();

  BAQCalculationResult baqResult =
      calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities(), queryStart, queryEnd);

  // cap quals
  int readI = 0, refI = 0;
  for (CigarElement elt : read.getCigar().getCigarElements()) {
    int l = elt.getLength();
    switch (elt.getOperator()) {
      case N: // cannot handle these
        return null;
      case H:
      case P: // ignore pads and hard clips
        break;
      case S:
        refI += l; // move the reference too, in addition to I
      case I: // todo -- is it really the case that we want to treat I and S the same?
        for (int i = readI; i < readI + l; i++) baqResult.bq[i] = baqResult.rawQuals[i];
        readI += l;
        break;
      case D:
        refI += l;
        break;
      case M:
        for (int i = readI; i < readI + l; i++) {
          int expectedPos = refI - refOffset + (i - readI);
          baqResult.bq[i] =
              capBaseByBAQ(baqResult.rawQuals[i], baqResult.bq[i], baqResult.state[i], expectedPos);
        }
        readI += l;
        refI += l;
        break;
      default:
        throw new ReviewedGATKException(
            "BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
    }
  }
  if (readI != read.getReadLength()) // odd cigar string
    System.arraycopy(baqResult.rawQuals, 0, baqResult.bq, 0, baqResult.bq.length);

  return baqResult;
}
/**
 * Modifies read in place so that the base quality scores are capped by the BAQ calculation. Uses
 * the BAQ tag if present already and alwaysRecalculate is false, otherwise fires up the HMM and
 * does the BAQ on the fly using the refReader to obtain the reference bases as needed.
 *
 * @param read the read to BAQ; modified in place depending on qmode
 * @param refReader source of reference bases for the on-the-fly HMM calculation
 * @param calculationType whether to skip BAQ, reuse an existing tag, or always recalculate
 * @param qmode how the resulting qualities are applied to the read
 * @return BQ qualities for use, in case qmode is DONT_MODIFY
 */
public byte[] baqRead(
    SAMRecord read,
    IndexedFastaSequenceFile refReader,
    CalculationMode calculationType,
    QualityMode qmode) {
  if (DEBUG) System.out.printf("BAQ %s read %s%n", calculationType, read.getReadName());

  // in general we are overwriting quals, so just get a pointer to them
  byte[] BAQQuals = read.getBaseQualities();

  if (calculationType == CalculationMode.OFF) {
    // we don't want to do anything; just fall through
  } else if (excludeReadFromBAQ(read)) {
    // just fall through
  } else {
    final boolean readHasBAQTag = hasBAQTag(read);
    if (calculationType == CalculationMode.RECALCULATE || !readHasBAQTag) {
      if (DEBUG) System.out.printf(" Calculating BAQ on the fly%n");
      BAQCalculationResult hmmResult = calcBAQFromHMM(read, refReader);
      if (hmmResult != null) {
        switch (qmode) {
          case ADD_TAG:
            addBAQTag(read, hmmResult.bq);
            break;
          case OVERWRITE_QUALS:
            System.arraycopy(hmmResult.bq, 0, read.getBaseQualities(), 0, hmmResult.bq.length);
            break;
          case DONT_MODIFY:
            BAQQuals = hmmResult.bq;
            break;
          default:
            throw new ReviewedGATKException("BUG: unexpected qmode " + qmode);
        }
      } else if (readHasBAQTag) {
        // remove the BAQ tag if it's there because we cannot trust it
        read.setAttribute(BAQ_TAG, null);
      }
    } else if (qmode == QualityMode.OVERWRITE_QUALS) { // only makes sense if we are overwriting quals
      if (DEBUG) System.out.printf(" Taking BAQ from tag%n");
      // this overwrites the original qualities
      calcBAQFromTag(read, true, false);
    }
  }

  return BAQQuals;
}
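A hedged usage sketch of the method above, showing the two most common mode combinations. It assumes an existing BAQ engine instance named baq, an open IndexedFastaSequenceFile named ref, and a SAMRecord named read; only enum constants that appear in the method above are used.

// Usage sketch (illustrative, not from the original source).
// Recalculate BAQ and overwrite the read's qualities in place:
baq.baqRead(read, ref, CalculationMode.RECALCULATE, QualityMode.OVERWRITE_QUALS);
// Recalculate BAQ but leave the read untouched, keeping the capped qualities separately:
byte[] capped = baq.baqRead(read, ref, CalculationMode.RECALCULATE, QualityMode.DONT_MODIFY);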
/**
 * Determine the appropriate start and stop offsets in the reads for the bases given the cigar
 * string.
 *
 * @param read the read whose CIGAR is walked
 * @return the start and stop offsets of the read bases to use, or null if the read cannot be
 *     handled (contains an N operator) or is completely clipped away
 */
private final Pair<Integer, Integer> calculateQueryRange(SAMRecord read) {
  int queryStart = -1, queryStop = -1;
  int readI = 0;

  // iterate over the cigar elements to determine the start and stop of the read bases for the
  // BAQ calculation
  for (CigarElement elt : read.getCigar().getCigarElements()) {
    switch (elt.getOperator()) {
      case N:
        return null; // cannot handle these
      case H:
      case P:
      case D:
        break; // ignore pads, hard clips, and deletions
      case I:
      case S:
      case M:
      case EQ:
      case X:
        int prev = readI;
        readI += elt.getLength();
        if (includeClippedBases || elt.getOperator() != CigarOperator.S) {
          if (queryStart == -1) queryStart = prev;
          queryStop = readI;
        }
        // in the else case we aren't including soft clipped bases, so we don't update
        // queryStart or queryStop
        break;
      default:
        throw new ReviewedGATKException(
            "BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
    }
  }

  if (queryStop == queryStart) {
    // this read is completely clipped away, and yet is present in the file for some reason
    // usually they are flagged as non-PF, but it's possible to push them through the BAM
    // System.err.printf("WARNING -- read is completely clipped away: " + read.format());
    return null;
  }

  return new Pair<Integer, Integer>(queryStart, queryStop);
}
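A worked trace of the range computation above, using a hypothetical read rather than one from the original tests.

// Worked example (hypothetical): CIGAR "5S30M5S" on a 40-base read, includeClippedBases == false.
//   5S : readI 0 -> 5,  soft clip excluded, queryStart stays -1
//   30M: readI 5 -> 35, queryStart = 5, queryStop = 35
//   5S : readI 35 -> 40, soft clip excluded, queryStop stays 35
// Result: Pair(5, 35). With includeClippedBases == true the result would be Pair(0, 40).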
/**
 * Returns the BAQ adjusted quality score for this read at this offset. Does not support
 * on-the-fly BAQ calculation.
 *
 * @param read the SAMRecord to operate on
 * @param offset the offset to operate on
 * @param useRawQualsIfNoBAQTag if true and there's no BAQ annotation, we just use the raw quality
 *     score; throws IllegalStateException if false and no BAQ tag is present
 * @return the BAQ-adjusted quality at this offset
 */
public static byte calcBAQFromTag(SAMRecord read, int offset, boolean useRawQualsIfNoBAQTag) {
  byte rawQual = read.getBaseQualities()[offset];
  byte newQual = rawQual;
  byte[] baq = getBAQTag(read);

  if (baq != null) {
    // Offset to base alignment quality (BAQ), of the same length as the read sequence.
    // At the i-th read base, BAQi = Qi - (BQi - 64) where Qi is the i-th base quality.
    int baq_delta = (int) baq[offset] - 64;
    int newval = rawQual - baq_delta;
    if (newval < 0)
      throw new UserException.MalformedBAM(
          read, "BAQ tag error: the BAQ value is larger than the base quality");
    newQual = (byte) newval;
  } else if (!useRawQualsIfNoBAQTag) {
    throw new IllegalStateException(
        "Required BAQ tag to be present, but none was on read " + read.getReadName());
  }

  return newQual;
}
public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) { long numStartClippedBases = 0; long numEndClippedBases = 0; byte[] unclippedReadQuals = read.getBaseQualities(); byte[] unclippedReadBases = read.getReadBases(); // Do a stricter base clipping than provided by CIGAR string, since this one may be too // conservative, // and may leave a string of Q2 bases still hanging off the reads. for (int i = 0; i < read.getReadLength(); i++) { if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) numStartClippedBases++; else break; } for (int i = read.getReadLength() - 1; i >= 0; i--) { if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) numEndClippedBases++; else break; } // System.out.format("numstart: %d numend: %d\n", numStartClippedBases, numEndClippedBases); if (numStartClippedBases + numEndClippedBases >= read.getReadLength()) { return 0; /// Double.POSITIVE_INFINITY; } byte[] readBases = Arrays.copyOfRange( unclippedReadBases, (int) numStartClippedBases, (int) (read.getReadBases().length - numEndClippedBases)); byte[] readQuals = Arrays.copyOfRange( unclippedReadQuals, (int) numStartClippedBases, (int) (read.getReadBases().length - numEndClippedBases)); int readLength = readBases.length; // initialize path metric and traceback memories for Viterbi computation pathMetricArray = new double[readLength + 1][PATH_METRIC_TABLE_LENGTH]; bestStateIndexArray = new int[readLength + 1][PATH_METRIC_TABLE_LENGTH]; for (int k = 1; k < PATH_METRIC_TABLE_LENGTH; k++) pathMetricArray[0][k] = 0; /* if (doSimpleCalculationModel) { // No Viterbi algorithm - assume no sequencing indel artifacts, // so we can collapse computations and pr(read | haplotype) is just probability of observing overlap // of read with haplotype. int haplotypeIndex = initialIndexInHaplotype; double c = 0.0;//deletionErrorProbabilities[1] +logOneMinusInsertionStartProbability; // compute likelihood of portion of base to the left of the haplotype for (int indR=readStartIdx-1; indR >= 0; indR--) { byte readBase = readBases[indR]; byte readQual = readQuals[indR]; if (readQual <= 2) continue; double pBaseRead = getProbabilityOfReadBaseGivenXandI((byte)0, readBase, readQual, LEFT_ALIGN_INDEX, 0); // pBaseRead has -10*log10(Prob(base[i]|haplotype[i]) pRead += pBaseRead; } //System.out.format("\nSt: %d Pre-Likelihood:%f\n",readStartIdx, pRead); for (int indR=readStartIdx; indR < readBases.length; indR++) { byte readBase = readBases[indR]; byte readQual = readQuals[indR]; byte haplotypeBase; if (haplotypeIndex < RIGHT_ALIGN_INDEX) haplotypeBase = haplotype.getBases()[haplotypeIndex]; else haplotypeBase = (byte)0; // dummy double pBaseRead = getProbabilityOfReadBaseGivenXandI(haplotypeBase, readBase, readQual, haplotypeIndex, 0); if (haplotypeBase != 0) pBaseRead += c; // pBaseRead has -10*log10(Prob(base[i]|haplotype[i]) if (readQual > 3) pRead += pBaseRead; haplotypeIndex++; if (haplotypeIndex >= haplotype.getBases().length) haplotypeIndex = RIGHT_ALIGN_INDEX; //System.out.format("H:%c R:%c RQ:%d HI:%d %4.5f %4.5f\n", haplotypeBase, readBase, (int)readQual, haplotypeIndex, pBaseRead, pRead); } //System.out.format("\nSt: %d Post-Likelihood:%f\n",readStartIdx, pRead); if (DEBUG) { System.out.println(read.getReadName()); System.out.print("Haplotype:"); for (int k=0; k <haplotype.getBases().length; k++) { System.out.format("%c ", haplotype.getBases()[k]); } System.out.println(); System.out.print("Read bases: "); for (int k=0; k <readBases.length; k++) { System.out.format("%c ", readBases[k]); } 
System.out.format("\nLikelihood:%f\n",pRead); } if (read.getReadName().contains("106880")) { System.out.println("aca"); System.out.println("Haplotype:"); for (int k=initialIndexInHaplotype; k <haplotype.getBases().length; k++) { System.out.format("%c ", haplotype.getBases()[k]); } System.out.println(); System.out.println("Read bases: "); for (int k=readStartIdx; k <readBases.length; k++) { System.out.format("%c ", readBases[k]); } } return pRead; } */ // Update path metric computations based on branch metric (Add/Compare/Select operations) // do forward direction first, ie from anchor to end of read // outer loop for (int indR = 0; indR < readLength; indR++) { byte readBase = readBases[indR]; byte readQual = readQuals[indR]; for (int indX = LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) { byte haplotypeBase; if (indX > LEFT_ALIGN_INDEX && indX < RIGHT_ALIGN_INDEX) haplotypeBase = haplotype.getBases()[indX - 1]; else haplotypeBase = readBase; updatePathMetrics(haplotypeBase, indX, indR, readBase, readQual); } } // for debugging only: compute backtracking to find optimal route through trellis. Since I'm // only interested // in log-likelihood of best state, this isn't really necessary. double bestMetric = MathUtils.arrayMin(pathMetricArray[readLength]); if (DEBUG) { System.out.println(read.getReadName()); System.out.print("Haplotype:"); for (int k = 0; k < haplotype.getBases().length; k++) { System.out.format("%c ", haplotype.getBases()[k]); } System.out.println(); System.out.print("Read bases: "); for (int k = 0; k < readBases.length; k++) { System.out.format("%c ", readBases[k]); } System.out.println(); System.out.print("Read quals: "); for (int k = 0; k < readQuals.length; k++) { System.out.format("%d ", (int) readQuals[k]); } System.out.println(); // start from last position of read, go backwards to find optimal alignment int[] bestIndexArray = new int[readLength]; int bestIndex = MathUtils.minElementIndex(pathMetricArray[readLength]); bestIndexArray[readLength - 1] = bestIndex; for (int k = readLength - 2; k >= 0; k--) { bestIndex = bestStateIndexArray[k][bestIndex]; bestIndexArray[k] = bestIndex; } System.out.print("Alignment: "); for (int k = 0; k < readBases.length; k++) { System.out.format("%d ", bestIndexArray[k]); } System.out.println(); } // now just take optimum along all path metrics: that's the log likelihood of best alignment if (DEBUG) System.out.format("Likelihood: %5.4f\n", bestMetric); return bestMetric; }
/**
 * Test that PG header records are created & chained appropriately (or not created), and that the
 * PG record chains are as expected. MarkDuplicates is used both to merge and to mark dupes in
 * this case.
 *
 * @param suppressPg If true, do not create PG header record.
 * @param expectedPnVnByReadName For each read, info about the expected chain of PG records.
 */
@Test(dataProvider = "pgRecordChainingTest")
public void pgRecordChainingTest(
    final boolean suppressPg, final Map<String, List<ExpectedPnAndVn>> expectedPnVnByReadName) {
  final File outputDir = IOUtil.createTempDir(TEST_BASE_NAME + ".", ".tmp");
  outputDir.deleteOnExit();
  try {
    // Run MarkDuplicates, merging the 3 input files, and either enabling or suppressing PG
    // header record creation according to suppressPg.
    final MarkDuplicates markDuplicates = new MarkDuplicates();
    final ArrayList<String> args = new ArrayList<String>();
    for (int i = 1; i <= 3; ++i) {
      args.add("INPUT=" + new File(TEST_DATA_DIR, "merge" + i + ".sam").getAbsolutePath());
    }
    final File outputSam = new File(outputDir, TEST_BASE_NAME + ".sam");
    args.add("OUTPUT=" + outputSam.getAbsolutePath());
    args.add(
        "METRICS_FILE="
            + new File(outputDir, TEST_BASE_NAME + ".duplicate_metrics").getAbsolutePath());
    if (suppressPg) args.add("PROGRAM_RECORD_ID=null");

    // I generally prefer to call doWork rather than invoking the argument parser, but it is
    // necessary in this case to initialize the command line.
    // Note that for the unit test, version won't come through because it is obtained through the
    // jar manifest, and the unit test doesn't run code from a jar.
    Assert.assertEquals(markDuplicates.instanceMain(args.toArray(new String[args.size()])), 0);

    // Read the MarkDuplicates output file, and get the PG ID for each read. In this particular
    // test, the PG ID should be the same for both ends of a pair.
    final SamReader reader = SamReaderFactory.makeDefault().open(outputSam);
    final Map<String, String> pgIdForReadName = new HashMap<String, String>();
    for (final SAMRecord rec : reader) {
      final String existingPgId = pgIdForReadName.get(rec.getReadName());
      final String thisPgId = rec.getStringAttribute(SAMTag.PG.name());
      if (existingPgId != null) {
        Assert.assertEquals(thisPgId, existingPgId);
      } else {
        pgIdForReadName.put(rec.getReadName(), thisPgId);
      }
    }
    final SAMFileHeader header = reader.getFileHeader();
    CloserUtil.close(reader);

    // Confirm that for each read name, the chain of PG records contains exactly the number that
    // is expected, and that values in the PG chain are as expected.
    for (final Map.Entry<String, List<ExpectedPnAndVn>> entry :
        expectedPnVnByReadName.entrySet()) {
      final String readName = entry.getKey();
      final List<ExpectedPnAndVn> expectedList = entry.getValue();
      String pgId = pgIdForReadName.get(readName);
      for (final ExpectedPnAndVn expected : expectedList) {
        final SAMProgramRecord programRecord = header.getProgramRecord(pgId);
        if (expected.expectedPn != null)
          Assert.assertEquals(programRecord.getProgramName(), expected.expectedPn);
        if (expected.expectedVn != null)
          Assert.assertEquals(programRecord.getProgramVersion(), expected.expectedVn);
        pgId = programRecord.getPreviousProgramGroupId();
      }
      Assert.assertNull(pgId);
    }
  } finally {
    TestUtil.recursiveDelete(outputDir);
  }
}
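For reference, the chain this test walks is encoded in the SAM header as @PG records linked through the PP (previous program) field, with each read's PG attribute pointing at the newest link. The IDs and versions below are made-up values, not taken from the test data, and the fields are tab-separated in a real header:

@PG  ID:MarkDuplicates.2  PN:MarkDuplicates  VN:1.0  PP:MarkDuplicates.1
@PG  ID:MarkDuplicates.1  PN:MarkDuplicates  VN:1.0  PP:bwa
@PG  ID:bwa               PN:bwa             VN:0.7.10

Following PP from the read's PG ID reproduces the chain the loop above verifies, ending where PP is absent (getPreviousProgramGroupId() returns null).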
@Test public void testWithIndividualReadBarcodes() { final AbstractMarkDuplicatesCommandLineProgramTester tester = getTester(); final String readNameOne = "RUNID:1:1:15993:13361"; final String readNameTwo = "RUNID:2:2:15993:13362"; final String readNameThree = "RUNID:3:3:15993:13362"; // first two reads have the same barcode (all three), third read has a different barcode for the // second end tester.addMatePair( readNameOne, 2, 41212324, 41212310, false, false, false, false, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY); tester.addMatePair( readNameTwo, 2, 41212324, 41212310, false, false, true, true, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY); // same barcode as the first tester.addMatePair( readNameThree, 2, 41212324, 41212310, false, false, false, false, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY); final String barcodeTag = "BC"; final String readOneBarcodeTag = "BX"; // want the same tag as the second end, since this is allowed final String readTwoBarcodeTag = "BX"; for (final SAMRecord record : new IterableAdapter<SAMRecord>(tester.getRecordIterator())) { record.setAttribute(barcodeTag, "Barcode1"); // same barcode if (record.getFirstOfPairFlag()) { // always the same value for the first end record.setAttribute(readOneBarcodeTag, "readOne1"); } else { // second end if (record.getReadName().equals(readNameOne) || record.getReadName().equals(readNameTwo)) { record.setAttribute(readTwoBarcodeTag, "readTwo1"); } else if (record.getReadName().equals(readNameThree)) { record.setAttribute(readTwoBarcodeTag, "readTwo2"); } } } tester.addArg("BARCODE_TAG=" + barcodeTag); tester.addArg("READ_ONE_BARCODE_TAG=" + readOneBarcodeTag); tester.addArg("READ_TWO_BARCODE_TAG=" + readTwoBarcodeTag); tester.runTest(); }
@Test public void testWithBarcodeComplex() { final AbstractMarkDuplicatesCommandLineProgramTester tester = getTester(); final String readNameOne = "RUNID:1:1:15993:13361"; final String readNameTwo = "RUNID:2:2:15993:13362"; final String readNameThree = "RUNID:3:3:15993:13362"; // first two reads have the same barcode, third read has a different barcode tester.addMatePair( readNameOne, 2, 41212324, 41212310, false, false, false, false, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY); tester.addMatePair( readNameTwo, 2, 41212324, 41212310, false, false, true, true, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY); // same barcode as the first tester.addMatePair( readNameThree, 2, 41212324, 41212310, false, false, false, false, "33S35M", "19S49M", true, true, false, false, false, DEFAULT_BASE_QUALITY); final String barcodeTag = "BC"; for (final SAMRecord record : new IterableAdapter<SAMRecord>(tester.getRecordIterator())) { if (record.getReadName().equals(readNameOne) || record.getReadName().equals(readNameTwo)) { record.setAttribute(barcodeTag, "Barcode1"); } else if (record.getReadName().equals(readNameThree)) { record.setAttribute(barcodeTag, "Barcode2"); } } tester.addArg("BARCODE_TAG=" + barcodeTag); tester.runTest(); }
@Override public void execute() { log.info("Initializing kmer code map..."); Map<Character, Integer> kmerCodeIndices = new HashMap<Character, Integer>(); kmerCodeIndices.put('0', 1); kmerCodeIndices.put('A', 3); kmerCodeIndices.put('B', 4); kmerCodeIndices.put('C', 5); kmerCodeIndices.put('_', 6); kmerCodeIndices.put('.', 7); kmerCodeIndices.put('1', 9); Map<Character, String> kmerCodeNames = new LinkedHashMap<Character, String>(); kmerCodeNames.put('0', "ref0"); kmerCodeNames.put('A', "repetitive"); kmerCodeNames.put('B', "both"); kmerCodeNames.put('C', "lowcoverage"); kmerCodeNames.put('_', "lowconfidence"); kmerCodeNames.put('.', "novel"); kmerCodeNames.put('1', "ref1"); if (KMER_CODE_NAMES != null) { for (Character c : kmerCodeNames.keySet()) { String cStr = String.valueOf(c); if (KMER_CODE_NAMES.containsKey(cStr)) { kmerCodeNames.put(c, KMER_CODE_NAMES.get(cStr)); } } } for (Character c : kmerCodeNames.keySet()) { log.info(" {} {}: {}", c, kmerCodeIndices.get(c), kmerCodeNames.get(c)); } log.info("Loading annotated contigs..."); Map<String, Map<String, String>> annotatedContigs = new HashMap<String, Map<String, String>>(); int kmerSize = 0; if (ANN.length() > 0) { TableReader tr = new TableReader(ANN); for (Map<String, String> te : tr) { String contigName = te.get("contigName"); if (kmerSize == 0) { kmerSize = te.get("seq").length() - te.get("kmerOrigin").length() + 1; } annotatedContigs.put(contigName, te); String[] ref0ToCanonicalExact = (te.get("ref0ToCanonicalExact").equals("NA") || te.get("ref0ToCanonicalExact").equals("*:0-0") ? "NA:0-0" : te.get("ref0ToCanonicalExact")) .split("[:-]"); String[] ref1ToCanonicalExact = (te.get("ref1ToCanonicalExact").equals("NA") || te.get("ref1ToCanonicalExact").equals("*:0-0") ? "NA:0-0" : te.get("ref1ToCanonicalExact")) .split("[:-]"); cout.println( te.get("sampleName") + "_" + te.get("accession") + "_" + contigName + " " + ref0ToCanonicalExact[0] + " " + ref0ToCanonicalExact[1] + " " + ref0ToCanonicalExact[2] + " radius1=0.8r"); cout.println( te.get("sampleName") + "_" + te.get("accession") + "_" + contigName + " " + ref1ToCanonicalExact[0] + " " + ref1ToCanonicalExact[1] + " " + ref1ToCanonicalExact[2] + " radius2=0.6r"); } } log.info(" contigs: {}", annotatedContigs.size()); log.info(" kmer size: {}", kmerSize); log.info("Computing kmer inheritance information..."); SAMFileHeader sfh = CONTIGS.getFileHeader(); for (Character c : kmerCodeNames.keySet()) { SAMReadGroupRecord rgr = new SAMReadGroupRecord(kmerCodeNames.get(c)); rgr.setSample(kmerCodeNames.get(c)); sfh.addReadGroup(rgr); } SAMFileWriterFactory sfwf = new SAMFileWriterFactory(); sfwf.setCreateIndex(true); SAMFileWriter sfw = sfwf.makeBAMWriter(sfh, false, bout); TableWriter tw = new TableWriter(sout); Set<IGVEntry> igvEntries = new TreeSet<IGVEntry>(); int numContigs = 0; for (SAMRecord contig : CONTIGS) { if (CONTIG_NAMES == null || CONTIG_NAMES.isEmpty() || CONTIG_NAMES.contains(contig.getReadName())) { Map<String, String> te = annotatedContigs.get(contig.getReadName()); if (annotatedContigs.containsKey(contig.getReadName())) { String seq = contig.getReadString(); // log.debug(" te: {}", te); String annSeq = te.get("seq"); String kmerOrigin = te.get("kmerOrigin"); Map<CortexKmer, Character> kmerCodes = new HashMap<CortexKmer, Character>(); for (int i = 0; i < kmerOrigin.length(); i++) { CortexKmer kmer = new CortexKmer(annSeq.substring(i, i + kmerSize)); Character code = kmerOrigin.charAt(i); kmerCodes.put(kmer, code); } Map<Character, Integer> kmerStats = new 
HashMap<Character, Integer>(); for (Character c : kmerCodeNames.keySet()) { kmerStats.put(c, 0); } boolean changed = false; // We want to be able to examine soft-clipped regions as well. List<CigarElement> ces = new ArrayList<CigarElement>(); for (CigarElement ce : contig.getCigar().getCigarElements()) { if (ce.getOperator().equals(CigarOperator.S)) { ces.add(new CigarElement(ce.getLength(), CigarOperator.M)); changed = true; } else { ces.add(ce); } } if (changed) { CigarElement firstCe = contig.getCigar().getCigarElements().get(0); if (firstCe.getOperator().equals(CigarOperator.S)) { contig.setAlignmentStart(contig.getAlignmentStart() - firstCe.getLength()); } contig.setCigar(new Cigar(ces)); } for (AlignmentBlock ab : contig.getAlignmentBlocks()) { for (int i = ab.getReadStart() - 1; i < ab.getReadStart() + ab.getLength(); i++) { if (i + kmerSize < seq.length()) { CortexKmer kmer = new CortexKmer(seq.substring(i, i + kmerSize)); SAMRecord skmer = new SAMRecord(CONTIGS.getFileHeader()); skmer.setReadBases(seq.substring(i, i + kmerSize).getBytes()); List<CigarElement> cigarElements = new ArrayList<CigarElement>(); cigarElements.add(new CigarElement(kmerSize, CigarOperator.M)); Cigar cigar = new Cigar(cigarElements); skmer.setReadName(contig.getReadName() + "." + kmer.getKmerAsString()); skmer.setReferenceName(contig.getReferenceName()); skmer.setCigar(cigar); skmer.setReadPairedFlag(false); skmer.setDuplicateReadFlag(false); skmer.setMateNegativeStrandFlag(false); skmer.setAlignmentStart(ab.getReferenceStart() - ab.getReadStart() + 1 + i); skmer.setAttribute("RG", "none"); skmer.setMappingQuality(0); Character c = kmerCodes.get(kmer); String codeName = kmerCodeNames.get(c); String parentReadGroupId = null; String sampleReadGroupId = null; for (SAMReadGroupRecord rgr : sfh.getReadGroups()) { if (rgr.getSample().equals(codeName)) { parentReadGroupId = rgr.getReadGroupId(); } if (rgr.getSample().equals(contig.getReadGroup().getSample())) { sampleReadGroupId = rgr.getReadGroupId(); } } skmer.setAttribute( "RG", parentReadGroupId != null ? parentReadGroupId : sampleReadGroupId); skmer.setMappingQuality(99); sfw.addAlignment(skmer); kmerStats.put(c, kmerStats.get(c) + 1); IGVEntry igvEntry = new IGVEntry(); igvEntry.chromosome = contig.getReferenceName(); igvEntry.start = ab.getReferenceStart() - ab.getReadStart() + i; igvEntry.parentageName = kmerCodeNames.get(c); igvEntry.parentage = kmerCodeIndices.get(c); igvEntries.add(igvEntry); } } } if (!contig.isSecondaryOrSupplementary()) { beout.println( contig.getReferenceName() + "\t" + contig.getAlignmentStart() + "\t" + contig.getAlignmentEnd() + "\t" + contig.getReadName() + "." + contig.getReadGroup().getSample()); if (annotatedContigs.size() > 10 && numContigs % (annotatedContigs.size() / 10) == 0) { log.info(" processed {}/{} contigs", numContigs, annotatedContigs.size()); } numContigs++; } Map<String, String> stats = new LinkedHashMap<String, String>(); stats.put("contigName", contig.getReadName()); stats.put("sampleName", contig.getReadGroup().getSample()); for (Character c : kmerCodeNames.keySet()) { stats.put(kmerCodeNames.get(c), String.valueOf(kmerStats.get(c))); } tw.addEntry(stats); } } } log.info("Writing kmer inheritance information..."); out.printf("%s\t%s\t%s\t%s\t%s\n", "Chromosome", "Start", "End", "Feature", "Parentage"); for (IGVEntry igvEntry : igvEntries) { out.printf( "%s\t%d\t%d\t%s\t%d\n", igvEntry.chromosome, igvEntry.start, igvEntry.start + 1, igvEntry.parentageName, igvEntry.parentage); } sfw.close(); }