private void collectReadData(final SAMRecord record, final ReferenceSequence ref) { metrics.TOTAL_READS++; readLengthHistogram.increment(record.getReadBases().length); if (!record.getReadFailsVendorQualityCheckFlag()) { metrics.PF_READS++; if (isNoiseRead(record)) metrics.PF_NOISE_READS++; if (record.getReadUnmappedFlag()) { // If the read is unmapped see if it's adapter sequence final byte[] readBases = record.getReadBases(); if (!(record instanceof BAMRecord)) StringUtil.toUpperCase(readBases); if (isAdapterSequence(readBases)) { this.adapterReads++; } } else if (doRefMetrics) { metrics.PF_READS_ALIGNED++; if (!record.getReadNegativeStrandFlag()) numPositiveStrand++; if (record.getReadPairedFlag() && !record.getMateUnmappedFlag()) { metrics.READS_ALIGNED_IN_PAIRS++; // Check that both ends have mapq > minimum final Integer mateMq = record.getIntegerAttribute("MQ"); if (mateMq == null || mateMq >= MAPPING_QUALITY_THRESOLD && record.getMappingQuality() >= MAPPING_QUALITY_THRESOLD) { ++this.chimerasDenominator; // With both reads mapped we can see if this pair is chimeric if (Math.abs(record.getInferredInsertSize()) > maxInsertSize || !record.getReferenceIndex().equals(record.getMateReferenceIndex())) { ++this.chimeras; } } } } } }
@Override public int doWork(String[] args) { File refFile = null; com.github.lindenb.jvarkit.util.cli.GetOpt getopt = new com.github.lindenb.jvarkit.util.cli.GetOpt(); int c; while ((c = getopt.getopt(args, "hvL:r:")) != -1) { switch (c) { case 'h': printUsage(); return 0; case 'v': System.out.println(getVersion()); return 0; case 'L': getLogger().setLevel(java.util.logging.Level.parse(getopt.getOptArg())); break; case 'r': refFile = new File(getopt.getOptArg()); break; case ':': System.err.println("Missing argument for option -" + getopt.getOptOpt()); return -1; default: System.err.println("Unknown option -" + getopt.getOptOpt()); return -1; } } if (refFile == null) { error("Undefined REF file"); return -1; } File bamFile = null; if (getopt.getOptInd() + 1 != args.length) { info("reading from stdin."); } else { bamFile = new File(args[getopt.getOptInd()]); } IndexedFastaSequenceFile indexedFastaSequenceFile = null; SAMFileReader samFileReader = null; try { GenomicSequence genomicSequence = null; indexedFastaSequenceFile = new IndexedFastaSequenceFile(refFile); SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT); samFileReader = null; if (bamFile == null) { samFileReader = new SAMFileReader(System.in); } else { samFileReader = new SAMFileReader(bamFile); } XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance(); XMLStreamWriter w = xmlfactory.createXMLStreamWriter(System.out, "UTF-8"); w.writeStartDocument("UTF-8", "1.0"); w.writeStartElement("sam"); w.writeComment(getProgramCommandLine()); w.writeAttribute("ref", (bamFile == null ? 
"stdin" : bamFile.getPath())); w.writeAttribute("bam", args[1]); SAMRecordIterator iter = samFileReader.iterator(); while (iter.hasNext()) { SAMRecord rec = iter.next(); final byte readbases[] = rec.getReadBases(); w.writeStartElement("read"); w.writeStartElement("name"); w.writeCharacters(rec.getReadName()); w.writeEndElement(); w.writeStartElement("sequence"); w.writeCharacters(new String(readbases)); w.writeEndElement(); w.writeStartElement("flags"); w.writeAttribute("paired", String.valueOf(rec.getReadPairedFlag())); w.writeAttribute( "failsVendorQual", String.valueOf(rec.getReadFailsVendorQualityCheckFlag())); w.writeAttribute("mapped", String.valueOf(!rec.getReadUnmappedFlag())); w.writeAttribute("strand", (rec.getReadNegativeStrandFlag() ? "-" : "+")); if (rec.getReadPairedFlag()) { w.writeAttribute("mate-mapped", String.valueOf(!rec.getMateUnmappedFlag())); w.writeAttribute("mate-strand", (rec.getMateNegativeStrandFlag() ? "-" : "+")); w.writeAttribute("proper-pair", String.valueOf(rec.getProperPairFlag())); } w.writeCharacters(String.valueOf(rec.getFlags())); w.writeEndElement(); if (!rec.getReadUnmappedFlag()) { w.writeStartElement("qual"); w.writeCharacters(String.valueOf(rec.getMappingQuality())); w.writeEndElement(); w.writeStartElement("chrom"); w.writeAttribute("index", String.valueOf(rec.getReferenceIndex())); w.writeCharacters(rec.getReferenceName()); w.writeEndElement(); w.writeStartElement("pos"); w.writeCharacters(String.valueOf(rec.getAlignmentStart())); w.writeEndElement(); w.writeStartElement("cigar"); w.writeCharacters(rec.getCigarString()); w.writeEndElement(); } if (!rec.getMateUnmappedFlag()) { w.writeStartElement("mate-chrom"); w.writeAttribute("index", String.valueOf(rec.getMateReferenceIndex())); w.writeCharacters(rec.getMateReferenceName()); w.writeEndElement(); w.writeStartElement("mate-pos"); w.writeCharacters(String.valueOf(rec.getMateAlignmentStart())); w.writeEndElement(); } if (!rec.getReadUnmappedFlag()) { if (genomicSequence 
== null || genomicSequence.getChrom().equals(rec.getReferenceName())) { genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName()); } w.writeStartElement("align"); int readIndex = 0; int refIndex = rec.getAlignmentStart(); for (final CigarElement e : rec.getCigar().getCigarElements()) { switch (e.getOperator()) { case H: break; // ignore hard clips case P: break; // ignore pads case I: // cont. case S: { final int length = e.getLength(); for (int i = 0; i < length; ++i) { w.writeEmptyElement(e.getOperator().name()); w.writeAttribute("read-index", String.valueOf(readIndex + 1)); if (readIndex >= 0 && readIndex < readbases.length) { w.writeAttribute("read-base", String.valueOf((char) (readbases[readIndex]))); } readIndex++; } break; } case N: // cont. -- reference skip case D: { final int length = e.getLength(); for (int i = 0; i < length; ++i) { w.writeEmptyElement(e.getOperator().name()); w.writeAttribute("ref-index", String.valueOf(refIndex)); if (refIndex >= 1 && refIndex <= genomicSequence.length()) { w.writeAttribute( "ref-base", String.valueOf(genomicSequence.charAt(refIndex - 1))); } refIndex++; } break; } case M: case EQ: case X: { final int length = e.getLength(); for (int i = 0; i < length; ++i) { w.writeEmptyElement(e.getOperator().name()); char baseRead = '\0'; if (readIndex >= 0 && readIndex < readbases.length) { baseRead = (char) (rec.getReadBases()[readIndex]); w.writeAttribute("read-index", String.valueOf(readIndex + 1)); w.writeAttribute("read-base", String.valueOf(baseRead)); } w.writeAttribute("ref-index", String.valueOf(refIndex)); if (refIndex >= 1 && refIndex <= genomicSequence.length()) { char baseRef = genomicSequence.charAt(refIndex - 1); w.writeAttribute("ref-base", String.valueOf(baseRef)); if (Character.toUpperCase(baseRef) != Character.toUpperCase(baseRead)) { w.writeAttribute("mismatch", "true"); } } refIndex++; readIndex++; } break; } default: throw new IllegalStateException( "Case statement didn't deal 
with cigar op: " + e.getOperator()); } } } w.writeEndElement(); w.writeEndElement(); iter.close(); w.writeEndElement(); } w.writeEndElement(); w.writeEndDocument(); w.flush(); w.close(); } catch (Exception err) { error(err); return -1; } finally { CloserUtil.close(samFileReader); CloserUtil.close(indexedFastaSequenceFile); } return 0; }
/**
 * Tells whether a record counts as a high-quality mapping.
 *
 * @param record the read to test
 * @return {@code true} when the record does not fail the vendor quality check
 *     and its mapping quality is at least {@code MAPPING_QUALITY_THRESOLD}
 */
private boolean isHighQualityMapping(final SAMRecord record) {
    if (record.getReadFailsVendorQualityCheckFlag()) {
        return false;
    }
    final boolean mapqHighEnough = record.getMappingQuality() >= MAPPING_QUALITY_THRESOLD;
    return mapqHighEnough;
}
private void collectQualityData(final SAMRecord record, final ReferenceSequence reference) { // If the read isnt an aligned PF read then look at the read for no-calls if (record.getReadUnmappedFlag() || record.getReadFailsVendorQualityCheckFlag() || !doRefMetrics) { final byte[] readBases = record.getReadBases(); for (int i = 0; i < readBases.length; i++) { if (SequenceUtil.isNoCall(readBases[i])) { badCycleHistogram.increment( CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i)); } } } else if (!record.getReadFailsVendorQualityCheckFlag()) { final boolean highQualityMapping = isHighQualityMapping(record); if (highQualityMapping) metrics.PF_HQ_ALIGNED_READS++; final byte[] readBases = record.getReadBases(); final byte[] refBases = reference.getBases(); final byte[] qualities = record.getBaseQualities(); final int refLength = refBases.length; long mismatchCount = 0; long hqMismatchCount = 0; for (final AlignmentBlock alignmentBlock : record.getAlignmentBlocks()) { final int readIndex = alignmentBlock.getReadStart() - 1; final int refIndex = alignmentBlock.getReferenceStart() - 1; final int length = alignmentBlock.getLength(); for (int i = 0; i < length && refIndex + i < refLength; ++i) { final int readBaseIndex = readIndex + i; boolean mismatch = !SequenceUtil.basesEqual(readBases[readBaseIndex], refBases[refIndex + i]); boolean bisulfiteBase = false; if (mismatch && isBisulfiteSequenced) { if ((record.getReadNegativeStrandFlag() && (refBases[refIndex + i] == 'G' || refBases[refIndex + i] == 'g') && (readBases[readBaseIndex] == 'A' || readBases[readBaseIndex] == 'a')) || ((!record.getReadNegativeStrandFlag()) && (refBases[refIndex + i] == 'C' || refBases[refIndex + i] == 'c') && (readBases[readBaseIndex] == 'T') || readBases[readBaseIndex] == 't')) { bisulfiteBase = true; mismatch = false; } } if (mismatch) mismatchCount++; metrics.PF_ALIGNED_BASES++; if (!bisulfiteBase) nonBisulfiteAlignedBases++; if (highQualityMapping) { 
metrics.PF_HQ_ALIGNED_BASES++; if (!bisulfiteBase) hqNonBisulfiteAlignedBases++; if (qualities[readBaseIndex] >= BASE_QUALITY_THRESHOLD) metrics.PF_HQ_ALIGNED_Q20_BASES++; if (mismatch) hqMismatchCount++; } if (mismatch || SequenceUtil.isNoCall(readBases[readBaseIndex])) { badCycleHistogram.increment( CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i)); } } } mismatchHistogram.increment(mismatchCount); hqMismatchHistogram.increment(hqMismatchCount); // Add any insertions and/or deletions to the global count for (final CigarElement elem : record.getCigar().getCigarElements()) { final CigarOperator op = elem.getOperator(); if (op == CigarOperator.INSERTION || op == CigarOperator.DELETION) ++this.indels; } } }