public void find_coverage(SAMResource sres) { int start_base = sres.region.range.start; int end_base = sres.region.range.end; int coverage_len = (end_base - start_base) + 1; int i, end, ref_i, read_i, len; int[] coverage = new int[coverage_len]; Arrays.fill(coverage, 0); WorkingFile wf = null; if (outfile != null) { try { wf = new WorkingFile(outfile); ps = wf.getPrintStream(); } catch (Exception e) { System.err.println("I/O error: " + e); // debug e.printStackTrace(); System.exit(1); } } try { // // gather coverage info: // CloseableIterator<SAMRecord> iterator = sres.get_iterator(); int read_count = 0; int ref_min = -1; int ref_max = -1; while (iterator.hasNext()) { SAMRecord sr = iterator.next(); read_count++; // System.err.println(sr.getReadName() + ": " + sr.getAlignmentStart() + "-" + // sr.getAlignmentEnd()); // debug if (sr.getReadUnmappedFlag()) continue; if (sr.getDuplicateReadFlag()) { if (verbose_mode) System.err.println( sr.getReadName() + "." + (sr.getReadNegativeStrandFlag() ? "R" : "F") + " ignoring, duplicate"); continue; } byte[] read = sr.getReadBases(); byte[] quals = sr.getBaseQualities(); for (AlignmentBlock ab : sr.getAlignmentBlocks()) { len = ab.getLength(); read_i = ab.getReadStart() - 1; ref_i = ab.getReferenceStart() - start_base; if (ref_min == -1 || ref_i < ref_min) ref_min = ref_i; for (i = read_i, end = read_i + len; i < end; i++, ref_i++) { if (ref_i >= 0 && ref_i < coverage_len) { if (quals[i] >= MIN_QUALITY) { if (verbose_mode) System.err.println( sr.getReadName() + "." + (sr.getReadNegativeStrandFlag() ? "R" : "F") + " hit at " + (ref_i + start_base) + " as=" + sr.getAlignmentStart() + " ae=" + sr.getAlignmentEnd()); coverage[ref_i]++; } else if (verbose_mode) { System.err.println( sr.getReadName() + "." + (sr.getReadNegativeStrandFlag() ? 
"R" : "F") + " qual_reject at " + (ref_i + start_base) + " as=" + sr.getAlignmentStart() + " ae=" + sr.getAlignmentEnd()); } } } if (ref_max == -1 || ref_i > ref_max) ref_max = ref_i; } } sres.close(); System.err.println( "records:" + read_count + " ref_min:" + (ref_min + start_base) + " ref_max:" + (ref_max + start_base)); // debug // // report coverage info: // for (i = 0; i < coverage.length; i++) { if (name != null) ps.print(name + ","); ps.println((i + start_base) + "," + coverage[i]); // debug } if (wf != null) wf.finish(); } catch (Exception e) { System.err.println("ERROR: " + e); // debug e.printStackTrace(); } }
private void collectQualityData(final SAMRecord record, final ReferenceSequence reference) { // If the read isnt an aligned PF read then look at the read for no-calls if (record.getReadUnmappedFlag() || record.getReadFailsVendorQualityCheckFlag() || !doRefMetrics) { final byte[] readBases = record.getReadBases(); for (int i = 0; i < readBases.length; i++) { if (SequenceUtil.isNoCall(readBases[i])) { badCycleHistogram.increment( CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i)); } } } else if (!record.getReadFailsVendorQualityCheckFlag()) { final boolean highQualityMapping = isHighQualityMapping(record); if (highQualityMapping) metrics.PF_HQ_ALIGNED_READS++; final byte[] readBases = record.getReadBases(); final byte[] refBases = reference.getBases(); final byte[] qualities = record.getBaseQualities(); final int refLength = refBases.length; long mismatchCount = 0; long hqMismatchCount = 0; for (final AlignmentBlock alignmentBlock : record.getAlignmentBlocks()) { final int readIndex = alignmentBlock.getReadStart() - 1; final int refIndex = alignmentBlock.getReferenceStart() - 1; final int length = alignmentBlock.getLength(); for (int i = 0; i < length && refIndex + i < refLength; ++i) { final int readBaseIndex = readIndex + i; boolean mismatch = !SequenceUtil.basesEqual(readBases[readBaseIndex], refBases[refIndex + i]); boolean bisulfiteBase = false; if (mismatch && isBisulfiteSequenced) { if ((record.getReadNegativeStrandFlag() && (refBases[refIndex + i] == 'G' || refBases[refIndex + i] == 'g') && (readBases[readBaseIndex] == 'A' || readBases[readBaseIndex] == 'a')) || ((!record.getReadNegativeStrandFlag()) && (refBases[refIndex + i] == 'C' || refBases[refIndex + i] == 'c') && (readBases[readBaseIndex] == 'T') || readBases[readBaseIndex] == 't')) { bisulfiteBase = true; mismatch = false; } } if (mismatch) mismatchCount++; metrics.PF_ALIGNED_BASES++; if (!bisulfiteBase) nonBisulfiteAlignedBases++; if (highQualityMapping) { 
metrics.PF_HQ_ALIGNED_BASES++; if (!bisulfiteBase) hqNonBisulfiteAlignedBases++; if (qualities[readBaseIndex] >= BASE_QUALITY_THRESHOLD) metrics.PF_HQ_ALIGNED_Q20_BASES++; if (mismatch) hqMismatchCount++; } if (mismatch || SequenceUtil.isNoCall(readBases[readBaseIndex])) { badCycleHistogram.increment( CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i)); } } } mismatchHistogram.increment(mismatchCount); hqMismatchHistogram.increment(hqMismatchCount); // Add any insertions and/or deletions to the global count for (final CigarElement elem : record.getCigar().getCigarElements()) { final CigarOperator op = elem.getOperator(); if (op == CigarOperator.INSERTION || op == CigarOperator.DELETION) ++this.indels; } } }
/**
 * Paints every kmer of each annotated contig with its inheritance code and writes the result
 * in several formats.
 *
 * <p>Steps, in order:
 *
 * <ol>
 *   <li>Builds the code-to-index and code-to-name maps; names may be overridden through
 *       {@code KMER_CODE_NAMES}.
 *   <li>Loads the contig annotation table from {@code ANN} and prints two lines per row to
 *       {@code cout}, one for each of the {@code ref0ToCanonicalExact} and
 *       {@code ref1ToCanonicalExact} columns.
 *   <li>Adds one read group per code name to the header of {@code CONTIGS}.
 *   <li>For every contig in {@code CONTIGS} that is annotated (and listed in
 *       {@code CONTIG_NAMES} when that filter is non-empty), emits one single-kmer alignment
 *       per kmer start position to the BAM writer on {@code bout}, tallies kmers per code, and
 *       records an {@code IGVEntry} per kmer. Soft-clipped bases are treated as aligned.
 *   <li>Writes one interval line per primary contig to {@code beout}, one stats row per
 *       contig to the table on {@code sout}, and finally the sorted IGV entries to
 *       {@code out}.
 * </ol>
 *
 * <p>Kmers that have no known code are still written to the BAM (tagged with the contig's own
 * read group) but are left out of the per-code statistics and the IGV table.
 */
@Override
public void execute() {
  log.info("Initializing kmer code map...");
  Map<Character, Integer> kmerCodeIndices = new HashMap<Character, Integer>();
  kmerCodeIndices.put('0', 1);
  kmerCodeIndices.put('A', 3);
  kmerCodeIndices.put('B', 4);
  kmerCodeIndices.put('C', 5);
  kmerCodeIndices.put('_', 6);
  kmerCodeIndices.put('.', 7);
  kmerCodeIndices.put('1', 9);

  // Insertion-ordered so that logs, read groups and stats columns come out in a fixed order.
  Map<Character, String> kmerCodeNames = new LinkedHashMap<Character, String>();
  kmerCodeNames.put('0', "ref0");
  kmerCodeNames.put('A', "repetitive");
  kmerCodeNames.put('B', "both");
  kmerCodeNames.put('C', "lowcoverage");
  kmerCodeNames.put('_', "lowconfidence");
  kmerCodeNames.put('.', "novel");
  kmerCodeNames.put('1', "ref1");

  if (KMER_CODE_NAMES != null) {
    for (Character c : kmerCodeNames.keySet()) {
      String cStr = String.valueOf(c);
      if (KMER_CODE_NAMES.containsKey(cStr)) {
        // Replacing the value of an existing key is not a structural modification.
        kmerCodeNames.put(c, KMER_CODE_NAMES.get(cStr));
      }
    }
  }

  for (Character c : kmerCodeNames.keySet()) {
    log.info(" {} {}: {}", c, kmerCodeIndices.get(c), kmerCodeNames.get(c));
  }

  log.info("Loading annotated contigs...");
  Map<String, Map<String, String>> annotatedContigs = new HashMap<String, Map<String, String>>();
  int kmerSize = 0;

  if (ANN.length() > 0) {
    TableReader tr = new TableReader(ANN);
    for (Map<String, String> te : tr) {
      String contigName = te.get("contigName");

      if (kmerSize == 0) {
        // A sequence of length n has n - k + 1 kmers, one per kmerOrigin character.
        kmerSize = te.get("seq").length() - te.get("kmerOrigin").length() + 1;
      }

      annotatedContigs.put(contigName, te);

      String[] ref0ToCanonicalExact = canonicalLocusFields(te.get("ref0ToCanonicalExact"));
      String[] ref1ToCanonicalExact = canonicalLocusFields(te.get("ref1ToCanonicalExact"));
      String label = te.get("sampleName") + "_" + te.get("accession") + "_" + contigName;

      cout.println(
          label
              + " "
              + ref0ToCanonicalExact[0]
              + " "
              + ref0ToCanonicalExact[1]
              + " "
              + ref0ToCanonicalExact[2]
              + " radius1=0.8r");
      cout.println(
          label
              + " "
              + ref1ToCanonicalExact[0]
              + " "
              + ref1ToCanonicalExact[1]
              + " "
              + ref1ToCanonicalExact[2]
              + " radius2=0.6r");
    }
  }

  log.info(" contigs: {}", annotatedContigs.size());
  log.info(" kmer size: {}", kmerSize);

  log.info("Computing kmer inheritance information...");
  SAMFileHeader sfh = CONTIGS.getFileHeader();
  for (Character c : kmerCodeNames.keySet()) {
    SAMReadGroupRecord rgr = new SAMReadGroupRecord(kmerCodeNames.get(c));
    rgr.setSample(kmerCodeNames.get(c));
    sfh.addReadGroup(rgr);
  }

  // The header no longer changes, so resolve sample -> read-group id once instead of
  // re-scanning every read group for every kmer. Later read groups win, as before.
  Map<String, String> readGroupIdBySample = new HashMap<String, String>();
  for (SAMReadGroupRecord rgr : sfh.getReadGroups()) {
    if (rgr.getSample() != null) {
      readGroupIdBySample.put(rgr.getSample(), rgr.getReadGroupId());
    }
  }

  SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
  sfwf.setCreateIndex(true);
  SAMFileWriter sfw = sfwf.makeBAMWriter(sfh, false, bout);

  // NOTE(review): tw is never explicitly closed or flushed here - confirm TableWriter does
  // not need it.
  TableWriter tw = new TableWriter(sout);

  try {
    Set<IGVEntry> igvEntries = new TreeSet<IGVEntry>();
    int numContigs = 0;
    int uncodedKmers = 0;

    for (SAMRecord contig : CONTIGS) {
      String contigName = contig.getReadName();
      if (CONTIG_NAMES != null
          && !CONTIG_NAMES.isEmpty()
          && !CONTIG_NAMES.contains(contigName)) {
        continue;
      }

      Map<String, String> te = annotatedContigs.get(contigName);
      if (te == null) {
        continue;
      }

      String seq = contig.getReadString();
      String sampleName = contig.getReadGroup().getSample();
      String sampleReadGroupId = readGroupIdBySample.get(sampleName);

      Map<CortexKmer, Character> kmerCodes =
          indexKmerCodes(te.get("seq"), te.get("kmerOrigin"), kmerSize);

      Map<Character, Integer> kmerStats = new HashMap<Character, Integer>();
      for (Character c : kmerCodeNames.keySet()) {
        kmerStats.put(c, 0);
      }

      // We want to be able to examine soft-clipped regions as well.
      treatSoftClipsAsMatches(contig);

      for (AlignmentBlock ab : contig.getAlignmentBlocks()) {
        int blockReadStart = ab.getReadStart() - 1; // 0-based, inclusive
        int blockReadEnd = blockReadStart + ab.getLength(); // 0-based, exclusive
        // 1-based reference position that read index 0 would have within this block.
        int refOffset = ab.getReferenceStart() - ab.getReadStart() + 1;

        for (int i = blockReadStart; i < blockReadEnd; i++) {
          if (i + kmerSize > seq.length()) {
            // No full kmer starts at or after this index.
            break;
          }

          String kmerSeq = seq.substring(i, i + kmerSize);
          CortexKmer kmer = new CortexKmer(kmerSeq);
          Character c = kmerCodes.get(kmer);
          String codeName = kmerCodeNames.get(c);

          SAMRecord skmer = new SAMRecord(CONTIGS.getFileHeader());
          skmer.setReadBases(kmerSeq.getBytes());

          List<CigarElement> cigarElements = new ArrayList<CigarElement>();
          cigarElements.add(new CigarElement(kmerSize, CigarOperator.M));

          skmer.setReadName(contigName + "." + kmer.getKmerAsString());
          skmer.setReferenceName(contig.getReferenceName());
          skmer.setCigar(new Cigar(cigarElements));
          skmer.setReadPairedFlag(false);
          skmer.setDuplicateReadFlag(false);
          skmer.setMateNegativeStrandFlag(false);
          skmer.setAlignmentStart(refOffset + i);

          // Prefer the read group of the kmer's code; fall back to the contig's own.
          String parentReadGroupId =
              codeName == null ? null : readGroupIdBySample.get(codeName);
          skmer.setAttribute(
              "RG", parentReadGroupId != null ? parentReadGroupId : sampleReadGroupId);
          skmer.setMappingQuality(99);

          sfw.addAlignment(skmer);

          Integer codeCount = kmerStats.get(c);
          if (codeCount == null) {
            // Kmer has no known code: nothing to tally or paint.
            uncodedKmers++;
            continue;
          }
          kmerStats.put(c, codeCount + 1);

          IGVEntry igvEntry = new IGVEntry();
          igvEntry.chromosome = contig.getReferenceName();
          igvEntry.start = refOffset + i - 1; // 0-based start
          igvEntry.parentageName = codeName;
          igvEntry.parentage = kmerCodeIndices.get(c);
          igvEntries.add(igvEntry);
        }
      }

      if (!contig.isSecondaryOrSupplementary()) {
        beout.println(
            contig.getReferenceName()
                + "\t"
                + contig.getAlignmentStart()
                + "\t"
                + contig.getAlignmentEnd()
                + "\t"
                + contigName
                + "."
                + sampleName);

        if (annotatedContigs.size() > 10 && numContigs % (annotatedContigs.size() / 10) == 0) {
          log.info(" processed {}/{} contigs", numContigs, annotatedContigs.size());
        }
        numContigs++;
      }

      Map<String, String> stats = new LinkedHashMap<String, String>();
      stats.put("contigName", contigName);
      stats.put("sampleName", sampleName);
      for (Character c : kmerCodeNames.keySet()) {
        stats.put(kmerCodeNames.get(c), String.valueOf(kmerStats.get(c)));
      }
      tw.addEntry(stats);
    }

    if (uncodedKmers > 0) {
      log.info(" kmers without a known code (excluded from stats): {}", uncodedKmers);
    }

    log.info("Writing kmer inheritance information...");
    out.printf("%s\t%s\t%s\t%s\t%s\n", "Chromosome", "Start", "End", "Feature", "Parentage");
    for (IGVEntry igvEntry : igvEntries) {
      out.printf(
          "%s\t%d\t%d\t%s\t%d\n",
          igvEntry.chromosome,
          igvEntry.start,
          igvEntry.start + 1,
          igvEntry.parentageName,
          igvEntry.parentage);
    }
  } finally {
    // Always finalise the BAM, even when processing fails part-way through.
    sfw.close();
  }
}

/** Splits a {@code name:start-end} locus into its three fields; "NA" and "*:0-0" become "NA:0-0". */
private String[] canonicalLocusFields(String locus) {
  String normalized = (locus.equals("NA") || locus.equals("*:0-0")) ? "NA:0-0" : locus;
  return normalized.split("[:-]");
}

/** Maps each kmer of {@code annSeq} to the code character at the same offset of {@code kmerOrigin}. */
private Map<CortexKmer, Character> indexKmerCodes(String annSeq, String kmerOrigin, int kmerSize) {
  Map<CortexKmer, Character> kmerCodes = new HashMap<CortexKmer, Character>();
  for (int i = 0; i < kmerOrigin.length(); i++) {
    kmerCodes.put(new CortexKmer(annSeq.substring(i, i + kmerSize)), kmerOrigin.charAt(i));
  }
  return kmerCodes;
}

/** Rewrites soft clips in {@code contig}'s CIGAR as matches, shifting the start over a leading clip. */
private void treatSoftClipsAsMatches(SAMRecord contig) {
  List<CigarElement> original = contig.getCigar().getCigarElements();
  List<CigarElement> rewritten = new ArrayList<CigarElement>(original.size());
  boolean changed = false;

  for (CigarElement ce : original) {
    if (ce.getOperator().equals(CigarOperator.S)) {
      rewritten.add(new CigarElement(ce.getLength(), CigarOperator.M));
      changed = true;
    } else {
      rewritten.add(ce);
    }
  }

  if (!changed) {
    return;
  }

  // A leading soft clip may be preceded by hard clips, so look at the first element that is
  // not a hard clip rather than only at index 0.
  int leadingSoftClip = 0;
  for (CigarElement ce : original) {
    if (ce.getOperator().equals(CigarOperator.H)) {
      continue;
    }
    if (ce.getOperator().equals(CigarOperator.S)) {
      leadingSoftClip = ce.getLength();
    }
    break;
  }

  if (leadingSoftClip > 0) {
    contig.setAlignmentStart(contig.getAlignmentStart() - leadingSoftClip);
  }
  contig.setCigar(new Cigar(rewritten));
}