public static void main(String[] argv) { SAMFileReader.setDefaultValidationStringency(SAMFileReader.ValidationStringency.SILENT); // STFU SAMResource sr = null; SAMRegion region = new SAMRegion(); region.range = new Range(); region.range.start = -1; region.range.end = -1; String outfile = null; String target_file = null; SAMCoverage sc = new SAMCoverage(); for (int i = 0; i < argv.length; i++) { if (argv[i].equals("-bam")) { sr = new SAMResource(); // sr.import_data(SAMResourceTags.SAM_URL, argv[++i]); sr.set_file(argv[++i]); sr.detect_sample_id(); } else if (argv[i].equals("-targets")) { target_file = argv[++i]; } else if (argv[i].equals("-tname")) { region.tname = argv[++i]; } else if (argv[i].equals("-verbose")) { sc.set_verbose(true); } else if (argv[i].equals("-tstart")) { region.range.start = Integer.parseInt(argv[++i]); } else if (argv[i].equals("-tend")) { region.range.end = Integer.parseInt(argv[++i]); } else if (argv[i].equals("-of")) { outfile = argv[++i]; } else if (argv[i].equals("-min-quality")) { sc.set_min_quality(Integer.parseInt(argv[++i])); } else { System.err.println("error: unknown switch " + argv[i]); // debug System.exit(1); } } String error = null; if (sr == null) { error = "specify -bam [file]"; } else if (target_file == null) { if (region.tname == null) { error = "specify -tname"; } else if (region.range.start == -1) { error = "specify -tstart"; } else if (region.range.end == -1) { error = "specify -tend"; } } sr.set_region(region); if (error != null) { System.err.println("ERROR: " + error); // debug } else if (target_file != null) { try { File f = new File(target_file); BufferedReader br = new BufferedReader(new FileReader(f)); String line = br.readLine(); String[] headers = line.split("\t"); if (headers[0].equals("Name") && headers[1].equals("Chromosome") && headers[2].equals("Start") && headers[3].equals("End")) { WorkingFile wf = null; if (outfile != null) { wf = new WorkingFile(outfile); sc.setPrintStream(wf.getPrintStream()); } while (true) { line = br.readLine(); if (line == null) { // EOF break; } else { String[] row = line.split("\t"); region.tname = row[1]; region.range.start = Integer.parseInt(row[2]); region.range.end = Integer.parseInt(row[3]); // sc.set_name(new String(row[0])); sc.set_name(new String(row[0]) + "," + new String(row[1])); sc.find_coverage(sr); } } if (outfile != null) wf.finish(); } else { throw new IOException("file format error"); } } catch (Exception e) { System.err.println("ERROR: " + e); // debug e.printStackTrace(); System.exit(1); } } else { sc.set_outfile(outfile); sc.find_coverage(sr); } }
public void find_coverage(SAMResource sres) { int start_base = sres.region.range.start; int end_base = sres.region.range.end; int coverage_len = (end_base - start_base) + 1; int i, end, ref_i, read_i, len; int[] coverage = new int[coverage_len]; Arrays.fill(coverage, 0); WorkingFile wf = null; if (outfile != null) { try { wf = new WorkingFile(outfile); ps = wf.getPrintStream(); } catch (Exception e) { System.err.println("I/O error: " + e); // debug e.printStackTrace(); System.exit(1); } } try { // // gather coverage info: // CloseableIterator<SAMRecord> iterator = sres.get_iterator(); int read_count = 0; int ref_min = -1; int ref_max = -1; while (iterator.hasNext()) { SAMRecord sr = iterator.next(); read_count++; // System.err.println(sr.getReadName() + ": " + sr.getAlignmentStart() + "-" + // sr.getAlignmentEnd()); // debug if (sr.getReadUnmappedFlag()) continue; if (sr.getDuplicateReadFlag()) { if (verbose_mode) System.err.println( sr.getReadName() + "." + (sr.getReadNegativeStrandFlag() ? "R" : "F") + " ignoring, duplicate"); continue; } byte[] read = sr.getReadBases(); byte[] quals = sr.getBaseQualities(); for (AlignmentBlock ab : sr.getAlignmentBlocks()) { len = ab.getLength(); read_i = ab.getReadStart() - 1; ref_i = ab.getReferenceStart() - start_base; if (ref_min == -1 || ref_i < ref_min) ref_min = ref_i; for (i = read_i, end = read_i + len; i < end; i++, ref_i++) { if (ref_i >= 0 && ref_i < coverage_len) { if (quals[i] >= MIN_QUALITY) { if (verbose_mode) System.err.println( sr.getReadName() + "." + (sr.getReadNegativeStrandFlag() ? "R" : "F") + " hit at " + (ref_i + start_base) + " as=" + sr.getAlignmentStart() + " ae=" + sr.getAlignmentEnd()); coverage[ref_i]++; } else if (verbose_mode) { System.err.println( sr.getReadName() + "." + (sr.getReadNegativeStrandFlag() ? "R" : "F") + " qual_reject at " + (ref_i + start_base) + " as=" + sr.getAlignmentStart() + " ae=" + sr.getAlignmentEnd()); } } } if (ref_max == -1 || ref_i > ref_max) ref_max = ref_i; } } sres.close(); System.err.println( "records:" + read_count + " ref_min:" + (ref_min + start_base) + " ref_max:" + (ref_max + start_base)); // debug // // report coverage info: // for (i = 0; i < coverage.length; i++) { if (name != null) ps.print(name + ","); ps.println((i + start_base) + "," + coverage[i]); // debug } if (wf != null) wf.finish(); } catch (Exception e) { System.err.println("ERROR: " + e); // debug e.printStackTrace(); } }
private static void updateSAM( SAMRecord rec, ReferenceSequence sequence, SAMProgramRecord programRecord, AlignHeapNode bestAlignHeapNode, SRMAUtil.Space space, String read, String qualities, String softClipStartBases, String softClipStartQualities, String softClipEndBases, String softClipEndQualities, boolean strand, boolean correctBases) throws Exception { AlignHeapNode curAlignHeapNode = null; AlignHeapNode prevAlignHeapNode = null; int alignmentStart = 0; int readIndex = -1; byte readBases[] = null; byte baseQualities[] = null; byte colorErrors[] = null; int i; int numEdits = 0; List<String> optFieldTags = new LinkedList<String>(); List<Object> optFieldValues = new LinkedList<Object>(); Object attr; // Debugging stuff String readName = rec.getReadName(); if (null == bestAlignHeapNode) { // Do not modify the alignment return; } // To generate a new CIGAR List<CigarElement> cigarElements = null; CigarOperator prevCigarOperator = null, curCigarOperator = null; int prevCigarOperatorLength = 0; // TODO // setInferredInsertSize (invalidates paired end reads) // setMappingQuality (?) // setFlag // update base qualities for color space reads // clear attributes, but save some Align.clearAttributes(rec, optFieldTags, optFieldValues); readBases = new byte[read.length()]; baseQualities = new byte[qualities.length()]; for (i = 0; i < qualities.length(); i++) { // Must subtract 33 for PHRED scaling baseQualities[i] = (byte) (qualities.charAt(i) - 33); } if (strand) { readIndex = 0; } else { readIndex = read.length() - 1; } cigarElements = new LinkedList<CigarElement>(); if (strand) { // reverse strand is the current position alignmentStart = bestAlignHeapNode.node.position; } else { alignmentStart = bestAlignHeapNode.startPosition; } assert null != bestAlignHeapNode; curAlignHeapNode = bestAlignHeapNode; while (null != curAlignHeapNode) { // Get the current cigar operator if (null != prevAlignHeapNode && CigarOperator.DELETION != prevCigarOperator && 1 < Math.abs(curAlignHeapNode.node.position - prevAlignHeapNode.node.position)) { curCigarOperator = CigarOperator.DELETION; } else { switch (curAlignHeapNode.node.type) { case Node.MISMATCH: // Fall through case Node.MATCH: curCigarOperator = CigarOperator.MATCH_OR_MISMATCH; break; case Node.INSERTION: // System.out.println("INS"); curCigarOperator = CigarOperator.INSERTION; break; default: throw new Exception("Unknown node type"); } if (space == SRMAUtil.Space.COLORSPACE || correctBases) { readBases[readIndex] = (byte) curAlignHeapNode.node.base; if (strand) { readIndex++; } else { readIndex--; } // count the number of mismatches switch (curAlignHeapNode.node.type) { case Node.MISMATCH: case Node.INSERTION: numEdits++; break; default: break; } } else { // count the number of mismatches switch (curAlignHeapNode.node.type) { case Node.MATCH: if (read.charAt(curAlignHeapNode.readOffset) != curAlignHeapNode.node.base) { numEdits++; } break; case Node.MISMATCH: // Fall through if (read.charAt(curAlignHeapNode.readOffset) != sequence.getBases()[curAlignHeapNode.node.position - 1]) { numEdits++; } break; case Node.INSERTION: numEdits++; break; default: break; } } } if (prevCigarOperator != curCigarOperator) { // different cigar operator // add the previous cigar operator if (null != prevCigarOperator) { if (strand) { // reverse // append cigarElements.add(new CigarElement(prevCigarOperatorLength, prevCigarOperator)); } else { // prepend cigarElements.add(0, new CigarElement(prevCigarOperatorLength, prevCigarOperator)); } } // update prevCigarOperator prevCigarOperator = curCigarOperator; if (curCigarOperator == CigarOperator.DELETION) { // length of deletion prevCigarOperatorLength = Math.abs(curAlignHeapNode.node.position - prevAlignHeapNode.node.position) - 1; numEdits += prevCigarOperatorLength; // deletions } else { prevCigarOperatorLength = 1; } } else { // same cigar operator prevCigarOperatorLength++; } // Update if (CigarOperator.DELETION != curCigarOperator) { prevAlignHeapNode = curAlignHeapNode; curAlignHeapNode = curAlignHeapNode.prev; } } if (0 < prevCigarOperatorLength) { if (null == prevCigarOperator || CigarOperator.DELETION == prevCigarOperator) { throw new Exception("Ended with a null cigar operator or a deletion cigar operator"); } if (strand) { // reverse // append cigarElements.add(new CigarElement(prevCigarOperatorLength, prevCigarOperator)); } else { // prepend cigarElements.add(0, new CigarElement(prevCigarOperatorLength, prevCigarOperator)); } } if (space == SRMAUtil.Space.COLORSPACE) { // color space, read bases already inferred // Get color error string colorErrors = new byte[read.length()]; char prevBase = SRMAUtil.COLORSPACE_ADAPTOR; if (strand) { // reverse for (i = 0; i < read.length(); i++) { char nextBase = SRMAUtil.colorSpaceNextBase(prevBase, read.charAt(i)); if (nextBase == SRMAUtil.getCompliment((char) readBases[read.length() - i - 1])) { colorErrors[i] = (byte) Alignment.GAP; } else { colorErrors[i] = (byte) read.charAt(i); } if (0 < i) { // qualities are assumed to be always in the same direction as the color errors baseQualities[read.length() - i] = getColorQuality( colorErrors[i - 1], colorErrors[i], (byte) (qualities.charAt(i - 1) - 33), (byte) (qualities.charAt(i) - 33)); } prevBase = SRMAUtil.getCompliment((char) readBases[read.length() - i - 1]); } // last color baseQualities[0] = (byte) (qualities.charAt(read.length() - 1) - 33); } else { for (i = 0; i < read.length(); i++) { char nextBase = SRMAUtil.colorSpaceNextBase(prevBase, read.charAt(i)); if (nextBase == readBases[i]) { colorErrors[i] = (byte) Alignment.GAP; } else { colorErrors[i] = (byte) read.charAt(i); } if (0 < i) { baseQualities[i - 1] = getColorQuality( colorErrors[i - 1], colorErrors[i], (byte) (qualities.charAt(i - 1) - 33), (byte) (qualities.charAt(i) - 33)); } prevBase = (char) readBases[i]; } // last color baseQualities[read.length() - 1] = (byte) (qualities.charAt(read.length() - 1) - 33); } } else if (correctBases) { // bases were corrected if (strand) { for (i = 0; i < read.length(); i++) { if (readBases[i] == (byte) read.charAt(read.length() - i - 1)) { baseQualities[i] = (byte) (qualities.charAt(read.length() - i - 1) - 33); } else { // TODO: how much to down-weight ? baseQualities[i] = (byte) (SRMAUtil.QUAL2CHAR( SRMAUtil.CHAR2QUAL(qualities.charAt(read.length() - i - 1)) - CORRECT_BASE_QUALITY_PENALTY) - 33); if (baseQualities[i] <= 0) { baseQualities[i] = 1; } } } } else { for (i = 0; i < read.length(); i++) { if (readBases[i] == (byte) read.charAt(i)) { baseQualities[i] = (byte) (qualities.charAt(i) - 33); } else { // TODO: how much to down-weight ? baseQualities[i] = (byte) (SRMAUtil.QUAL2CHAR( SRMAUtil.CHAR2QUAL(qualities.charAt(i)) - CORRECT_BASE_QUALITY_PENALTY) - 33); if (baseQualities[i] <= 0) { baseQualities[i] = 1; } } } } rec.setAttribute("XO", read); rec.setAttribute("XQ", qualities); } else { // bases not corrected readBases = new byte[read.length()]; baseQualities = new byte[qualities.length()]; // qualities.length() == read.length() if (strand) { // reverse for (i = 0; i < read.length(); i++) { readBases[i] = (byte) read.charAt(read.length() - i - 1); baseQualities[i] = (byte) (qualities.charAt(read.length() - i - 1) - 33); } } else { for (i = 0; i < read.length(); i++) { readBases[i] = (byte) read.charAt(i); baseQualities[i] = (byte) (qualities.charAt(i) - 33); } } } // Add in soft-clipping if (null != softClipStartBases) { // prepend cigarElements.add(0, new CigarElement(softClipStartBases.length(), CigarOperator.S)); byte tmpBases[] = new byte[readBases.length + softClipStartBases.length()]; System.arraycopy(readBases, 0, tmpBases, softClipStartBases.length(), readBases.length); readBases = tmpBases; for (i = 0; i < softClipStartBases.length(); i++) { readBases[i] = (byte) softClipStartBases.charAt(i); } byte tmpQualities[] = new byte[baseQualities.length + softClipStartQualities.length()]; System.arraycopy( baseQualities, 0, tmpQualities, softClipStartQualities.length(), baseQualities.length); baseQualities = tmpQualities; for (i = 0; i < softClipStartQualities.length(); i++) { baseQualities[i] = (byte) softClipStartQualities.charAt(i); } } if (null != softClipEndBases) { // append cigarElements.add(new CigarElement(softClipEndBases.length(), CigarOperator.S)); byte tmpBases[] = new byte[readBases.length + softClipEndBases.length()]; System.arraycopy(readBases, 0, tmpBases, 0, readBases.length); for (i = 0; i < softClipEndBases.length(); i++) { tmpBases[i + readBases.length] = (byte) softClipEndBases.charAt(i); } readBases = tmpBases; byte tmpQualities[] = new byte[baseQualities.length + softClipEndQualities.length()]; System.arraycopy(baseQualities, 0, tmpQualities, 0, baseQualities.length); for (i = 0; i < softClipEndQualities.length(); i++) { tmpQualities[i + baseQualities.length] = (byte) softClipEndQualities.charAt(i); } baseQualities = tmpQualities; } // Update SAM record rec.setCigar(new Cigar(cigarElements)); rec.setAlignmentStart(alignmentStart); rec.setReadBases(readBases); rec.setBaseQualities(baseQualities); // Reset saved attributes Align.resetAttributes(rec, optFieldTags, optFieldValues); // Set new attributes if (space == SRMAUtil.Space.COLORSPACE) { // set the XE attribute for colorError string rec.setAttribute("XE", new String(colorErrors)); } rec.setAttribute("AS", bestAlignHeapNode.score); rec.setAttribute("XC", bestAlignHeapNode.alleleCoverageSum); rec.setAttribute("PG", programRecord.getId()); rec.setAttribute("NM", numEdits); }