/**
 * Removes discarded reads from one buffer, or discarded pairs from two parallel buffers.
 *
 * <p>Each slot whose read reports {@code discarded()} is set to null. When {@code buffer2} is
 * given, a slot is cleared in BOTH buffers if either read at that index is discarded. If anything
 * was cleared, the buffers are passed to {@code Tools.condenseStrict} (presumably to squeeze out
 * the nulls - confirm against Tools).
 *
 * @param buffer1 primary read buffer; must not be null
 * @param buffer2 optional mate buffer indexed in parallel with {@code buffer1}; may be null
 * @return the number of slots (reads, or pairs when {@code buffer2} is present) that were cleared
 */
private final int removeDiscarded(ArrayList<Read> buffer1, ArrayList<Read> buffer2) {
  final boolean paired = (buffer2 != null);
  int dropped = 0;

  for (int idx = 0; idx < buffer1.size(); idx++) {
    final Read first = buffer1.get(idx);
    if (paired) {
      final Read second = buffer2.get(idx);
      if (!first.discarded() && !second.discarded()) {
        continue;
      }
      buffer1.set(idx, null);
      buffer2.set(idx, null);
    } else {
      if (!first.discarded()) {
        continue;
      }
      buffer1.set(idx, null);
    }
    dropped++;
  }

  if (dropped == 0) {
    return 0;
  }
  Tools.condenseStrict(buffer1);
  if (paired) {
    Tools.condenseStrict(buffer2);
  }
  return dropped;
}
@Override boolean processReadPair(Read r1, Read r2) { assert (r2 == null); final byte[] quals = r1.quality, bases = r1.bases; final byte[] match = (r1.match == null ? null : !r1.shortmatch() ? r1.match : Read.toLongMatchString(r1.match)); if (match == null || quals == null || bases == null) { return false; } int subs = 0; int indels = 0; for (int qpos = 0, mpos = 0, last = quals.length - 1; mpos < match.length; mpos++) { final byte m = match[mpos]; final byte mprev = match[Tools.max(mpos - 1, 0)]; final byte mnext = match[Tools.min(mpos + 1, match.length - 1)]; final byte q1 = quals[qpos]; final byte b2 = bases[qpos]; int sub = 0, indel = 0; if (m == 'S') { sub = 1; } else if (m == 'I') { indel = 1; } else if (m == 'm') { if (mprev == 'D' || mnext == 'D') { indel = 1; } } else if (m == 'D') { // do nothing } else if (m == 'C') { // do nothing } else { throw new RuntimeException( "Bad symbol m='" + ((char) m) + "'\n" + new String(match) + "\n" + new String(bases) + "\n"); } subs += sub; indels += indel; if (q1 >= minq && q1 <= maxq) { if (sub > 0 || (indel > 0 && countIndels)) { return true; } } if (m != 'D') { qpos++; } } return keepPerfect && subs == 0 && indels == 0; }
public Read consensus() { // TODO: Return single read if only 1. final int[][] counts = baseCounts(); final int width = counts[0].length; byte[] bases = new byte[width], quals = new byte[width]; for (int i = 0; i < width; i++) { int x = getConsensus(counts, i); if (x < 0) { // System.err.println("q="+0+", x="+x+"; A="+counts[0][i]+", C="+counts[1][i]+", // G="+counts[2][i]+", T="+counts[3][i]); bases[i] = 'N'; quals[i] = 0; } else { long q = 2 * counts[x][i] - counts[0][i] - counts[1][i] - counts[2][i] - counts[3][i]; // System.err.println("q="+q+", x="+x+"; A="+counts[0][i]+", C="+counts[1][i]+", // G="+counts[2][i]+", T="+counts[3][i]); bases[i] = AminoAcid.numberToBase[x]; quals[i] = (byte) Tools.mid(0, q, 50); } } Read leftmost = this.get(0); Read r = new Read(bases, quals, 0, leftmost.id); // TODO: Attach the long pair, and make sure the kmer location is correct. // assert(false) : "\n"+r.toFastq()+"\nCheck kmer location."; // assert(size()==1) : // "\n"+r.toFastq()+"\n"+get(0).toFastq()+"\n"+get(size()-1).toFastq()+"\n"; return r; }
@Override public boolean parseArgument(String arg, String a, String b) { // System.err.println("Calling parseArgument("+arg+","+a+","+b+")"); if (a.equals("minq")) { minq = (int) Tools.parseKMG(b); return true; } else if (a.equals("maxq")) { maxq = (int) Tools.parseKMG(b); return true; } else if (a.equals("keepperfect")) { keepPerfect = Tools.parseBoolean(b); return true; } else if (a.equals("countindels")) { countIndels = Tools.parseBoolean(b); return true; } // There was no match to the argument return false; }
private final void pair(ArrayList<Read> buffer1, ArrayList<Read> buffer2) { final int len1 = buffer1.size(), len2 = buffer2.size(); assert (ALLOW_UNEQUAL_LENGTHS || len1 == len2) : "\nThere appear to be different numbers of reads in the paired input files." + "\nThe pairing may have been corrupted by an upstream process. It may be fixable by running repair.sh."; final int lim = Tools.min(len1, len2); for (int i = 0; i < lim; i++) { Read a = buffer1.get(i); Read b = buffer2.get(i); assert (a.numericID == b.numericID) : "\n" + a.numericID + ", " + b.numericID + "\n" + a.toText(false) + "\n" + b.toText(false) + "\n"; assert (a.mate == null) : "Please set interleaved=false when using dual input files.\n" + a.id + "\n" + a.mate.id + "\n" + b.id + "\n" + producer1 + "\n" + producer2; assert (b.mate == null) : "Please set interleaved=false when using dual input files."; a.mate = b; b.mate = a; assert (a.pairnum() == 0); b.setPairnum(1); // assert(a.pairnum()!=b.pairnum()); } if (len1 > len2) { // do nothing; } else if (len2 > len1) { for (int i = lim; i < len2; i++) { Read b = buffer2.get(i); b.setPairnum(0); buffer1.add(b); } } }
/** This will create a count consensus of the bases at each position in the cluster. */ public int[][] baseCounts() { int maxLeft = -1, maxRight = -1; for (Read r : this) { long[] obj = (long[]) r.obj; int pos = (int) obj[1]; maxLeft = Tools.max(maxLeft, pos); maxRight = Tools.max(maxRight, r.length() - pos); } final int width = maxLeft + maxRight; // assert(size()==1) : "\nleft="+maxLeft+", right="+maxRight+", width="+width+", // "+k+"\n"+get(0).toFastq()+"\n"+get(size()-1).toFastq(); // System.err.println("\n\n"); final int[][] counts = new int[4][width]; for (Read r : this) { long[] obj = (long[]) r.obj; int pos = (int) obj[1]; byte[] bases = r.bases, quals = r.quality; // System.err.println("pos="+pos+", maxLeft="+maxLeft); for (int cloc = 0, rloc = maxLeft - pos; cloc < bases.length; cloc++, rloc++) { // System.err.println("cloc="+cloc+"/"+bases.length+", rloc="+rloc+"/"+width); int x = AminoAcid.baseToNumber[bases[cloc]]; if (x > -1) { int q = (quals == null ? 20 : quals[cloc]); counts[x][rloc] += q; } } } // if(size()>0){//Looks correct. // System.err.println(Arrays.toString(counts[0])); // System.err.println(Arrays.toString(counts[1])); // System.err.println(Arrays.toString(counts[2])); // System.err.println(Arrays.toString(counts[3])); // } return counts; }
/**
 * Selects one representative kmer per divisor from a read and stores them in {@code kmers}.
 *
 * <p>The read is scanned with a rolling forward kmer and a rolling reverse-complement kmer; the
 * larger of the two is the candidate at each position once {@code k} consecutive non-'N' bases
 * have been seen. For each of the {@code comparisons} divisors, the candidate with the largest
 * remainder modulo that divisor wins and is written to {@code kmers[j]}. Entries of
 * {@code kmers} are left untouched when no candidate exists (including when the read is shorter
 * than {@code k} or has no bases).
 *
 * <p>The scratch array of best remainders is obtained from {@code local2} and created on first
 * use (looks like a per-thread cache - confirm against the field's declaration).
 *
 * @param r the read to scan
 * @param kmers output array with at least {@code comparisons} slots
 */
public void fill(Read r, long[] kmers) {
  final byte[] bases = r.bases;
  if (bases == null || bases.length < k) {
    return;
  }

  long[] best = local2.get();
  if (best == null) {
    best = new long[comparisons];
    local2.set(best);
  }
  Arrays.fill(best, -1);

  long fwd = 0;
  long rev = 0;
  int run = 0;
  for (final byte b : bases) {
    final long x = Dedupe.baseToNumber[b];
    final long x2 = Dedupe.baseToComplementNumber[b];
    fwd = ((fwd << 2) | x) & mask;
    rev = (rev >>> 2) | (x2 << shift2);

    // An 'N' restarts the count of valid consecutive bases.
    run = (b == 'N' ? 0 : run + 1);
    if (run < k) {
      continue;
    }

    final long kmax = Tools.max(fwd, rev);
    for (int j = 0; j < comparisons; j++) {
      final long mod = kmax % divisors[j];
      if (mod > best[j]) {
        best[j] = mod;
        kmers[j] = kmax;
      }
    }
  }
}
/**
 * Splits a SAM file into up to three outputs by mapping status and strand.
 *
 * <p>Arguments: {@code args[0]} input file; optional {@code args[1]}, {@code args[2]},
 * {@code args[3]} outputs for strand-0 mapped lines, other mapped lines, and unmapped lines
 * respectively; optional {@code args[4]} equal to "header" (case-insensitive) to copy header
 * lines (those starting with '@') to every open output. Empty lines are skipped. Counts and
 * elapsed time are printed to stderr.
 *
 * @param args command-line arguments as described above
 */
public static void main(String[] args) {
  Timer t = new Timer();

  final String in = args[0];
  final String outF = (args.length > 1) ? args[1] : null;
  final String outR = (args.length > 2) ? args[2] : null;
  final String outU = (args.length > 3) ? args[3] : null;
  if (args.length > 4 && args[4].equalsIgnoreCase("header")) {
    includeHeader = true;
  }

  ByteFile tf = ByteFile.makeByteFile(in, true, false);

  Tools.testForDuplicateFiles(true, in, outF, outR, outU);
  Tools.testOutputFiles(true, false, false, outF, outR, outU);

  final ByteStreamWriter fStream =
      (outF == null) ? null : new ByteStreamWriter(outF, true, false, true);
  final ByteStreamWriter rStream =
      (outR == null) ? null : new ByteStreamWriter(outR, true, false, true);
  final ByteStreamWriter uStream =
      (outU == null) ? null : new ByteStreamWriter(outU, true, false, true);
  final ByteStreamWriter[] sinks = {fStream, rStream, uStream};

  for (ByteStreamWriter sink : sinks) {
    if (sink != null) {
      sink.start();
    }
  }

  long plus = 0;
  long minus = 0;
  long other = 0;

  byte[] line;
  while ((line = tf.nextLine()) != null) {
    if (line.length == 0) {
      continue;
    }

    if (line[0] == '@') {
      // Header line: replicate to every open output, if requested.
      if (includeHeader) {
        for (ByteStreamWriter sink : sinks) {
          if (sink != null) {
            sink.println(line);
          }
        }
      }
      continue;
    }

    final int flag = SamLine.parseFlagOnly(line);
    final ByteStreamWriter target;
    if (!SamLine.mapped(flag)) {
      target = uStream;
      other++;
    } else if (SamLine.strand(flag) == 0) {
      target = fStream;
      plus++;
    } else {
      target = rStream;
      minus++;
    }
    if (target != null) {
      target.println(line);
    }
  }

  tf.close();
  for (ByteStreamWriter sink : sinks) {
    if (sink != null) {
      sink.poisonAndWait();
    }
  }

  System.err.println("Total reads: \t" + (plus + minus + other));
  System.err.println("Plus reads: \t" + (plus));
  System.err.println("Minus reads: \t" + (minus));
  System.err.println("Unmapped reads:\t" + (other));

  t.stop();
  System.err.println("Time: \t" + t);
}
public static void main(String[] args) { System.err.println( "Executing " + (new Object() {}.getClass().getEnclosingClass().getName()) + " " + Arrays.toString(args) + "\n"); Timer t = new Timer(); String inPattern = args[0]; int minChrom = -1; int maxChrom = -1; int outgenome = -1; Data.GENOME_BUILD = -1; String name = null; for (int i = 1; i < args.length; i++) { final String arg = args[i].toLowerCase(); String[] split = arg.split("="); String a = split[0]; String b = (split.length > 1 ? split[1] : null); if (a.equals("ingenome")) { Data.setGenome(Integer.parseInt(b)); if (minChrom == -1) { minChrom = 1; } if (maxChrom == -1) { maxChrom = Data.numChroms; } } else if (a.equals("outgenome")) { outgenome = Integer.parseInt(b); } else if (a.equals("minchrom")) { minChrom = Integer.parseInt(b); } else if (a.equals("maxchrom")) { maxChrom = Integer.parseInt(b); } else if (a.equals("threads") || a.equals("t")) { THREADS = Integer.parseInt(b); } else if (a.equals("nblocksize")) { N_BLOCK_SIZE = Integer.parseInt(b); } else if (a.equals("nblocktrigger")) { N_BLOCK_TRIGGER = Integer.parseInt(b); } else if (a.equals("staynearref")) { STAY_NEAR_REF = Tools.parseBoolean(b); } else if (a.equals("append") || a.equals("app")) { append = ReadStats.append = Tools.parseBoolean(b); } else if (a.equals("overwrite") || a.equals("ow")) { overwrite = Tools.parseBoolean(b); } else if (a.startsWith("regen")) { REGEN_N_BLOCKS = Tools.parseBoolean(b); } else if (a.startsWith("name=")) { REGEN_N_BLOCKS = Tools.parseBoolean(b); } else { System.err.println("Unknown argument " + arg); } } assert (Data.GENOME_BUILD > -1); assert (outgenome > -1); // assert(Data.GENOME_BUILD!=outgenome); if (Data.GENOME_BUILD == outgenome) { System.out.println("Warning! 
Overwriting input genome " + outgenome); } String fname = Data.chromFname(minChrom, outgenome); File f = new File(fname.substring(0, fname.lastIndexOf('/'))); // assert(false) : f.getAbsolutePath(); if (!f.exists()) { f.mkdirs(); } for (int chrom = minChrom; chrom <= maxChrom; chrom++) { String outName = Data.chromFname(chrom, outgenome); assert (overwrite || !new File(outName).exists()) : "Destination " + outName + " already exists."; // assert(false) : inPattern+", "+outName; process(inPattern.replaceFirst("#", "" + chrom), outName, chrom); } FastaToChromArrays2.writeInfo( outgenome, maxChrom, (name == null ? Data.name : name), "" + Data.GENOME_BUILD + "_plus_variations", false, false); t.stop(); { String path = IndexMaker4.fname(1, 1, 12, 1); int lastSlash = path.lastIndexOf('/'); path = path.substring(0, lastSlash); File dir = new File(path); if (dir.exists()) { System.out.println("Deleting old index for " + outgenome); for (File f2 : dir.listFiles()) { if (f2.isFile() && (f2.getName().contains(".int2d") || f2.getName().endsWith(".txt"))) { f2.delete(); } } } } // System.out.println("Vars in: \t"+VARS_IN); // System.out.println("Vars out:\t"+VARS_OUT); System.out.println(); System.out.println("Time: \t" + t); }
/** * @param replaceFirst * @param chromFname * @param chrom */ public static void process(String inVarsName, String outChromName, int chrom) { ArrayList<Varlet> vars = Varlet.fromTextFile(inVarsName); ChromosomeArray cha = Data.getChromosome(chrom); ChromosomeArray chb = new ChromosomeArray(chrom, Gene.PLUS); // Next location to read in a int aloc = 0; // Next location to set in b int bloc = 0; for (int i = 0; i < vars.size(); i++) { Varlet v = vars.get(i); assert (v.beginLoc >= aloc) : i + "\n" + vars.get(i - 1) + "\n" + v + "\n"; // Overlapping variations while (v.beginLoc < aloc) { // skip it, for now. System.err.print("e"); i++; if (i >= vars.size()) { break; } v = vars.get(i); } if (STAY_NEAR_REF && Tools.absdif(aloc, bloc) >= REF_LIMIT) { int dif = v.lengthDif(); if (aloc < bloc) { // skip insertions while (dif > 0) { // System.err.print("i"); i++; if (i >= vars.size()) { break; } v = vars.get(i); dif = v.lengthDif(); } } else { // skip deletions while (dif < 0) { // System.err.print("d"); i++; if (i >= vars.size()) { break; } v = vars.get(i); dif = v.lengthDif(); } } } // Advance to variation's beginning while (aloc < v.beginLoc) { byte b = cha.get(aloc); chb.set(bloc, b); aloc++; bloc++; } // Apply variation if (v.varType == Variation.SNP) { String call = v.call; String ref = v.ref; if (ref != null && ref.equals("=")) { ref = null; } for (int j = 0; j < call.length(); j++) { char c = call.charAt(j); if (ref != null) { assert (ref.charAt(j) == cha.get(aloc)) : "\n" + i + ", " + v; } chb.set(bloc, c); aloc++; bloc++; } } else if (v.varType == Variation.DELINS) { String call = v.call; for (int j = 0; j < call.length(); j++) { char c = call.charAt(j); chb.set(bloc, c); bloc++; } aloc += v.lengthRef(); } else if (v.varType == Variation.NOCALL) { // Do nothing. But, it should have been removed already. 
if (!foundNocall) { System.err.println("*** Warning - found a nocall in input variations ***"); foundNocall = true; } } else if (v.varType == Variation.NOREF) { String call = v.call; for (int j = 0; j < call.length(); j++) { char c = call.charAt(j); assert (cha.get(aloc) == 'N') : cha.get(aloc); chb.set(bloc, c); aloc++; bloc++; } } else if (v.varType == Variation.INS) { String call = v.call; for (int j = 0; j < call.length(); j++) { char c = call.charAt(j); chb.set(bloc, c); bloc++; } } else if (v.varType == Variation.DEL) { int len = v.lengthRef(); assert (len > 0); aloc += len; } } // Finish writing array while (aloc < cha.array.length || aloc <= cha.maxIndex) { byte c = cha.get(aloc); chb.set(bloc, c); aloc++; bloc++; } System.out.println("Length Shift for chr" + chrom + ": \t" + (bloc - aloc)); Data.unload(chrom, true); cha = null; if (REGEN_N_BLOCKS) { chb = regenNBlocks(chb, N_BLOCK_SIZE, N_BLOCK_TRIGGER, N_BLOCK_END_SIZE); } chb.resize(chb.maxIndex + 1); // Can't do this because it is read later // if(THREADS==1){ReadWrite.writeObjectInThread(cac, outChromName);} // else{ReadWrite.write(cac, outChromName);} ReadWrite.write(chb, outChromName, false); }
public static void main(String[] args) { String in1 = args[0]; String in2 = (args.length < 2 || args[1].equalsIgnoreCase("null") || args[1].contains("=") ? null : args[1]); if (in2 != null) { assert (!in1.equalsIgnoreCase(in2)); FASTQ.TEST_INTERLEAVED = false; } else { FASTQ.TEST_INTERLEAVED = true; FASTQ.FORCE_INTERLEAVED = true; } long maxReads = -1; for (int i = 1; i < args.length; i++) { final String arg = args[i]; final String[] split = arg.split("="); String a = split[0].toLowerCase(); String b = (split.length > 1 ? split[1] : "true"); if (Parser.isJavaFlag(arg)) { // jvm argument; do nothing } else if (Parser.parseZip(arg, a, b)) { // do nothing } else if (Parser.parseQuality(arg, a, b)) { // do nothing } else if (Parser.parseFasta(arg, a, b)) { // do nothing } else if (a.equals("reads") || a.startsWith("maxreads")) { maxReads = Tools.parseKMG(b); } else { throw new RuntimeException("Unknown parameter " + args[i]); } } Parser.processQuality(); assert (FastaReadInputStream.settingsOK()); Timer t = new Timer(); ConcurrentReadInputStream cris = getReadInputStream(maxReads, false, true, in1, in2); System.out.println("Fetched " + cris.getClass().getName()); { Object[] p = cris.producers(); // while(p[0]==null){ // p=cris.producers(); // } System.out.print("Producers: "); String comma = ""; for (Object o : p) { System.out.print(comma + (o == null ? "null" : o.getClass().getName())); comma = ", "; } System.out.println(); } boolean paired = cris.paired(); System.out.println("paired=" + paired); cris.start(); // 4567 ListNum<Read> ln = cris.nextList(); ArrayList<Read> reads = (ln != null ? 
ln.list : null); if (reads != null && !reads.isEmpty()) { Read r = reads.get(0); assert ((r.mate != null) == paired); } long readCount = 0; long baseCount = 0; while (reads != null && reads.size() > 0) { for (Read r : reads) { Read r2 = r.mate; if (r != null) { readCount++; if (r.bases != null) { baseCount += r.length(); } } if (r2 != null) { readCount++; if (r2.bases != null) { baseCount += r2.length(); } } } cris.returnList(ln.id, ln.list.isEmpty()); // System.err.println("fetching list"); ln = cris.nextList(); reads = (ln != null ? ln.list : null); // System.out.println("reads: "+(reads==null ? "null" : reads.size())); } System.err.println("Finished reading"); cris.returnList(ln.id, ln.list.isEmpty()); cris.close(); t.stop(); System.out.println("Reads: \t" + readCount); System.out.println("Bases: \t" + baseCount); System.out.println("Avg Length: \t" + String.format("%.2f", baseCount * 1.0 / readCount)); System.out.println("Time: \t" + t); }