/** * This need to be thread safe * * @param seq * @param outStream * @throws IOException */ private void processReads(Sequence seq, PrintStream outStream) throws IOException { NuclKmerGenerator kmerGenerator; Kmer kmer; boolean found = false; kmerGenerator = new NuclKmerGenerator(seq.getSeqString(), kmerSize); while (kmerGenerator.hasNext()) { kmer = kmerGenerator.next(); for (int i = 0; i < kmerMaps.length; i++) { // for forward and reverse direction KmerAbund kmerAbund = kmerMaps[i].get(kmer); if (kmerAbund != null) { // increment the count kmerAbund.count.addAndGet(1); found = true; } } } if (found) { totalReads.incrementAndGet(); } if (outStream != null && found) { writeSeq(seq, outStream); } }
/**
 * Constructor intended for JUnit tests: builds the kmer maps from the contigs, then processes
 * every read from {@code readsReader} on the calling thread.
 *
 * <p>Reads shorter than {@code kmerSize} are skipped. After the last read, {@code readsReader} is
 * closed, and {@code outStream} is closed too when it is not {@code null}.
 *
 * @param kmerSize length of the kmers
 * @param contigReader source of the contig sequences
 * @param readsReader source of the reads
 * @param outStream passed to the read processing step for each read; may be {@code null}
 * @throws IOException propagated from the contig and read processing steps
 */
public KmerCoverage(
    int kmerSize, SequenceReader contigReader, SeqReaderCore readsReader, PrintStream outStream)
    throws IOException {
  this.kmerSize = kmerSize;
  kmerMaps[0] = new ConcurrentHashMap<Kmer, KmerAbund>(); // forward direction
  kmerMaps[1] = new ConcurrentHashMap<Kmer, KmerAbund>(); // reverse direction

  processContigFile(contigReader);

  for (Sequence read = readsReader.readNextSequence();
      read != null;
      read = readsReader.readNextSequence()) {
    // a read shorter than one kmer cannot contribute anything
    if (read.getSeqString().length() >= kmerSize) {
      processReads(read, outStream);
    }
  }
  readsReader.close();

  if (outStream != null) {
    outStream.close();
  }
}
/** * find the kmers in the contigs * * @param reader * @throws IOException */ private void processContigFile(SequenceReader reader) throws IOException { Sequence seq; NuclKmerGenerator kmerGenerator; Kmer kmer; int contigIdx = 0; while ((seq = reader.readNextSequence()) != null) { if (seq.getSeqString().length() < kmerSize) { continue; } // use int to represent seqname in case contig names are too long contigMap.put( contigIdx, new Contig(seq.getSeqName(), seq.getSeqString().length() - kmerSize + 1)); // forward direction kmerGenerator = new NuclKmerGenerator(seq.getSeqString(), kmerSize); while (kmerGenerator.hasNext()) { kmer = kmerGenerator.next(); KmerAbund kmerAbund = kmerMaps[0].get(kmer); if (kmerAbund == null) { kmerAbund = new KmerAbund(); kmerMaps[0].put(kmer, kmerAbund); } kmerAbund.contigList.add(new ContigCoverage(contigIdx, kmerGenerator.getPosition() - 1)); } // reverse direction kmerGenerator = new NuclKmerGenerator(IUBUtilities.reverseComplement(seq.getSeqString()), kmerSize); while (kmerGenerator.hasNext()) { kmer = kmerGenerator.next(); KmerAbund kmerAbund = kmerMaps[1].get(kmer); if (kmerAbund == null) { kmerAbund = new KmerAbund(); kmerMaps[1].put(kmer, kmerAbund); } kmerAbund.contigList.add( new ContigCoverage( contigIdx, seq.getSeqString().length() - kmerGenerator.getPosition() - kmerSize + 1)); } contigIdx++; } reader.close(); }
/**
 * Writes a sequence by handing its name, description and sequence string to the three-argument
 * {@code writeSeq} overload.
 *
 * @param s the sequence to write
 */
public void writeSeq(Sequence s) {
  final String name = s.getSeqName();
  final String description = s.getDesc();
  final String bases = s.getSeqString();
  writeSeq(name, description, bases);
}
/** * This program maps the kmers from reads to kmers on each contig, writes the mean, median * coverage of each contig to a file writes the kmer abundance to a file * * @param args * @throws IOException */ public static void main(String[] args) throws IOException, InterruptedException { int kmerSize = 45; final int maxThreads; final int maxTasks = 1000; final PrintStream match_reads_out; try { CommandLine cmdLine = new PosixParser().parse(options, args); args = cmdLine.getArgs(); if (args.length < 5) { throw new Exception("Unexpected number of arguments"); } kmerSize = Integer.parseInt(args[0]); if (kmerSize > Kmer.max_nucl_kmer_size) { throw new Exception("kmerSize should be less than " + Kmer.max_nucl_kmer_size); } if (cmdLine.hasOption("match_reads_out")) { match_reads_out = new PrintStream(cmdLine.getOptionValue("match_reads_out")); } else { match_reads_out = null; } if (cmdLine.hasOption("threads")) { maxThreads = Integer.valueOf(cmdLine.getOptionValue("threads")); if (maxThreads >= Runtime.getRuntime().availableProcessors()) { System.err.println( " Runtime.getRuntime().availableProcessors() " + Runtime.getRuntime().availableProcessors()); } } else { maxThreads = 1; } final KmerCoverage kmerCoverage = new KmerCoverage(kmerSize, new SequenceReader(new File(args[1]))); if (kmerCoverage.getTotalContigs() == 0) { System.out.println( "Found 0 contig with length >= kmer size " + kmerSize + " in input file " + args[1] + ". 
Exit program."); return; } final AtomicInteger outstandingTasks = new AtomicInteger(); ExecutorService service = Executors.newFixedThreadPool(maxThreads); Sequence seq; // parse one file at a time for (int index = 4; index < args.length; index++) { SequenceReader reader = new SequenceReader(new File(args[index])); while ((seq = reader.readNextSequence()) != null) { if (seq.getSeqString().length() < kmerSize) { continue; } final Sequence threadSeq = seq; Runnable r = new Runnable() { public void run() { try { kmerCoverage.processReads(threadSeq, match_reads_out); outstandingTasks.decrementAndGet(); } catch (Exception e) { e.printStackTrace(); } } }; outstandingTasks.incrementAndGet(); service.submit(r); while (outstandingTasks.get() >= maxTasks) ; } reader.close(); } service.shutdown(); service.awaitTermination(1, TimeUnit.DAYS); kmerCoverage.printCovereage( new FileOutputStream(new File(args[2])), new FileOutputStream(new File(args[3]))); if (match_reads_out != null) { match_reads_out.close(); } } catch (Exception e) { new HelpFormatter() .printHelp( "KmerCoverage <kmerSize> <query_file> <coverage_out> <abundance_out> <reads_file> <reads_file>...\nmaximum kmerSize " + Kmer.max_nucl_kmer_size, options); e.printStackTrace(); System.exit(1); } }
/**
 * Writes one sequence to the stream as a {@code >} header line carrying the sequence name,
 * followed by a line with the sequence string. Synchronized on this instance.
 *
 * @param seq the sequence to write
 * @param outStream the stream to write to
 */
private synchronized void writeSeq(Sequence seq, PrintStream outStream) {
  StringBuilder record = new StringBuilder();
  record.append(">").append(seq.getSeqName());
  record.append("\n").append(seq.getSeqString());
  outStream.println(record.toString());
}
/**
 * Loads a bloom filter from a file, adds the sequence strings of every read in the read file to
 * it through a graph builder, prints the filter statistics, and serializes the filter to a new
 * file whose name is the original path plus {@code .appended}.
 *
 * <p>Progress and timing are reported on standard error; the statistics go to standard output.
 *
 * @param args {@code <bloomfilter> <read_file>}; any other argument count prints the usage line
 *     and exits with status 1
 * @throws Exception any failure while loading, reading or writing is propagated
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("USAGE: BloomFilterAppender <bloomfilter> <read_file>");
    System.exit(1);
  }

  final File bloomFile = new File(args[0]);
  final BloomFilter filter = BloomFilter.fromFile(bloomFile);
  final BloomFilter.GraphBuilder builder = filter.new GraphBuilder();
  // everything after the bloom filter path is treated as a read file
  final String[] readPaths = Arrays.copyOfRange(args, 1, args.length);

  System.err.println("Starting to build bloom filter at " + new Date());
  System.err.println("* reads file(s): " + Arrays.asList(readPaths));
  System.err.println("* bloom output: " + bloomFile);
  System.err.println("* kmer size: " + filter.getKmerSize());
  System.err.println("* hash size log2: " + filter.getHashSizeLog2());
  System.err.println("* hash count: " + filter.getHashCount());
  System.err.println("* bitset size log2: " + filter.getBitsetSize());

  final long startedAt = System.currentTimeMillis();
  long processed = 0;

  for (int i = 0; i < readPaths.length; i++) {
    SequenceReader reader = new SequenceReader(new File(readPaths[i]));
    for (Sequence read = reader.readNextSequence();
        read != null;
        read = reader.readNextSequence()) {
      processed++;
      // progress report every million sequences, printed before the current one is added
      if (processed % 1000000 == 0) {
        System.err.println("p: " + processed + " kmers added " + builder.getKmerAdded());
      }
      builder.addString(read.getSeqString().toCharArray());
    }
    reader.close();
  }

  BloomFilterStats.printStats(filter, System.out);

  final long elapsedMillis = System.currentTimeMillis() - startedAt;
  System.err.println("time to build BloomFilter: " + elapsedMillis / 60000.0 + " minutes");

  final String appendedPath = bloomFile.getAbsolutePath() + ".appended";
  ObjectOutputStream out =
      new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(appendedPath)));
  out.writeObject(filter);
  out.close();
}