/** * find the kmers in the contigs * * @param reader * @throws IOException */ private void processContigFile(SequenceReader reader) throws IOException { Sequence seq; NuclKmerGenerator kmerGenerator; Kmer kmer; int contigIdx = 0; while ((seq = reader.readNextSequence()) != null) { if (seq.getSeqString().length() < kmerSize) { continue; } // use int to represent seqname in case contig names are too long contigMap.put( contigIdx, new Contig(seq.getSeqName(), seq.getSeqString().length() - kmerSize + 1)); // forward direction kmerGenerator = new NuclKmerGenerator(seq.getSeqString(), kmerSize); while (kmerGenerator.hasNext()) { kmer = kmerGenerator.next(); KmerAbund kmerAbund = kmerMaps[0].get(kmer); if (kmerAbund == null) { kmerAbund = new KmerAbund(); kmerMaps[0].put(kmer, kmerAbund); } kmerAbund.contigList.add(new ContigCoverage(contigIdx, kmerGenerator.getPosition() - 1)); } // reverse direction kmerGenerator = new NuclKmerGenerator(IUBUtilities.reverseComplement(seq.getSeqString()), kmerSize); while (kmerGenerator.hasNext()) { kmer = kmerGenerator.next(); KmerAbund kmerAbund = kmerMaps[1].get(kmer); if (kmerAbund == null) { kmerAbund = new KmerAbund(); kmerMaps[1].put(kmer, kmerAbund); } kmerAbund.contigList.add( new ContigCoverage( contigIdx, seq.getSeqString().length() - kmerGenerator.getPosition() - kmerSize + 1)); } contigIdx++; } reader.close(); }
/**
 * Writes the per-contig coverage report and the k-mer abundance histogram.
 *
 * <p>{@code adjustCount()} is called first. The coverage report then gets three header lines
 * followed by one tab-separated line per contig in {@code contigMap}: name, mean coverage,
 * median coverage, number of positions, number of positions with coverage greater than zero,
 * and the covered ratio. Contigs without any covered position are written with zeros.
 *
 * <p>The abundance report lists, for each distinct abundance value, how many k-mers have that
 * abundance. The abundance of a k-mer from {@code kmerMaps[0]} is its own count plus the count
 * of its reverse complement in {@code kmerMaps[1]}, when that entry exists. The stored counts
 * are only read, never modified, so calling this method does not alter the k-mer maps.
 *
 * <p>Both output streams are wrapped in a {@link PrintStream} and closed by this method; the
 * coverage stream is closed before the abundance stream is opened.
 *
 * @param coverage_out destination of the contig coverage table; closed by this method
 * @param abundance_out destination of the k-mer abundance histogram; closed by this method
 * @throws IOException declared by the signature. NOTE(review): presumably raised by
 *     {@code adjustCount()}, since {@code PrintStream} does not throw it -- confirm
 */
public void printCovereage(OutputStream coverage_out, OutputStream abundance_out)
        throws IOException {
    adjustCount();

    // print out the weighted kmer coverage
    // we found mean coverage matched the previous biological observation
    PrintStream coverage_outStream = new PrintStream(coverage_out);
    try {
        coverage_outStream.println("#total reads: " + totalReads.intValue());
        coverage_outStream.println("#use mean_cov to adjust the contig abundance, not median_cov ");
        coverage_outStream.println(
                "#seqid\tmean_cov\tmedian_cov\ttotal_pos\tcovered_pos\tcovered_ratio");

        for (Contig contig : contigMap.values()) {
            ArrayList<Double> counts = new ArrayList<Double>();
            int coveredPos = 0;
            for (int pos = 0; pos < contig.coverage.length; pos++) {
                if (contig.coverage[pos] > 0) {
                    coveredPos++;
                }
                counts.add(contig.coverage[pos]);
            }

            if (coveredPos > 0) {
                coverage_outStream.println(
                        contig.name
                                + "\t" + String.format(dformat, StdevCal.calMean(counts))
                                + "\t" + String.format(dformat, (StdevCal.calMedian(counts)))
                                + "\t" + counts.size()
                                + "\t" + coveredPos
                                + "\t" + String.format(
                                        dformat,
                                        (double) coveredPos / (double) contig.coverage.length));
            } else {
                // no coverage
                coverage_outStream.println(
                        contig.name
                                + "\t" + 0
                                + "\t" + 0
                                + "\t" + contig.coverage.length
                                + "\t" + 0
                                + "\t" + 0);
            }
        }
    } finally {
        coverage_outStream.close();
    }

    // print kmer abundance
    // maps an abundance value to the frequency of the kmer abundance
    HashMap<Integer, Integer> abundanceCountMap = new HashMap<Integer, Integer>();
    PrintStream abundance_outStream = new PrintStream(abundance_out);
    try {
        // need to merge the counts from forward and reverse together.
        for (Kmer kmer : kmerMaps[0].keySet()) {
            // sum into a local value: adding onto the stored AtomicInteger would permanently
            // inflate the forward count as a side effect of printing
            int abundance = kmerMaps[0].get(kmer).count.get();

            String reverseKmerStr =
                    IUBUtilities.reverseComplement(kmer.decodeLong(kmer.getLongKmers()));
            Kmer reverseKmer = (new NuclKmerGenerator(reverseKmerStr, this.kmerSize)).next();
            KmerAbund reverseAbund = kmerMaps[1].get(reverseKmer);
            if (reverseAbund != null) {
                abundance += reverseAbund.count.get();
            }

            Integer frequency = abundanceCountMap.get(abundance);
            abundanceCountMap.put(abundance, frequency == null ? 1 : frequency + 1);
        }

        abundance_outStream.println("kmer_abundance\tfrequency");
        for (Integer abundance : abundanceCountMap.keySet()) {
            abundance_outStream.println(abundance + "\t" + abundanceCountMap.get(abundance));
        }
    } finally {
        abundance_outStream.close();
    }
}