/** * Determines the dimensions of the merged matrix, and creates a RandomAccessFile on disk large * enough to hold the data. */ public static final void createMergeOutfile(String[] infiles, String outfile) { TagCountMutable theTCM = new TagCountMutable( 2, maxTags); // 50,000,000 works on a Windows7 machine with 4GB of RAM - JCG. myLogger.info("Mutable Tag Count Created. MaxTags:" + theTCM.getSize()); myLogger.info("CurrentSize:" + theTCM.getCurrentSize()); TreeSet<String> allTaxa = new TreeSet<>(); int filesRead = 0; for (String inName : infiles) { filesRead++; myLogger.info("Scanning " + inName + " (file " + filesRead + " of " + infiles.length + ")."); TagsByTaxa tbtFM = newTBT(inName); if (combineSynonymousTaxa) { tbtFM.truncateTaxonNames(); } theTCM.addReadCounts(tbtFM, 1); for (String name : tbtFM.getTaxaNames()) { allTaxa.add(name); } myLogger.info("CurrentSize:" + theTCM.getCurrentSize()); myLogger.info("Current Taxa:" + allTaxa.size()); theTCM.collapseCounts(); myLogger.info("CurrentSize:" + theTCM.getCurrentSize()); myLogger.info("Size:" + theTCM.getSize()); } theTCM.shrinkToCurrentRows(); myLogger.info("Size:" + theTCM.getSize()); String[] tn = allTaxa.toArray(new String[0]); TagsByTaxa tbtOut = newTBT(outfile, tn, theTCM); }
// This constructor works for tbt file public void getClusters(TagsByTaxa tbt) { ArrayList<cluster> clList = new ArrayList(); TagMatchFinder tmf = new TagMatchFinder(tbt); for (int i = 0; i < tbt.getTagCount(); i++) { long[] qTag = tbt.getTag(i); TreeMap<Integer, Integer> hitDiv = tmf.findMatchesWithIntLengthWords(qTag, 1, false); for (Entry<Integer, Integer> each : hitDiv.entrySet()) { if (each.getValue() > 0) { clList.add(new cluster(i, each.getKey(), hitDiv.size(), true)); } } } cls = clList.toArray(new cluster[clList.size()]); // Arrays.sort(cls); }
/**
 * Inserts tag count values into the output file created by
 * {@link #createMergeOutfile(String[], String)}.
 *
 * <p>The output is opened with {@code newTBT(outfile)} and switched to by-rows mode. Each input
 * file is then opened in turn and its taxa are mapped onto the output's taxa with
 * {@code taxaRedirect}. For every input tag that exists in the output, each mapped taxon's read
 * count is added to the count already stored in the output. Tags for which
 * {@code getTagIndex} returns a negative value, and taxa whose redirect entry is negative, are
 * skipped. A cell is only written when the summed count is greater than zero. Finally
 * {@code getFileReadyForClosing()} is called on the output.
 *
 * <p>When {@code combineSynonymousTaxa} is set, taxon names of each input are truncated via
 * {@code truncateTaxonNames()} before the taxa are mapped.
 *
 * @param infiles paths of the input files whose counts are merged
 * @param outfile path of the existing merged output file to fill
 */
public static final void fillMergeOutfile(String[] infiles, String outfile) {
    TagsByTaxa merged = newTBT(outfile);
    merged.setMethodByRows(true);

    // Running total of output cells written across all input files.
    int cellsWritten = 0;

    for (int fileIdx = 0; fileIdx < infiles.length; fileIdx++) {
        String inName = infiles[fileIdx];
        myLogger.info(
                "Scanning " + inName + " (file " + (fileIdx + 1) + " of " + infiles.length + ").");

        TagsByTaxa source = newTBT(inName);
        if (combineSynonymousTaxa) {
            source.truncateTaxonNames();
        }

        // taxonMap[srcTaxon] is the output taxon index, or negative when there is none.
        int[] taxonMap = taxaRedirect(source.getTaxaNames(), merged.getTaxaNames());

        for (int srcTag = 0; srcTag < source.getTagCount(); srcTag++) {
            int destTag = merged.getTagIndex(source.getTag(srcTag));
            if (destTag >= 0) {
                for (int srcTaxon = 0; srcTaxon < source.getTaxaCount(); srcTaxon++) {
                    int destTaxon = taxonMap[srcTaxon];
                    if (destTaxon >= 0) {
                        int existing = merged.getReadCountForTagTaxon(destTag, destTaxon);
                        int incoming = source.getReadCountForTagTaxon(srcTag, srcTaxon);
                        int summed = existing + incoming;
                        if (summed > 0) {
                            merged.setReadCountForTagTaxon(destTag, destTaxon, summed);
                            cellsWritten++;
                        }
                    }
                }
                // NOTE(review): progress is keyed on the running cell total, so it prints for
                // every processed tag while the total is still 0 and otherwise only when the
                // total lands exactly on a multiple of 100000 - confirm this is intended.
                if (cellsWritten % 100000 == 0) {
                    System.out.printf("Tag:%d BitSet:%d %n", srcTag, cellsWritten);
                }
            }
        }
    }

    merged.getFileReadyForClosing();
}