/**
 * Loads the document-frequency array from {@code dfStatsPath} into {@code dfs}.
 *
 * <p>The file is read as a leading {@code int} entry count followed by that many
 * VInt-encoded values. The call is a no-op when {@code dfs} has already been set.
 *
 * @param dfStatsPath location of the df statistics file, opened through {@code fileSys}
 * @throws IOException if the file cannot be opened or read
 * @throws RuntimeException if the stored entry count differs from {@code prefixSet.size()}
 */
private void loadDfs(Path dfStatsPath) throws IOException {
  if (dfs != null) {
    return;
  }

  FSDataInputStream dfStatsInput = fileSys.open(dfStatsPath);
  try {
    int l = dfStatsInput.readInt();
    if (l != prefixSet.size()) {
      throw new RuntimeException("df length mismatch: " + l + "\t" + prefixSet.size());
    }

    // Fill a local array first: publishing a half-read array to the field would make
    // the null check above treat a failed load as a completed one on the next call.
    int[] loaded = new int[l];
    for (int i = 0; i < l; i++) {
      loaded[i] = WritableUtils.readVInt(dfStatsInput);
    }
    dfs = loaded;
  } finally {
    // Release the stream on the mismatch and read-failure paths as well.
    dfStatsInput.close();
  }
}
/**
 * Loads the collection-frequency array from {@code cfStatsPath} into {@code cfs}.
 *
 * <p>The file is read as a leading {@code int} entry count followed by that many
 * VLong-encoded values. The call is a no-op when {@code cfs} has already been set.
 *
 * @param cfStatsPath location of the cf statistics file, opened through {@code fileSys}
 * @throws IOException if the file cannot be opened or read
 * @throws RuntimeException if the stored entry count differs from {@code prefixSet.size()}
 */
public void loadCfs(Path cfStatsPath) throws IOException {
  if (cfs != null) {
    return;
  }

  FSDataInputStream cfStatsInput = fileSys.open(cfStatsPath);
  try {
    int l = cfStatsInput.readInt();
    if (l != prefixSet.size()) {
      throw new RuntimeException("cf length mismatch: " + l + "\t" + prefixSet.size());
    }

    // Fill a local array first: publishing a half-read array to the field would make
    // the null check above treat a failed load as a completed one on the next call.
    long[] loaded = new long[l];
    for (int i = 0; i < l; i++) {
      loaded[i] = WritableUtils.readVLong(cfStatsInput);
    }
    cfs = loaded;
  } finally {
    // Release the stream on the mismatch and read-failure paths as well.
    cfStatsInput.close();
  }
}
/**
 * Creates the statistics holder and populates {@code prefixSet} from a serialized file.
 *
 * <p>Only the prefix set is read here; the {@code dfs} and {@code cfs} arrays are
 * filled by separate load methods.
 *
 * @param prefixSetPath location of the serialized prefix set
 * @param fs file system used to open {@code prefixSetPath}; retained in {@code fileSys}
 * @throws IOException if the file cannot be opened or deserialized
 */
public PrefixEncodedGlobalStatsWithIndex(Path prefixSetPath, FileSystem fs) throws IOException {
  fileSys = fs;

  FSDataInputStream termsInput = fileSys.open(prefixSetPath);
  try {
    prefixSet.readFields(termsInput);
  } finally {
    // Close the stream even when deserialization fails.
    termsInput.close();
  }
}
/**
 * Looks up the (df, cf) pair for a term.
 *
 * <p>The frequent-term maps are consulted first; the cached answer is returned only
 * when both maps exist and both contain the term. Otherwise the term is resolved
 * through {@code prefixSet} and the values are read from the {@code dfs} and
 * {@code cfs} arrays, which are dereferenced without a null check.
 *
 * @param term the term to look up
 * @return a pair holding df and cf, or {@code null} if {@code prefixSet} returns a
 *         negative id for the term
 */
public PairOfIntLong getStats(String term) {
  PairOfIntLong stats = new PairOfIntLong();

  if (frequentTermsDfs != null) {
    try {
      int cachedDf = frequentTermsDfs.get(term);
      LOGGER.info("[cached] df of " + term + ": " + cachedDf);

      if (frequentTermsCfs != null) {
        long cachedCf = frequentTermsCfs.get(term);
        LOGGER.info("[cached] cf of " + term + ": " + cachedCf);

        stats.set(cachedDf, cachedCf);
        return stats;
      }
    } catch (NoSuchElementException ignored) {
      // Term is missing from one of the frequent-term maps; use the full arrays below.
    }
  }

  int termId = prefixSet.getId(term);
  LOGGER.info("index of " + term + ": " + termId);
  if (termId < 0) {
    return null;
  }

  stats.set(dfs[termId], cfs[termId]);
  return stats;
}
/**
 * Builds {@code frequentTermsCfs}, a term-to-cf map covering the first {@code n}
 * entries of {@code idToTerm}.
 *
 * <p>Does nothing if the map already exists. {@code n} is capped at {@code cfs.length}.
 * NOTE(review): presumably {@code idToTerm} is ordered by descending frequency, which
 * would make these the most frequent terms; confirm where the array is built.
 *
 * @param n maximum number of terms to place in the map
 */
private void loadFrequentCfMap(int n) {
  if (frequentTermsCfs != null) {
    return;
  }
  frequentTermsCfs = new HMapKL<String>();

  final int limit = Math.min(n, cfs.length);
  for (int pos = 0; pos < limit; pos++) {
    int termId = idToTerm[pos];
    frequentTermsCfs.put(prefixSet.getTerm(termId), cfs[termId]);
  }
}
private void loadFrequentDfMap(int n) { if (frequentTermsDfs != null) return; frequentTermsDfs = new HMapKI<String>(); if (dfs.length < n) n = dfs.length; for (int id = 1; id <= n; id++) { frequentTermsDfs.put(prefixSet.getTerm(idToTerm[id - 1]), dfs[idToTerm[id - 1]]); } // return frequentTermsMap; }
public void printKeys() { System.out.println("Window: " + this.prefixSet.getWindowSize()); System.out.println("Length: " + this.length()); // int window = prefixSet.getWindow(); for (int i = 0; i < length() && i < 100; i++) { System.out.print(i + "\t" + prefixSet.getTerm(i)); if (dfs != null) System.out.print("\t" + dfs[i]); if (cfs != null) System.out.print("\t" + cfs[i]); System.out.println(); } }
public long getCF(String term) { // if(cfs == null) // throw new RuntimeException("CF-Stats must be loaded first!"); if (frequentTermsDfs != null) { try { long cf = frequentTermsCfs.get(term); LOGGER.info("[cached] df of " + term + ": " + cf); return cf; } catch (NoSuchElementException e) { } } int index = prefixSet.getId(term); LOGGER.info("index of " + term + ": " + index); if (index < 0) return -1; return cfs[index]; }
/**
 * Returns the number of terms, as reported by {@code prefixSet.size()}.
 *
 * @return the size of the prefix set
 */
public int length() {
  final int termCount = prefixSet.size();
  return termCount;
}