/** * GT smoothing with least squares interpolation. This follows the procedure in Jurafsky and * Martin sect. 4.5.3. */ public void smoothAndNormalize() { Counter<Integer> cntCounter = new Counter<Integer>(); for (K tok : lm.keySet()) { int cnt = (int) lm.getCount(tok); cntCounter.incrementCount(cnt); } final double[] coeffs = runLogSpaceRegression(cntCounter); UNK_PROB = cntCounter.getCount(1) / lm.totalCount(); for (K tok : lm.keySet()) { double tokCnt = lm.getCount(tok); if (tokCnt <= unkCutoff) // Treat as unknown unkTokens.add(tok); if (tokCnt <= kCutoff) { // Smooth double cSmooth = katzEstimate(cntCounter, tokCnt, coeffs); lm.setCount(tok, cSmooth); } } // Normalize // Counters.normalize(lm); // MY COUNTER IS ALWAYS NORMALIZED AND AWESOME }
/**
 * Returns the count stored in the language model for the given token.
 *
 * @param token the token to look up; must be present in the model's key set
 * @return the value of {@code lm.getCount(token)}
 * @throws IllegalArgumentException if {@code token} is not in the model's key set
 */
public double getCount(K token) {
  if (!lm.keySet().contains(token)) {
    // Report the offending token and the keyset size in the exception itself instead of
    // writing a bare number to stderr, so the failure is diagnosable from the stack trace.
    throw new IllegalArgumentException(
        "token not in keyset: " + token + " (keyset size: " + lm.keySet().size() + ")");
  }
  return lm.getCount(token);
}
private double[] runLogSpaceRegression(Counter<Integer> cntCounter) { SimpleRegression reg = new SimpleRegression(); for (int cnt : cntCounter.keySet()) { reg.addData(cnt, Math.log(cntCounter.getCount(cnt))); } // System.out.println(reg.getIntercept()); // System.out.println(reg.getSlope()); // System.out.println(regression.getSlopeStdErr()); double[] coeffs = new double[] {reg.getIntercept(), reg.getSlope()}; return coeffs; }
/**
 * Returns a read-only view of the model's vocabulary.
 *
 * @return an unmodifiable wrapper around {@code lm.keySet()}
 */
public Set<K> getVocab() {
  final Set<K> readOnlyVocab = Collections.unmodifiableSet(lm.keySet());
  return readOnlyVocab;
}
/**
 * Returns the number of distinct tokens in the model.
 *
 * @return the size of {@code lm.keySet()}
 */
public int vocabSize() {
  final int distinctTokens = lm.keySet().size();
  return distinctTokens;
}