/** * GT smoothing with least squares interpolation. This follows the procedure in Jurafsky and * Martin sect. 4.5.3. */ public void smoothAndNormalize() { Counter<Integer> cntCounter = new Counter<Integer>(); for (K tok : lm.keySet()) { int cnt = (int) lm.getCount(tok); cntCounter.incrementCount(cnt); } final double[] coeffs = runLogSpaceRegression(cntCounter); UNK_PROB = cntCounter.getCount(1) / lm.totalCount(); for (K tok : lm.keySet()) { double tokCnt = lm.getCount(tok); if (tokCnt <= unkCutoff) // Treat as unknown unkTokens.add(tok); if (tokCnt <= kCutoff) { // Smooth double cSmooth = katzEstimate(cntCounter, tokCnt, coeffs); lm.setCount(tok, cSmooth); } } // Normalize // Counters.normalize(lm); // MY COUNTER IS ALWAYS NORMALIZED AND AWESOME }
/**
 * Builds a new counter in which every key of {@code counter} is mapped to its count divided
 * by {@code counter.totalCount()}. The input counter is only read, never written.
 *
 * <p>No check is made for a zero total; in that case the stored values are whatever the
 * floating-point division yields.
 *
 * @param <E> key type of the counter
 * @param counter the counter to rescale
 * @return a freshly created counter holding the rescaled counts
 */
public static <E> Counter<E> normalize(Counter<E> counter) {
  final double mass = counter.totalCount();
  final Counter<E> result = new Counter<E>();
  for (E entry : counter.keySet()) {
    final double share = counter.getCount(entry) / mass;
    result.setCount(entry, share);
  }
  return result;
}
/** * @param <E> * @param x * @param y * @return */ public static <E> double jensenShannonDivergence(Counter<E> x, Counter<E> y) { double sum = 0.0; double xTotal = x.totalCount(); double yTotal = y.totalCount(); for (E key : x.keySet()) { // x -> x+y/2 double xVal = x.getCount(key) / xTotal; double yVal = y.getCount(key) / yTotal; double avg = 0.5 * (xVal + yVal); sum += xVal * Math.log(xVal / avg); } for (E key : y.keySet()) { // y -> x+y/2 double xVal = x.getCount(key) / xTotal; double yVal = y.getCount(key) / yTotal; double avg = 0.5 * (xVal + yVal); sum += yVal * Math.log(yVal / avg); } return sum / 0.5; }
/**
 * Draws one key from {@code counter} at random, with each key chosen with probability
 * proportional to its count (count divided by {@code counter.totalCount()}).
 *
 * <p>Keys are visited in {@code keySet()} iteration order and their probabilities are
 * accumulated; the first key whose cumulative probability exceeds the random draw is
 * returned. Because the accumulated probabilities can round to slightly less than 1.0,
 * a draw that lands in that rounding gap is resolved to the last key with a positive
 * count instead of failing.
 *
 * @param <E> key type of the counter
 * @param counter the counter to sample from
 * @return the sampled key
 * @throws RuntimeException if the total count is not strictly positive (including NaN),
 *     if any key has a negative count, or if no key with a positive count is found
 */
public static <E> E sample(Counter<E> counter) {
  double total = counter.totalCount();
  // Written as a negated comparison so that a NaN total is rejected as well.
  if (!(total > 0.0)) {
    throw new RuntimeException("Non-positive counter total: " + total);
  }

  double rand = random.nextDouble();
  double sum = 0.0;
  E lastPositive = null;
  boolean sawPositive = false; // separate flag: the key type itself may permit null

  for (E key : counter.keySet()) {
    double count = counter.getCount(key);
    if (count < 0.0) {
      throw new RuntimeException("Negative count in counter: " + key + " => " + count);
    }
    if (count > 0.0) {
      lastPositive = key;
      sawPositive = true;
    }
    sum += count / total;
    if (rand < sum) {
      return key;
    }
  }

  // Floating-point rounding left the cumulative sum just below the draw.
  if (sawPositive) {
    return lastPositive;
  }
  throw new RuntimeException("Shouldn't Reach Here");
}
/**
 * Reports the total count currently held by {@code lm}.
 *
 * @return the value of {@code lm.totalCount()}
 */
public double totalMass() {
  final double mass = lm.totalCount();
  return mass;
}