/** * GT smoothing with least squares interpolation. This follows the procedure in Jurafsky and * Martin sect. 4.5.3. */ public void smoothAndNormalize() { Counter<Integer> cntCounter = new Counter<Integer>(); for (K tok : lm.keySet()) { int cnt = (int) lm.getCount(tok); cntCounter.incrementCount(cnt); } final double[] coeffs = runLogSpaceRegression(cntCounter); UNK_PROB = cntCounter.getCount(1) / lm.totalCount(); for (K tok : lm.keySet()) { double tokCnt = lm.getCount(tok); if (tokCnt <= unkCutoff) // Treat as unknown unkTokens.add(tok); if (tokCnt <= kCutoff) { // Smooth double cSmooth = katzEstimate(cntCounter, tokCnt, coeffs); lm.setCount(tok, cSmooth); } } // Normalize // Counters.normalize(lm); // MY COUNTER IS ALWAYS NORMALIZED AND AWESOME }
/**
 * Builds a counter of log scores for the candidate tags of the word at the context's
 * current position.
 *
 * <p>Candidate tags come from {@code wordsToTags} when the word is one of its keys, and from
 * {@code unknownWordTags} otherwise. Each candidate's score is the natural logarithm of its
 * count in that source counter. When {@code restrictTrigrams} is set and
 * {@code allowedFollowingTags(...)} returns a non-empty set for the two previous tags, only
 * candidates contained in that set are kept; in every other case all candidates are kept.
 *
 * @param localTrigramContext context supplying the word list, the current position and the
 *     two preceding tags
 * @return a new counter mapping each retained tag to the log of its count
 */
public Counter<String> getLogScoreCounter(LocalTrigramContext localTrigramContext) {
    int index = localTrigramContext.getPosition();
    String currentWord = localTrigramContext.getWords().get(index);

    // Known words use their own tag distribution; everything else uses the unknown-word one.
    Counter<String> candidateTags = wordsToTags.keySet().contains(currentWord)
            ? wordsToTags.getCounter(currentWord)
            : unknownWordTags;

    Set<String> permittedTags = allowedFollowingTags(
            candidateTags.keySet(),
            localTrigramContext.getPreviousPreviousTag(),
            localTrigramContext.getPreviousTag());

    Counter<String> result = new Counter<String>();
    for (String candidate : candidateTags.keySet()) {
        double score = Math.log(candidateTags.getCount(candidate));

        // Skip a candidate only when restriction is on, the permitted set is non-empty,
        // and the candidate is not in it.
        if (restrictTrigrams && !permittedTags.isEmpty() && !permittedTags.contains(candidate)) {
            continue;
        }
        result.setCount(candidate, score);
    }
    return result;
}