Beispiel #1
0
  /**
   * GT smoothing with least squares interpolation. This follows the procedure in Jurafsky and
   * Martin sect. 4.5.3.
   */
  public void smoothAndNormalize() {
    Counter<Integer> cntCounter = new Counter<Integer>();
    for (K tok : lm.keySet()) {
      int cnt = (int) lm.getCount(tok);
      cntCounter.incrementCount(cnt);
    }

    final double[] coeffs = runLogSpaceRegression(cntCounter);

    UNK_PROB = cntCounter.getCount(1) / lm.totalCount();

    for (K tok : lm.keySet()) {
      double tokCnt = lm.getCount(tok);
      if (tokCnt <= unkCutoff) // Treat as unknown
      unkTokens.add(tok);
      if (tokCnt <= kCutoff) { // Smooth
        double cSmooth = katzEstimate(cntCounter, tokCnt, coeffs);
        lm.setCount(tok, cSmooth);
      }
    }

    // Normalize
    // Counters.normalize(lm);
    // MY COUNTER IS ALWAYS NORMALIZED AND AWESOME
  }
Beispiel #2
0
 public Counter<String> getLogScoreCounter(LocalTrigramContext localTrigramContext) {
   int position = localTrigramContext.getPosition();
   String word = localTrigramContext.getWords().get(position);
   Counter<String> tagCounter = unknownWordTags;
   if (wordsToTags.keySet().contains(word)) {
     tagCounter = wordsToTags.getCounter(word);
   }
   Set<String> allowedFollowingTags =
       allowedFollowingTags(
           tagCounter.keySet(),
           localTrigramContext.getPreviousPreviousTag(),
           localTrigramContext.getPreviousTag());
   Counter<String> logScoreCounter = new Counter<String>();
   for (String tag : tagCounter.keySet()) {
     double logScore = Math.log(tagCounter.getCount(tag));
     if (!restrictTrigrams
         || allowedFollowingTags.isEmpty()
         || allowedFollowingTags.contains(tag)) logScoreCounter.setCount(tag, logScore);
   }
   return logScoreCounter;
 }