Пример #1
0
  /**
   * GT smoothing with least squares interpolation. This follows the procedure in Jurafsky and
   * Martin sect. 4.5.3.
   */
  public void smoothAndNormalize() {
    Counter<Integer> cntCounter = new Counter<Integer>();
    for (K tok : lm.keySet()) {
      int cnt = (int) lm.getCount(tok);
      cntCounter.incrementCount(cnt);
    }

    final double[] coeffs = runLogSpaceRegression(cntCounter);

    UNK_PROB = cntCounter.getCount(1) / lm.totalCount();

    for (K tok : lm.keySet()) {
      double tokCnt = lm.getCount(tok);
      if (tokCnt <= unkCutoff) // Treat as unknown
      unkTokens.add(tok);
      if (tokCnt <= kCutoff) { // Smooth
        double cSmooth = katzEstimate(cntCounter, tokCnt, coeffs);
        lm.setCount(tok, cSmooth);
      }
    }

    // Normalize
    // Counters.normalize(lm);
    // MY COUNTER IS ALWAYS NORMALIZED AND AWESOME
  }
Пример #2
0
 public double getCount(K token) {
   if (!lm.keySet().contains(token)) {
     System.err.println(lm.keySet().size());
     throw new RuntimeException("token not in keyset");
   }
   return lm.getCount(token);
 }
Пример #3
0
  private double[] runLogSpaceRegression(Counter<Integer> cntCounter) {
    SimpleRegression reg = new SimpleRegression();

    for (int cnt : cntCounter.keySet()) {
      reg.addData(cnt, Math.log(cntCounter.getCount(cnt)));
    }

    // System.out.println(reg.getIntercept());
    // System.out.println(reg.getSlope());
    // System.out.println(regression.getSlopeStdErr());

    double[] coeffs = new double[] {reg.getIntercept(), reg.getSlope()};

    return coeffs;
  }
Пример #4
0
 public Set<K> getVocab() {
   return Collections.unmodifiableSet(lm.keySet());
 }
Пример #5
0
 public int vocabSize() {
   return lm.keySet().size();
 }