Ejemplo n.º 1
0
 public double tokenLog2Probability(String[] tokens, int start, int end) {
   int[] tokIds = new int[tokens.length];
   for (int i = 0; i < tokens.length; ++i) tokIds[i] = mSymbolTable.symbolToID(tokens[i]);
   double sum = 0.0;
   for (int i = start + 1; i <= end; ++i) sum += conditionalTokenEstimate(tokIds, start, i);
   return sum;
 }
Ejemplo n.º 2
0
  public double log2Estimate(char[] cs, int start, int end) {
    Strings.checkArgsStartEnd(cs, start, end);
    double logEstimate = 0.0;

    Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs, start, end - start);
    List<String> tokenList = new ArrayList<String>();
    while (true) {
      String whitespace = tokenizer.nextWhitespace();
      logEstimate += mWhitespaceModel.log2Estimate(whitespace);
      String token = tokenizer.nextToken();
      if (token == null) break;
      tokenList.add(token);
    }

    // collect token ids, estimate unknown tokens
    int[] tokIds = new int[tokenList.size() + 2];
    tokIds[0] = TokenizedLM.BOUNDARY_TOKEN;
    tokIds[tokIds.length - 1] = TokenizedLM.BOUNDARY_TOKEN;
    Iterator<String> it = tokenList.iterator();
    for (int i = 1; it.hasNext(); ++i) {
      String token = it.next();
      tokIds[i] = mSymbolTable.symbolToID(token);
      if (tokIds[i] < 0) {
        logEstimate += mUnknownTokenModel.log2Estimate(token);
      }
    }

    // estimate token ids
    for (int i = 2; i <= tokIds.length; ++i) {
      logEstimate += conditionalTokenEstimate(tokIds, 0, i);
    }
    return logEstimate;
  }