/**
 * Returns the sum of conditional log (base 2) token estimates for the
 * tokens at positions {@code start+1} through {@code end} (inclusive),
 * each conditioned on the tokens from position {@code start} onward.
 *
 * @param tokens Array of tokens; every entry is mapped to a symbol id,
 *        even those outside the {@code start}/{@code end} span.
 * @param start First position of the conditioning context.
 * @param end Last position whose token is estimated.
 * @return Sum of the conditional log2 token estimates.
 */
public double tokenLog2Probability(String[] tokens, int start, int end) {
    // Convert the full token array to symbol ids up front.
    int[] ids = new int[tokens.length];
    for (int k = 0; k < ids.length; ++k)
        ids[k] = mSymbolTable.symbolToID(tokens[k]);
    // Accumulate one conditional estimate per position after start.
    double total = 0.0;
    for (int pos = start + 1; pos <= end; ++pos)
        total += conditionalTokenEstimate(ids, start, pos);
    return total;
}
public double log2Estimate(char[] cs, int start, int end) { Strings.checkArgsStartEnd(cs, start, end); double logEstimate = 0.0; Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs, start, end - start); List<String> tokenList = new ArrayList<String>(); while (true) { String whitespace = tokenizer.nextWhitespace(); logEstimate += mWhitespaceModel.log2Estimate(whitespace); String token = tokenizer.nextToken(); if (token == null) break; tokenList.add(token); } // collect token ids, estimate unknown tokens int[] tokIds = new int[tokenList.size() + 2]; tokIds[0] = TokenizedLM.BOUNDARY_TOKEN; tokIds[tokIds.length - 1] = TokenizedLM.BOUNDARY_TOKEN; Iterator<String> it = tokenList.iterator(); for (int i = 1; it.hasNext(); ++i) { String token = it.next(); tokIds[i] = mSymbolTable.symbolToID(token); if (tokIds[i] < 0) { logEstimate += mUnknownTokenModel.log2Estimate(token); } } // estimate token ids for (int i = 2; i <= tokIds.length; ++i) { logEstimate += conditionalTokenEstimate(tokIds, 0, i); } return logEstimate; }