public double log2Estimate(char[] cs, int start, int end) { Strings.checkArgsStartEnd(cs, start, end); double logEstimate = 0.0; Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs, start, end - start); List<String> tokenList = new ArrayList<String>(); while (true) { String whitespace = tokenizer.nextWhitespace(); logEstimate += mWhitespaceModel.log2Estimate(whitespace); String token = tokenizer.nextToken(); if (token == null) break; tokenList.add(token); } // collect token ids, estimate unknown tokens int[] tokIds = new int[tokenList.size() + 2]; tokIds[0] = TokenizedLM.BOUNDARY_TOKEN; tokIds[tokIds.length - 1] = TokenizedLM.BOUNDARY_TOKEN; Iterator<String> it = tokenList.iterator(); for (int i = 1; it.hasNext(); ++i) { String token = it.next(); tokIds[i] = mSymbolTable.symbolToID(token); if (tokIds[i] < 0) { logEstimate += mUnknownTokenModel.log2Estimate(token); } } // estimate token ids for (int i = 2; i <= tokIds.length; ++i) { logEstimate += conditionalTokenEstimate(tokIds, 0, i); } return logEstimate; }
/** * Increments by the specified count all substrings of the specified character array slice up to * the maximum length specified in the constructor. * * @param cs Underlying character array. * @param start Index of first character in slice. * @param end Index of one past last character in slice. * @param count Amount to increment. * @throws IndexOutOfBoundsException If the specified start and one plus end point are not in the * bounds of character sequence. */ public void incrementSubstrings(char[] cs, int start, int end, int count) { Strings.checkArgsStartEnd(cs, start, end); // increment maximal strings and prefixes for (int i = start; i + mMaxLength <= end; ++i) incrementPrefixes(cs, i, i + mMaxLength, count); // increment short final strings and prefixes for (int i = Math.max(start, end - mMaxLength + 1); i < end; ++i) incrementPrefixes(cs, i, end, count); }
/**
 * Returns the number of outcomes the root node reports for the specified character slice; per the
 * method name, this is the number of characters following the slice.
 *
 * @param cs Underlying character array.
 * @param start Index of first character in slice.
 * @param end Index of one past last character in slice.
 * @return Value of the root node's {@code numOutcomes} for the slice.
 */
public int numCharactersFollowing(char[] cs, int start, int end) {
  Strings.checkArgsStartEnd(cs, start, end);
  final int outcomeCount = mRootNode.numOutcomes(cs, start, end);
  return outcomeCount;
}
/**
 * Returns a copy of the outcomes the root node reports for the specified character slice; per the
 * method name, these are the characters following the slice. The result is produced by {@code
 * com.aliasi.util.Arrays.copy}, so the array held by the node is not handed out directly.
 *
 * @param cs Underlying character array.
 * @param start Index of first character in slice.
 * @param end Index of one past last character in slice.
 * @return Copy of the root node's {@code outcomes} for the slice.
 */
public char[] charactersFollowing(char[] cs, int start, int end) {
  Strings.checkArgsStartEnd(cs, start, end);
  final char[] copied = com.aliasi.util.Arrays.copy(mRootNode.outcomes(cs, start, end));
  return copied;
}
/**
 * Returns the context count the root node reports for the specified character slice; per the
 * method name, this is the count of extensions of the slice.
 *
 * @param cs Underlying character array.
 * @param start Index of first character in slice.
 * @param end Index of one past last character in slice.
 * @return Value of the root node's {@code contextCount} for the slice.
 */
public long extensionCount(char[] cs, int start, int end) {
  Strings.checkArgsStartEnd(cs, start, end);
  final long contextTotal = mRootNode.contextCount(cs, start, end);
  return contextTotal;
}
/**
 * Decrements by one every substring of the specified character slice. Combined with {@link
 * #incrementSubstrings(char[],int,int)}, this allows counts for conditional probability estimates
 * to be adjusted without affecting the underlying estimates. For example, the following code:
 *
 * <blockquote>
 *
 * <pre>
 * char[] cs = "abcdefghi".toCharArray();
 * counter.incrementSubstrings(cs,3,7);
 * counter.decrementSubstrings(cs,3,5);
 * </pre>
 *
 * </blockquote>
 *
 * first increments the substrings of <code>"defg"</code> and then decrements the substrings of
 * <code>"de"</code>. The net effect is to increment the counts of the substrings
 * <code>"defg"</code>, <code>"efg"</code>, <code>"fg"</code>, <code>"g"</code>,
 * <code>"def"</code>, <code>"ef"</code>, and <code>"f"</code>. This increases the estimate of
 * <code>g</code> given <code>def</code> without increasing the estimate of <code>d</code> in an
 * empty context.
 *
 * @param cs Underlying array of characters in slice.
 * @param start Index of first character in slice.
 * @param end Index of one past last character in slice.
 * @throws IllegalArgumentException If the array slice is not valid.
 */
public void decrementSubstrings(char[] cs, int start, int end) {
  // NOTE(review): the increment methods document IndexOutOfBoundsException for this same
  // check -- confirm which exception type Strings.checkArgsStartEnd actually throws.
  Strings.checkArgsStartEnd(cs, start, end);

  int from = start;
  while (from < end) {
    // The inner cursor begins at from itself, so the empty slice (from, from) is also
    // passed to decrement, once per start position.
    int to = from;
    while (to <= end) {
      mRootNode = mRootNode.decrement(cs, from, to);
      ++to;
    }
    ++from;
  }
}
/**
 * Adds the specified count to all prefixes of the specified character slice, up to the maximum
 * length specified in the constructor. The root node is replaced by whatever node its {@code
 * increment} call returns.
 *
 * @param cs Underlying character array.
 * @param start Index of first character in slice.
 * @param end Index of one past last character in slice.
 * @param count Amount to increment.
 * @throws IndexOutOfBoundsException If the specified start and one plus end point are not in the
 *     bounds of character sequence.
 */
public void incrementPrefixes(char[] cs, int start, int end, int count) {
  // Validate the slice before touching the trie.
  Strings.checkArgsStartEnd(cs, start, end);

  // increment returns the node to use as root from now on, so the field is reassigned.
  this.mRootNode = this.mRootNode.increment(cs, start, end, count);
}