예제 #1
0
  /**
   * Returns the sum of the component estimates for the specified character slice: one
   * whitespace-model estimate per whitespace run returned by the tokenizer, one unknown-token-model
   * estimate per token whose symbol ID is negative, and one conditional token estimate per prefix
   * of the boundary-padded token ID sequence.
   *
   * <p>NOTE(review): the method and model names suggest every term is a base-2 log probability;
   * confirm against the model classes.
   *
   * @param cs Underlying character array.
   * @param start Index of first character in slice.
   * @param end Index of one past last character in slice.
   * @return Sum of the whitespace, unknown-token and conditional token estimates.
   */
  public double log2Estimate(char[] cs, int start, int end) {
    Strings.checkArgsStartEnd(cs, start, end);

    // Pass 1: read whitespace/token pairs; score each whitespace run immediately and
    // buffer the tokens. The factory is handed a length (end - start), not an end index.
    Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs, start, end - start);
    List<String> tokens = new ArrayList<String>();
    double total = 0.0;
    String current;
    do {
      String gap = tokenizer.nextWhitespace();
      total += mWhitespaceModel.log2Estimate(gap);
      current = tokenizer.nextToken();
      if (current != null) {
        tokens.add(current);
      }
    } while (current != null); // the whitespace read before the null token is still scored

    // Pass 2: map tokens to symbol IDs between two boundary markers; tokens with a
    // negative ID are additionally scored by the unknown token model.
    int[] ids = new int[tokens.size() + 2];
    ids[0] = TokenizedLM.BOUNDARY_TOKEN;
    ids[ids.length - 1] = TokenizedLM.BOUNDARY_TOKEN;
    int slot = 1;
    for (String tok : tokens) {
      int id = mSymbolTable.symbolToID(tok);
      ids[slot++] = id;
      if (id < 0) {
        total += mUnknownTokenModel.log2Estimate(tok);
      }
    }

    // Pass 3: one conditional estimate per prefix [0, prefixEnd); starting at 2 skips the
    // prefix holding only the leading boundary marker and ends with the full sequence.
    int prefixEnd = 2;
    while (prefixEnd <= ids.length) {
      total += conditionalTokenEstimate(ids, 0, prefixEnd);
      ++prefixEnd;
    }
    return total;
  }
 /**
  * Adds the specified count to the substrings of the specified character array slice, limited to
  * the maximum length held in {@code mMaxLength}. The work is delegated to {@link
  * #incrementPrefixes(char[],int,int,int)}: once for every window of exactly {@code mMaxLength}
  * characters that fits in the slice, and once for every shorter suffix that runs up to the end
  * of the slice.
  *
  * @param cs Underlying character array.
  * @param start Index of first character in slice.
  * @param end Index of one past last character in slice.
  * @param count Amount to increment.
  * @throws IndexOutOfBoundsException If the specified start and one plus end point are not in the
  *     bounds of character sequence.
  */
 public void incrementSubstrings(char[] cs, int start, int end, int count) {
   Strings.checkArgsStartEnd(cs, start, end);

   // Full-width windows: [from, from + mMaxLength) for each position where one fits.
   int from = start;
   while (from + mMaxLength <= end) {
     incrementPrefixes(cs, from, from + mMaxLength, count);
     ++from;
   }

   // Trailing windows shorter than mMaxLength: [tail, end) for the remaining start positions.
   int tail = Math.max(start, end - mMaxLength + 1);
   while (tail < end) {
     incrementPrefixes(cs, tail, end, count);
     ++tail;
   }
 }
 /**
  * Returns the result of {@code mRootNode.numOutcomes} for the specified character slice.
  *
  * <p>NOTE(review): presumably the number of distinct characters observed after the slice;
  * confirm against the node implementation.
  *
  * @param cs Underlying character array.
  * @param start Index of first character in slice.
  * @param end Index of one past last character in slice.
  * @return Outcome count reported by the root node for the slice.
  */
 public int numCharactersFollowing(char[] cs, int start, int end) {
   Strings.checkArgsStartEnd(cs, start, end);
   int outcomeCount = mRootNode.numOutcomes(cs, start, end);
   return outcomeCount;
 }
 /**
  * Returns a copy, made with {@code com.aliasi.util.Arrays.copy}, of the array produced by
  * {@code mRootNode.outcomes} for the specified character slice.
  *
  * <p>NOTE(review): presumably the characters observed after the slice, copied so that callers
  * cannot modify node-internal state; confirm against the node implementation.
  *
  * @param cs Underlying character array.
  * @param start Index of first character in slice.
  * @param end Index of one past last character in slice.
  * @return Copy of the outcomes reported by the root node for the slice.
  */
 public char[] charactersFollowing(char[] cs, int start, int end) {
   Strings.checkArgsStartEnd(cs, start, end);
   char[] copied = com.aliasi.util.Arrays.copy(mRootNode.outcomes(cs, start, end));
   return copied;
 }
 /**
  * Returns the result of {@code mRootNode.contextCount} for the specified character slice.
  *
  * <p>NOTE(review): presumably the total count of one-character extensions of the slice;
  * confirm against the node implementation.
  *
  * @param cs Underlying character array.
  * @param start Index of first character in slice.
  * @param end Index of one past last character in slice.
  * @return Context count reported by the root node for the slice.
  */
 public long extensionCount(char[] cs, int start, int end) {
   Strings.checkArgsStartEnd(cs, start, end);
   long contextTotal = mRootNode.contextCount(cs, start, end);
   return contextTotal;
 }
 /**
  * Decrements every substring of the specified character slice by calling {@code
  * mRootNode.decrement} once per (start, end) pair inside the slice, including the empty slice at
  * each start position. This method may be used in conjunction with {@link
  * #incrementSubstrings(char[],int,int)} to implement counts for conditional probability
  * estimates without affecting underlying estimates. For example, the following code:
  *
  * <blockquote>
  *
  * <pre>
  * char[] cs = &quot;abcdefghi&quot;.toCharArray();
  * counter.incrementSubstrings(cs,3,7);
  * counter.decrementSubstrings(cs,3,5);
  * </pre>
  *
  * </blockquote>
  *
  * first increments the substrings of <code>&quot;defg&quot;</code> and then decrements the
  * substrings of <code>&quot;de&quot;</code>. The net effect is to increment the counts of the
  * substrings <code>&quot;defg&quot;</code>, <code>&quot;efg&quot;</code>, <code>&quot;fg&quot;
  * </code>, <code>&quot;g&quot;</code>, <code>&quot;def&quot;</code>, <code>&quot;ef&quot;</code>,
  * and <code>&quot;f&quot;</code>. This increases the estimate of <code>g</code> given <code>def
  * </code>, without increasing the estimate of <code>d</code> in an empty context.
  *
  * @param cs Underlying array of characters in slice.
  * @param start Index of first character in slice.
  * @param end Index of one past last character in slice.
  * @throws IndexOutOfBoundsException If the array slice is not valid, as reported by {@code
  *     Strings.checkArgsStartEnd}. NOTE(review): exception type taken from the documentation of
  *     the sibling increment methods that perform the same check; confirm against the helper.
  */
 public void decrementSubstrings(char[] cs, int start, int end) {
   Strings.checkArgsStartEnd(cs, start, end);
   for (int from = start; from < end; ++from) {
     // to starts at from, so the empty slice [from, from) is decremented as well
     for (int to = from; to <= end; ++to) {
       mRootNode = mRootNode.decrement(cs, from, to);
     }
   }
 }
 /**
  * Adds the specified count to the prefixes of the specified character slice by delegating to
  * {@code mRootNode.increment} and storing the node it returns as the new root.
  *
  * <p>NOTE(review): the limit on prefix length (the maximum length given to the constructor) is
  * not enforced in this method; callers are expected to pass a slice that already respects it.
  *
  * @param cs Underlying character array.
  * @param start Index of first character in slice.
  * @param end Index of one past last character in slice.
  * @param count Amount to increment.
  * @throws IndexOutOfBoundsException If the specified start and one plus end point are not in the
  *     bounds of character sequence.
  */
 public void incrementPrefixes(char[] cs, int start, int end, int count) {
   Strings.checkArgsStartEnd(cs, start, end);
   // increment returns the node to use as root from now on, so the field is reassigned
   this.mRootNode = this.mRootNode.increment(cs, start, end, count);
 }