示例#1
0
 /**
  * Reads the next value from the input, assembling a multi-byte sequence
  * in the cache when the first byte is 0xC0 or larger.
  *
  * @param ti text input to read bytes from
  * @return the first byte if it is smaller than 0x80, the result of
  *   {@code invalid()} for a rejected byte, or the value computed by
  *   {@code Token.cp} from the cached bytes
  * @throws IOException I/O exception
  */
 @Override
 int read(final TextInput ti) throws IOException {
   final int first = ti.readByte();
   // anything below 0x80 is handed back unchanged
   if (first < 0x80) return first;
   // 0x80-0xBF is rejected as a first byte
   if (first < 0xC0) return invalid();

   cache[0] = (byte) first;
   // number of bytes in the sequence, derived from the first byte
   final int size = Token.cl((byte) first);
   int pos = 1;
   while (pos < size) {
     final int next = ti.readByte();
     // a following byte below 0x80 is rejected
     if (next < 0x80) return invalid();
     cache[pos++] = (byte) next;
   }
   return Token.cp(cache, 0);
 }
示例#2
0
  /**
   * Stems the specified word by applying rules 0 to 22 in sequence.
   *
   * @param word word to be stemmed
   * @return the original array if the word has fewer than 4 codepoints,
   *   otherwise a new token built from the remaining characters
   */
  @Override
  protected byte[] stem(final byte[] word) {
    // decode the token into a character buffer, one entry per codepoint
    final int bytes = word.length;
    final char[] chars = new char[bytes];
    int size = 0;
    int pos = 0;
    while (pos < bytes) {
      chars[size++] = (char) Token.cp(word, pos);
      pos += Token.cl(word, pos);
    }
    // words with fewer than 4 characters are returned untouched
    if (size < 4) return word;

    // each rule receives the current length and returns the new one
    int len = rule0(chars, size);
    len = rule1(chars, len);
    len = rule2(chars, len);
    len = rule3(chars, len);
    len = rule4(chars, len);
    len = rule5(chars, len);
    len = rule6(chars, len);
    len = rule7(chars, len);
    len = rule8(chars, len);
    len = rule9(chars, len);
    len = rule10(chars, len);
    len = rule11(chars, len);
    len = rule12(chars, len);
    len = rule13(chars, len);
    len = rule14(chars, len);
    len = rule15(chars, len);
    len = rule16(chars, len);
    len = rule17(chars, len);
    len = rule18(chars, len);
    len = rule19(chars, len);
    len = rule20(chars, len);
    // rule 21 runs only if the length after rules 0-20 equals the initial length
    if (len == size) len = rule21(chars, len);
    // rule 22 is always applied
    len = rule22(chars, len);

    // re-encode the remaining characters
    final TokenBuilder result = new TokenBuilder(len << 1);
    for (int c = 0; c < len; c++) {
      result.add(chars[c]);
    }
    return result.finish();
  }
示例#3
0
 /**
  * Returns the length of the codepoint stored at the specified position.
  * The call is delegated to {@code Token.cl} on this object's {@code chars}.
  *
  * @param pos position
  * @return codepoint length
  */
 public int cl(final int pos) {
   return Token.cl(chars, pos);
 }