/**
 * Reads the next codepoint from the specified input.
 *
 * A first byte value below {@code 0x80} is returned unchanged; this includes
 * any negative value delivered by the input (presumably the end-of-input
 * marker; verify against {@code TextInput.readByte}). A first byte below
 * {@code 0xC0}, or any follow-up byte below {@code 0x80}, yields the result
 * of {@code invalid()}.
 *
 * @param ti text input the bytes are read from
 * @return value of the first byte, result of {@code invalid()}, or the
 *         codepoint assembled from the cached bytes
 * @throws IOException I/O exception
 */
@Override
int read(final TextInput ti) throws IOException {
  final int first = ti.readByte();
  if (first < 0x80) {
    return first;
  }
  if (first < 0xC0) {
    return invalid();
  }

  // the leading byte determines how many bytes the sequence consists of
  final byte lead = (byte) first;
  cache[0] = lead;
  final int size = Token.cl(lead);

  // collect the remaining bytes of the sequence
  int filled = 1;
  while (filled < size) {
    final int next = ti.readByte();
    if (next < 0x80) {
      return invalid();
    }
    cache[filled++] = (byte) next;
  }
  return Token.cp(cache, 0);
}
@Override protected byte[] stem(final byte[] word) { int ln = 0; final int wl = word.length; final char[] s = new char[wl]; for (int i = 0; i < wl; i += Token.cl(word, i)) { s[ln++] = (char) Token.cp(word, i); } if (ln < 4) return word; final int olen = ln; // "short rules": if it hits one of these, it skips the "long list" int l = rule0(s, ln); l = rule1(s, l); l = rule2(s, l); l = rule3(s, l); l = rule4(s, l); l = rule5(s, l); l = rule6(s, l); l = rule7(s, l); l = rule8(s, l); l = rule9(s, l); l = rule10(s, l); l = rule11(s, l); l = rule12(s, l); l = rule13(s, l); l = rule14(s, l); l = rule15(s, l); l = rule16(s, l); l = rule17(s, l); l = rule18(s, l); l = rule19(s, l); l = rule20(s, l); if (l == olen) l = rule21(s, l); // "long list" l = rule22(s, l); final TokenBuilder tb = new TokenBuilder(l << 1); for (int i = 0; i < l; i++) tb.add(s[i]); return tb.finish(); }
/**
 * Returns the length of the codepoint stored at the specified position.
 * The value is computed by {@code Token.cl} on the internal character array.
 *
 * @param pos position
 * @return codepoint length
 */
public int cl(final int pos) {
  final int length = Token.cl(chars, pos);
  return length;
}