/** * Given a string, return an array of tokens. The separator can be escaped with the '\' character. * The '\' character may also be escaped by the '\' character. * * @param s the string to tokenize. * @param separator the separator char. * @param maxTokens the maxmimum number of tokens returned. If the max is reached, the remaining * part of s is appended to the end of the last token. * @return an array of tokens. */ public static String[] tokenize(String s, char separator, int maxTokens) { List tokens = new ArrayList(); StringBuilder token = new StringBuilder(); boolean prevIsEscapeChar = false; for (int i = 0; i < s.length(); i += Character.charCount(i)) { int currentChar = s.codePointAt(i); if (prevIsEscapeChar) { // Case 1: escaped character token.appendCodePoint(currentChar); prevIsEscapeChar = false; } else if (currentChar == separator && tokens.size() < maxTokens - 1) { // Case 2: separator tokens.add(token.toString()); token = new StringBuilder(); } else if (currentChar == '\\') { // Case 3: escape character prevIsEscapeChar = true; } else { // Case 4: regular character token.appendCodePoint(currentChar); } } if (token.length() > 0) { tokens.add(token.toString()); } return (String[]) tokens.toArray(new String[] {}); }
/**
 * Writes the given string to the buffer as UTF-8 encoded bytes followed by a single zero byte.
 *
 * @param str the string to encode.
 * @return the number of bytes written, including the trailing zero byte.
 */
protected int _put(String str) {
  final int length = str.length();
  int written = 0;
  int pos = 0;

  while (pos < length) {
    final int cp = Character.codePointAt(str, pos);
    // Step over one or two chars depending on whether this was a surrogate pair.
    pos += Character.charCount(cp);

    if (cp < 0x80) {
      // One byte: 0xxxxxxx
      _buf.write((byte) cp);
      written += 1;
      continue;
    }

    if (cp < 0x800) {
      // Two bytes: 110xxxxx 10xxxxxx
      _buf.write((byte) (0xc0 + (cp >> 6)));
      _buf.write((byte) (0x80 + (cp & 0x3f)));
      written += 2;
      continue;
    }

    if (cp < 0x10000) {
      // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
      _buf.write((byte) (0xe0 + (cp >> 12)));
      _buf.write((byte) (0x80 + ((cp >> 6) & 0x3f)));
      _buf.write((byte) (0x80 + (cp & 0x3f)));
      written += 3;
      continue;
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    _buf.write((byte) (0xf0 + (cp >> 18)));
    _buf.write((byte) (0x80 + ((cp >> 12) & 0x3f)));
    _buf.write((byte) (0x80 + ((cp >> 6) & 0x3f)));
    _buf.write((byte) (0x80 + (cp & 0x3f)));
    written += 4;
  }

  // Terminating zero byte, counted in the returned total.
  _buf.write((byte) 0);
  return written + 1;
}
/** * Logically casts input to UTF32 ints then looks up the output or null if the input is not * accepted. FST must be INPUT_TYPE.BYTE4. */ public static <T> T get(FST<T> fst, CharSequence input) throws IOException { assert fst.inputType == FST.INPUT_TYPE.BYTE4; // TODO: would be nice not to alloc this on every lookup final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); int charIdx = 0; final int charLimit = input.length(); // Accumulate output as we go final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = NO_OUTPUT; while (charIdx < charLimit) { final int utf32 = Character.codePointAt(input, charIdx); charIdx += Character.charCount(utf32); if (fst.findTargetArc(utf32, arc, arc) == null) { return null; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) { return null; } else if (arc.output != NO_OUTPUT) { return fst.outputs.add(output, arc.output); } else { return output; } }