/** * Given a string, return an array of tokens. The separator can be escaped with the '\' character. * The '\' character may also be escaped by the '\' character. * * @param s the string to tokenize. * @param separator the separator char. * @param maxTokens the maxmimum number of tokens returned. If the max is reached, the remaining * part of s is appended to the end of the last token. * @return an array of tokens. */ public static String[] tokenize(String s, char separator, int maxTokens) { List tokens = new ArrayList(); StringBuilder token = new StringBuilder(); boolean prevIsEscapeChar = false; for (int i = 0; i < s.length(); i += Character.charCount(i)) { int currentChar = s.codePointAt(i); if (prevIsEscapeChar) { // Case 1: escaped character token.appendCodePoint(currentChar); prevIsEscapeChar = false; } else if (currentChar == separator && tokens.size() < maxTokens - 1) { // Case 2: separator tokens.add(token.toString()); token = new StringBuilder(); } else if (currentChar == '\\') { // Case 3: escape character prevIsEscapeChar = true; } else { // Case 4: regular character token.appendCodePoint(currentChar); } } if (token.length() > 0) { tokens.add(token.toString()); } return (String[]) tokens.toArray(new String[] {}); }
/** * Logically casts input to UTF32 ints then looks up the output or null if the input is not * accepted. FST must be INPUT_TYPE.BYTE4. */ public static <T> T get(FST<T> fst, CharSequence input) throws IOException { assert fst.inputType == FST.INPUT_TYPE.BYTE4; // TODO: would be nice not to alloc this on every lookup final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); int charIdx = 0; final int charLimit = input.length(); // Accumulate output as we go final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = NO_OUTPUT; while (charIdx < charLimit) { final int utf32 = Character.codePointAt(input, charIdx); charIdx += Character.charCount(utf32); if (fst.findTargetArc(utf32, arc, arc) == null) { return null; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) { return null; } else if (arc.output != NO_OUTPUT) { return fst.outputs.add(output, arc.output); } else { return output; } }
/** * This method ensures that the output String has only valid XML unicode characters as specified * by the XML 1.0 standard. For reference, please see the standard. This method will return an * empty String if the input is null or empty. * * @author Donoiu Cristian, GPL The String whose non-valid characters we want to remove. in * String, stripped of non-valid characters. */ public static String removeInvalidXMLCharacters(String s) { StringBuilder out = new StringBuilder(); // Used to hold the output. int codePoint; // Used to reference the current character. // String ss = "\ud801\udc00"; // This is actualy one unicode // character, represented by two code units!!!. // System.out.println(ss.codePointCount(0, ss.length()));// See: 1 int i = 0; while (i < s.length()) { // System.out.println("i=" + i); codePoint = s.codePointAt(i); // This is the unicode code of the character. if ((codePoint == 0x9) || // Consider testing larger ranges first to improve speed. (codePoint == 0xA) || (codePoint == 0xD) || ((codePoint >= 0x20) && (codePoint <= 0xD7FF)) || ((codePoint >= 0xE000) && (codePoint <= 0xFFFD)) || ((codePoint >= 0x10000) && (codePoint <= 0x10FFFF))) { out.append(Character.toChars(codePoint)); } i += Character.charCount( codePoint); // Increment with the number of code units(java chars) needed to represent // a Unicode char. } return out.toString(); }