예제 #1
1
 /**
  * Splits a string into tokens around a separator character.
  *
  * <p>The backslash is the escape character: the code point that follows a backslash is copied
  * into the current token literally, so an escaped separator does not split and an escaped
  * backslash yields a single backslash. A lone backslash at the very end of the input is dropped.
  *
  * <p>Empty tokens between two adjacent separators are kept, but an empty final token is omitted
  * (so an empty input yields an empty array).
  *
  * @param s the string to tokenize.
  * @param separator the separator char.
  * @param maxTokens the maximum number of tokens returned. Once {@code maxTokens - 1} tokens have
  *     been produced, further separators are treated as regular characters, so the remainder of
  *     {@code s} ends up in the last token (escape sequences in the remainder are still resolved).
  *     A value of 1 or less disables splitting altogether.
  * @return an array of tokens.
  */
 public static String[] tokenize(String s, char separator, int maxTokens) {
   List<String> tokens = new ArrayList<String>();
   StringBuilder token = new StringBuilder();
   boolean prevIsEscapeChar = false;
   int i = 0;
   while (i < s.length()) {
     int currentChar = s.codePointAt(i);
     // Step by the width of the code point just read (1 or 2 chars). Stepping by the width of
     // the index instead would revisit the low surrogate of every supplementary character and
     // append it a second time.
     i += Character.charCount(currentChar);
     if (prevIsEscapeChar) {
       // Case 1:  escaped character, always taken literally
       token.appendCodePoint(currentChar);
       prevIsEscapeChar = false;
     } else if (currentChar == separator && tokens.size() < maxTokens - 1) {
       // Case 2:  separator, and there is still room for another token after this one
       tokens.add(token.toString());
       token = new StringBuilder();
     } else if (currentChar == '\\') {
       // Case 3:  escape character, consumed without being emitted
       prevIsEscapeChar = true;
     } else {
       // Case 4:  regular character (including separators once the token limit is reached)
       token.appendCodePoint(currentChar);
     }
   }
   if (token.length() > 0) {
     tokens.add(token.toString());
   }
   return tokens.toArray(new String[0]);
 }
예제 #2
0
  /**
   * Looks up the output the FST associates with {@code input}, feeding the input to the FST one
   * Unicode code point (UTF-32 value) at a time and finishing with {@code FST.END_LABEL}.
   *
   * <p>The FST must be of type {@code INPUT_TYPE.BYTE4}; this is only checked by an assertion.
   *
   * @param fst the transducer to query.
   * @param input the character sequence to look up.
   * @return the outputs collected along the matched path, combined via {@code fst.outputs.add},
   *     or {@code null} if some code point or the end label has no matching arc.
   * @throws IOException declared for the FST calls made here.
   */
  public static <T> T get(FST<T> fst, CharSequence input) throws IOException {
    assert fst.inputType == FST.INPUT_TYPE.BYTE4;

    // TODO: would be nice not to alloc this on every lookup
    final FST.Arc<T> cursor = fst.getFirstArc(new FST.Arc<T>());
    final int length = input.length();

    // Sentinel for "no output"; compared by identity below, as in the rest of this method.
    final T noOutput = fst.outputs.getNoOutput();
    T accumulated = noOutput;

    for (int pos = 0; pos < length; ) {
      final int codePoint = Character.codePointAt(input, pos);
      pos += Character.charCount(codePoint);

      // The arc is followed in place: cursor is both the source and the destination.
      if (fst.findTargetArc(codePoint, cursor, cursor) == null) {
        return null;
      }
      if (cursor.output != noOutput) {
        accumulated = fst.outputs.add(accumulated, cursor.output);
      }
    }

    // The input is accepted only if an END_LABEL arc leaves the node we stopped at.
    if (fst.findTargetArc(FST.END_LABEL, cursor, cursor) == null) {
      return null;
    }
    if (cursor.output != noOutput) {
      return fst.outputs.add(accumulated, cursor.output);
    }
    return accumulated;
  }
예제 #3
0
파일: Utils.java 프로젝트: mkolod/pdfxtk
 /**
  * Returns a copy of the input that contains only characters permitted by the XML 1.0 standard.
  *
  * <p>The characters kept are tab (0x9), line feed (0xA), carriage return (0xD) and the ranges
  * 0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF. Everything else is removed, including other
  * control characters, unpaired surrogates, 0xFFFE and 0xFFFF.
  *
  * @author Donoiu Cristian, GPL
  * @param s the String whose non-valid characters we want to remove; may be null.
  * @return the input stripped of non-valid characters, or an empty String if the input is null
  *     or empty.
  */
 public static String removeInvalidXMLCharacters(String s) {
   if (s == null) {
     // Documented contract: null input yields an empty String rather than an exception.
     return "";
   }
   StringBuilder out = new StringBuilder(s.length());
   int i = 0;
   while (i < s.length()) {
     // Work on whole code points: one Unicode character may occupy two Java chars.
     int codePoint = s.codePointAt(i);
     if ((codePoint == 0x9)
         || (codePoint == 0xA)
         || (codePoint == 0xD)
         || ((codePoint >= 0x20) && (codePoint <= 0xD7FF))
         || ((codePoint >= 0xE000) && (codePoint <= 0xFFFD))
         || ((codePoint >= 0x10000) && (codePoint <= 0x10FFFF))) {
       out.appendCodePoint(codePoint);
     }
     // Advance by the number of Java chars this code point occupies.
     i += Character.charCount(codePoint);
   }
   return out.toString();
 }