Example #1
1
 /**
  * Splits a string into tokens around a separator character. A separator preceded by the '\'
  * character is treated as a literal character, and '\' itself may be escaped with another '\'.
  * Escape characters are consumed and never appear in the returned tokens.
  *
  * <p>The string is processed one Unicode code point at a time, so supplementary characters
  * (surrogate pairs) are copied into the tokens intact.
  *
  * <p>Empty tokens between separators are kept, but an empty final token is omitted. A lone '\'
  * at the very end of the string is dropped.
  *
  * @param s the string to tokenize.
  * @param separator the separator char.
  * @param maxTokens the maximum number of tokens returned. If the max is reached, the remaining
  *     part of s is appended to the end of the last token (escape processing still applies to
  *     it). A value of 1 or less disables splitting altogether.
  * @return an array of tokens.
  */
 public static String[] tokenize(String s, char separator, int maxTokens) {
   List<String> tokens = new ArrayList<String>();
   StringBuilder token = new StringBuilder();
   boolean prevIsEscapeChar = false;
   int i = 0;
   while (i < s.length()) {
     int currentChar = s.codePointAt(i);
     // Advance by the width of the code point just read (1 or 2 chars); stepping by a single
     // char would revisit the low surrogate of a pair and append it a second time.
     i += Character.charCount(currentChar);
     if (prevIsEscapeChar) {
       // Case 1:  escaped character, taken literally
       token.appendCodePoint(currentChar);
       prevIsEscapeChar = false;
     } else if (currentChar == separator && tokens.size() < maxTokens - 1) {
       // Case 2:  separator, only honoured while there is still room for another token
       tokens.add(token.toString());
       token = new StringBuilder();
     } else if (currentChar == '\\') {
       // Case 3:  escape character, affects the next code point
       prevIsEscapeChar = true;
     } else {
       // Case 4:  regular character
       token.appendCodePoint(currentChar);
     }
   }
   if (token.length() > 0) {
     tokens.add(token.toString());
   }
   return tokens.toArray(new String[0]);
 }
Example #2
0
  /**
   * Writes {@code str} to {@code _buf} as UTF-8 bytes followed by a single zero byte.
   *
   * <p>The string is walked one code point at a time and each code point is emitted as a one-,
   * two-, three- or four-byte sequence depending on its magnitude.
   *
   * @param str the string to encode
   * @return the number of bytes written, counting the trailing zero byte
   */
  protected int _put(String str) {
    final int length = str.length();
    int written = 0;
    int pos = 0;

    while (pos < length) {
      final int cp = Character.codePointAt(str, pos);
      // Step over both halves of a surrogate pair when the code point is supplementary.
      pos += Character.charCount(cp);

      if (cp < 0x80) {
        // Single byte: 0xxxxxxx
        _buf.write((byte) cp);
        written++;
        continue;
      }

      if (cp < 0x800) {
        // Two bytes: 110xxxxx 10xxxxxx
        _buf.write((byte) (0xc0 | (cp >> 6)));
        _buf.write((byte) (0x80 | (cp & 0x3f)));
        written += 2;
        continue;
      }

      if (cp < 0x10000) {
        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        _buf.write((byte) (0xe0 | (cp >> 12)));
        _buf.write((byte) (0x80 | ((cp >> 6) & 0x3f)));
        _buf.write((byte) (0x80 | (cp & 0x3f)));
        written += 3;
        continue;
      }

      // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      _buf.write((byte) (0xf0 | (cp >> 18)));
      _buf.write((byte) (0x80 | ((cp >> 12) & 0x3f)));
      _buf.write((byte) (0x80 | ((cp >> 6) & 0x3f)));
      _buf.write((byte) (0x80 | (cp & 0x3f)));
      written += 4;
    }

    // Terminating zero byte, included in the returned count.
    _buf.write((byte) 0);
    return written + 1;
  }
Example #3
0
  /**
   * Looks up the output for {@code input} in {@code fst}, feeding the input to the FST one
   * Unicode code point (UTF-32 value) at a time.
   *
   * <p>The FST must have been built with {@code INPUT_TYPE.BYTE4}; this is checked with an
   * assertion only.
   *
   * @param fst the FST to search
   * @param input the character sequence to look up
   * @return the combined output along the matched path, or {@code null} if the input is not
   *     accepted
   * @throws IOException declared for the underlying FST operations
   */
  public static <T> T get(FST<T> fst, CharSequence input) throws IOException {
    assert fst.inputType == FST.INPUT_TYPE.BYTE4;

    // TODO: would be nice not to alloc this on every lookup
    final FST.Arc<T> scratch = fst.getFirstArc(new FST.Arc<T>());

    final int length = input.length();

    // Outputs are folded together as each arc is followed.
    final T noOutput = fst.outputs.getNoOutput();
    T accumulated = noOutput;

    for (int pos = 0; pos < length; ) {
      final int codePoint = Character.codePointAt(input, pos);
      pos += Character.charCount(codePoint);

      if (fst.findTargetArc(codePoint, scratch, scratch) == null) {
        // No arc for this code point: the input is not accepted.
        return null;
      }
      if (scratch.output != noOutput) {
        accumulated = fst.outputs.add(accumulated, scratch.output);
      }
    }

    // The whole input was consumed; it only counts as accepted if an END_LABEL arc follows.
    if (fst.findTargetArc(FST.END_LABEL, scratch, scratch) == null) {
      return null;
    }
    return scratch.output != noOutput ? fst.outputs.add(accumulated, scratch.output) : accumulated;
  }