Ejemplo n.º 1
0
 /**
  * Advances past any run of white space that begins at {@code pos}.
  *
  * <p>White space is whatever {@code UCharacterProperty.isRuleWhiteSpace} accepts. The string is
  * walked one code point at a time, so a surrogate pair is stepped over as a unit.
  *
  * @param str the text to scan
  * @param pos the index at which scanning starts
  * @return the index of the first non-white-space code point at or after {@code pos}; if no such
  *     code point is found below {@code str.length()}, the index at which the scan stopped
  */
 public static int skipWhitespace(String str, int pos) {
   final int limit = str.length();
   int index = pos;
   while (index < limit) {
     final int codePoint = UTF16.charAt(str, index);
     if (UCharacterProperty.isRuleWhiteSpace(codePoint)) {
       // Still inside the white space run: step over this code point.
       index += UTF16.getCharCount(codePoint);
     } else {
       return index;
     }
   }
   return index;
 }
 /**
  * Returns the code point at the current index and moves past it (post-increment semantics).
  *
  * <p>When the unit returned by <code>next()</code> is a lead surrogate, one more unit is read. If
  * that unit is a trail surrogate, the two are combined and the iterator is left after the pair.
  * Otherwise the result is the same as a plain <code>next()</code>: the extra unit, if one was
  * actually read, is handed back with <code>previous()</code>.
  *
  * @return the next codepoint in text, or DONE if the index is at the limit of the text.
  * @stable ICU 2.4
  */
 public int nextCodePoint() {
   final int first = next();
   if (!UTF16.isLeadSurrogate((char) first)) {
     // Not the start of a pair (this also covers DONE): nothing more to read.
     return first;
   }

   final int second = next();
   if (UTF16.isTrailSurrogate((char) second)) {
     return UCharacterProperty.getRawSupplementary((char) first, (char) second);
   }

   if (second != DONE) {
     // Unmatched lead surrogate: un-read the unit that turned out not to be a trail.
     previous();
   }
   return first;
 }
Ejemplo n.º 3
0
  /**
   * Convert an escape to a 32-bit code point value. We attempt to parallel the icu4c unescapeAt()
   * function.
   *
   * <p>The character following the backslash selects the form, tried in this order:
   *
   * <ul>
   *   <li>{@code u} followed by exactly 4 hex digits;
   *   <li>{@code U} followed by exactly 8 hex digits;
   *   <li>{@code x} followed by 1-2 hex digits, or by 1-8 hex digits enclosed in braces;
   *   <li>an octal digit, optionally followed by up to 2 more octal digits;
   *   <li>a character listed in {@code UNESCAPE_MAP}, which maps to the value paired with it;
   *   <li>{@code c} followed by a character X, which maps to {@code X & 0x1F};
   *   <li>anything else, which stands for itself.
   * </ul>
   *
   * <p>If a numeric escape yields a lead surrogate and is immediately followed by a trail
   * surrogate (written literally or as another escape), the two are joined into one supplementary
   * code point.
   *
   * @param s the string containing the escape
   * @param offset16 an array containing offset to the character <em>after</em> the backslash. Upon
   *     successful return offset16[0] will be updated to point after the escape sequence; on error
   *     it is left unchanged.
   * @return character value from 0 to 10FFFF, or -1 on error (offset out of range, too few digits,
   *     missing closing brace, or a numeric value outside 0..10FFFF).
   */
  public static int unescapeAt(String s, int[] offset16) {
    /* Check that offset is in range */
    int offset = offset16[0];
    final int length = s.length();
    if (offset < 0 || offset >= length) {
      return -1;
    }

    /* Fetch first code point after the backslash */
    int c = UTF16.charAt(s, offset);
    offset += UTF16.getCharCount(c);

    int result = 0;
    int n = 0; // number of digits consumed so far
    int minDig = 0; // stays 0 when the escape is not a numeric form
    int maxDig = 0;
    int bitsPerDigit = 4;
    int dig;
    boolean braces = false;

    /* Classify hexadecimal and octal escapes */
    switch (c) {
      case 'u':
        minDig = maxDig = 4;
        break;
      case 'U':
        minDig = maxDig = 8;
        break;
      case 'x':
        minDig = 1;
        if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
          ++offset;
          braces = true;
          maxDig = 8;
        } else {
          maxDig = 2;
        }
        break;
      default:
        dig = UCharacter.digit(c, 8);
        if (dig >= 0) {
          minDig = 1;
          maxDig = 3;
          n = 1; /* Already have first octal digit */
          bitsPerDigit = 3;
          result = dig;
        }
        break;
    }

    if (minDig != 0) {
      final int radix = (bitsPerDigit == 3) ? 8 : 16;
      while (offset < length && n < maxDig) {
        c = UTF16.charAt(s, offset);
        dig = UCharacter.digit(c, radix);
        if (dig < 0) {
          break;
        }
        result = (result << bitsPerDigit) | dig;
        offset += UTF16.getCharCount(c);
        ++n;
      }
      if (n < minDig) {
        return -1;
      }
      if (braces) {
        // Inspect the character actually at offset rather than the last value read by the loop:
        // when all maxDig digits were consumed the loop variable still holds the final digit,
        // which would wrongly reject a well-formed 8-digit braced escape.
        if (offset >= length || s.charAt(offset) != 0x7D /*}*/) {
          return -1;
        }
        ++offset;
      }
      // At most 8 hex digits are accumulated, so an oversized value shows up either as a
      // negative int or as something beyond the Unicode range.
      if (result < 0 || result >= 0x110000) {
        return -1;
      }
      // If an escape sequence specifies a lead surrogate, see
      // if there is a trail surrogate after it, either as an
      // escape or as a literal.  If so, join them up into a
      // supplementary.
      // The range checks keep the (char) casts from truncating a supplementary value
      // (e.g. 0x1D800) into something that merely looks like a surrogate.
      if (offset < length && result <= 0xFFFF && UTF16.isLeadSurrogate((char) result)) {
        int ahead = offset + 1;
        c = s.charAt(offset); // [sic] get 16-bit code unit
        if (c == '\\' && ahead < length) {
          int[] o = {ahead};
          c = unescapeAt(s, o); // -1 on failure, which is rejected by the range check below
          ahead = o[0];
        }
        if (c >= 0 && c <= 0xFFFF && UTF16.isTrailSurrogate((char) c)) {
          offset = ahead;
          result = UCharacterProperty.getRawSupplementary((char) result, (char) c);
        }
      }
      offset16[0] = offset;
      return result;
    }

    /* Convert C-style escapes in table: entries are (escape char, value) pairs. The early exit
     * below relies on the escape chars appearing in ascending order. */
    for (int i = 0; i < UNESCAPE_MAP.length; i += 2) {
      if (c == UNESCAPE_MAP[i]) {
        offset16[0] = offset;
        return UNESCAPE_MAP[i + 1];
      } else if (c < UNESCAPE_MAP[i]) {
        break;
      }
    }

    /* Map \cX to control-X: X & 0x1F */
    if (c == 'c' && offset < length) {
      c = UTF16.charAt(s, offset);
      offset16[0] = offset + UTF16.getCharCount(c);
      return 0x1F & c;
    }

    /* If no special forms are recognized, then consider
     * the backslash to generically escape the next character. */
    offset16[0] = offset;
    return c;
  }