/**
 * Advances past any run of rule white space that begins at {@code pos}.
 *
 * <p>Each step reads the character at the current index with {@code UTF16.charAt}, tests it
 * with {@code UCharacterProperty.isRuleWhiteSpace}, and moves forward by
 * {@code UTF16.getCharCount} of that character.
 *
 * @param str the text to scan
 * @param pos the index at which scanning starts
 * @return the index of the first character at or after {@code pos} that is not rule white
 *     space, or {@code str.length()} if the scan reaches the end of the string
 */
public static int skipWhitespace(String str, int pos) {
    final int limit = str.length();
    int index = pos;
    while (index < limit) {
        final int current = UTF16.charAt(str, index);
        if (UCharacterProperty.isRuleWhiteSpace(current)) {
            // Still inside the white space run: step over this character.
            index += UTF16.getCharCount(current);
        } else {
            return index;
        }
    }
    return index;
}
/** * Returns the code point at index, and increments to the next code point (post-increment * semantics). If index does not point to a valid surrogate pair, the behavior is the same as * <code>next()</code>. Otherwise the iterator is incremented past the surrogate pair, and the * code point represented by the pair is returned. * * @return the next codepoint in text, or DONE if the index is at the limit of the text. * @stable ICU 2.4 */ public int nextCodePoint() { int ch1 = next(); if (UTF16.isLeadSurrogate((char) ch1)) { int ch2 = next(); if (UTF16.isTrailSurrogate((char) ch2)) { return UCharacterProperty.getRawSupplementary((char) ch1, (char) ch2); } else if (ch2 != DONE) { // unmatched surrogate so back out previous(); } } return ch1; }
/** * Convert an escape to a 32-bit code point value. We attempt to parallel the icu4c unescapeAt() * function. * * @param offset16 an array containing offset to the character <em>after</em> the backslash. Upon * return offset16[0] will be updated to point after the escape sequence. * @return character value from 0 to 10FFFF, or -1 on error. */ public static int unescapeAt(String s, int[] offset16) { int c; int result = 0; int n = 0; int minDig = 0; int maxDig = 0; int bitsPerDigit = 4; int dig; int i; boolean braces = false; /* Check that offset is in range */ int offset = offset16[0]; int length = s.length(); if (offset < 0 || offset >= length) { return -1; } /* Fetch first UChar after '\\' */ c = UTF16.charAt(s, offset); offset += UTF16.getCharCount(c); /* Convert hexadecimal and octal escapes */ switch (c) { case 'u': minDig = maxDig = 4; break; case 'U': minDig = maxDig = 8; break; case 'x': minDig = 1; if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) { ++offset; braces = true; maxDig = 8; } else { maxDig = 2; } break; default: dig = UCharacter.digit(c, 8); if (dig >= 0) { minDig = 1; maxDig = 3; n = 1; /* Already have first octal digit */ bitsPerDigit = 3; result = dig; } break; } if (minDig != 0) { while (offset < length && n < maxDig) { c = UTF16.charAt(s, offset); dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); if (dig < 0) { break; } result = (result << bitsPerDigit) | dig; offset += UTF16.getCharCount(c); ++n; } if (n < minDig) { return -1; } if (braces) { if (c != 0x7D /*}*/) { return -1; } ++offset; } if (result < 0 || result >= 0x110000) { return -1; } // If an escape sequence specifies a lead surrogate, see // if there is a trail surrogate after it, either as an // escape or as a literal. If so, join them up into a // supplementary. 
if (offset < length && UTF16.isLeadSurrogate((char) result)) { int ahead = offset + 1; c = s.charAt(offset); // [sic] get 16-bit code unit if (c == '\\' && ahead < length) { int o[] = new int[] {ahead}; c = unescapeAt(s, o); ahead = o[0]; } if (UTF16.isTrailSurrogate((char) c)) { offset = ahead; result = UCharacterProperty.getRawSupplementary((char) result, (char) c); } } offset16[0] = offset; return result; } /* Convert C-style escapes in table */ for (i = 0; i < UNESCAPE_MAP.length; i += 2) { if (c == UNESCAPE_MAP[i]) { offset16[0] = offset; return UNESCAPE_MAP[i + 1]; } else if (c < UNESCAPE_MAP[i]) { break; } } /* Map \cX to control-X: X & 0x1F */ if (c == 'c' && offset < length) { c = UTF16.charAt(s, offset); offset16[0] = offset + UTF16.getCharCount(c); return 0x1F & c; } /* If no special forms are recognized, then consider * the backslash to generically escape the next character. */ offset16[0] = offset; return c; }