/** * Gets the character extended type * * @param ch character to be tested * @return extended type it is associated with */ private static int getType(int ch) { if (UCharacterUtility.isNonCharacter(ch)) { // not a character we return a invalid category count return NON_CHARACTER_; } int result = UCharacter.getType(ch); if (result == UCharacterCategory.SURROGATE) { if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { result = LEAD_SURROGATE_; } else { result = TRAIL_SURROGATE_; } } return result; }
public static CasingType from(String s) { if (s == null || s.length() == 0) { return other; } int cp; // Look for the first meaningful character in the string to determine case. for (int i = 0; i < s.length(); i += Character.charCount(cp)) { cp = s.codePointAt(i); // used to skip the placeholders, but works better to have them be 'other' // if (cp == '{') { // if (placeholder.reset(s).region(i,s.length()).lookingAt()) { // i = placeholder.end() - 1; // skip // continue; // } // } int type = UCharacter.getType(cp); switch (type) { case UCharacter.LOWERCASE_LETTER: return lowercase; case UCharacter.UPPERCASE_LETTER: case UCharacter.TITLECASE_LETTER: return titlecase; // for other letters / numbers / symbols, return other case UCharacter.OTHER_LETTER: case UCharacter.DECIMAL_DIGIT_NUMBER: case UCharacter.LETTER_NUMBER: case UCharacter.OTHER_NUMBER: case UCharacter.MATH_SYMBOL: case UCharacter.CURRENCY_SYMBOL: case UCharacter.MODIFIER_SYMBOL: case UCharacter.OTHER_SYMBOL: return other; // ignore everything else (whitespace, punctuation, etc) and keep going } } return other; }
/** * Set an identifier to analyze. Afterwards, call methods like getScripts() * * @param identifier the identifier to analyze * @return self * @internal * @deprecated This API is ICU internal only. */ @Deprecated public IdentifierInfo setIdentifier(String identifier) { this.identifier = identifier; clear(); BitSet scriptsForCP = new BitSet(); int cp; for (int i = 0; i < identifier.length(); i += Character.charCount(i)) { cp = Character.codePointAt(identifier, i); // Store a representative character for each kind of decimal digit if (UCharacter.getType(cp) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) { // Just store the zero character as a representative for comparison. Unicode guarantees it // is cp - value numerics.add(cp - UCharacter.getNumericValue(cp)); } UScript.getScriptExtensions(cp, scriptsForCP); scriptsForCP.clear(UScript.COMMON); scriptsForCP.clear(UScript.INHERITED); // if (temp.cardinality() == 0) { // // HACK for older version of ICU // requiredScripts.set(UScript.getScript(cp)); // } else switch (scriptsForCP.cardinality()) { case 0: break; case 1: // Single script, record it. requiredScripts.or(scriptsForCP); break; default: if (!requiredScripts.intersects(scriptsForCP) && scriptSetSet.add(scriptsForCP)) { scriptsForCP = new BitSet(); } break; } } // Now make a final pass through to remove alternates that came before singles. // [Kana], [Kana Hira] => [Kana] // This is relatively infrequent, so doesn't have to be optimized. // We also compute any commonalities among the alternates. if (scriptSetSet.size() > 0) { commonAmongAlternates.set(0, UScript.CODE_LIMIT); for (Iterator<BitSet> it = scriptSetSet.iterator(); it.hasNext(); ) { final BitSet next = it.next(); // [Kana], [Kana Hira] => [Kana] if (requiredScripts.intersects(next)) { it.remove(); } else { // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] commonAmongAlternates.and(next); // get the intersection. for (BitSet other : scriptSetSet) { if (next != other && contains(next, other)) { it.remove(); break; } } } } } if (scriptSetSet.size() == 0) { commonAmongAlternates.clear(); } return this; }