/**
 * Appends the expansion of a single rule to {@code builder}.
 *
 * <p>{@code leftHandSide} is parsed as a {@link UnicodeSet} pattern (ignoring spaces) and walked
 * range by range. When {@code rightHandSide} matches {@code NUMERIC_VALUE_PATTERN}, every code
 * point gets its own line of the form {@code XXXX>YYYY # NAME}, where {@code YYYY} is
 * {@code 0x30} plus the code point's numeric value. Otherwise one line per range is written,
 * {@code XXXX>rhs} or {@code XXXX..YYYY>rhs}.
 *
 * <p>If the set contains a string element, an error is printed to stderr and the JVM exits
 * with status 1.
 *
 * @param builder destination for the generated lines
 * @param leftHandSide UnicodeSet pattern naming the source code points
 * @param rightHandSide replacement text, or a numeric-value marker
 */
private static void expandSingleRule(
    StringBuilder builder, String leftHandSide, String rightHandSide)
    throws IllegalArgumentException {
  final UnicodeSet sourceSet = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE);
  final boolean expandNumeric = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches();
  final UnicodeSetIterator ranges = new UnicodeSetIterator(sourceSet);

  while (ranges.nextRange()) {
    if (ranges.codepoint == UnicodeSetIterator.IS_STRING) {
      // Only code point ranges are supported; multi-character strings are fatal.
      System.err.println("ERROR: String '" + ranges.getString() + "' found in UnicodeSet");
      System.exit(1);
    } else if (expandNumeric) {
      // One line per code point, mapping to 0x30 + numeric value, annotated with its name.
      for (int cp = ranges.codepoint; cp <= ranges.codepointEnd; ++cp) {
        builder
            .append(String.format(Locale.ROOT, "%04X", cp))
            .append('>')
            .append(String.format(Locale.ROOT, "%04X", 0x30 + UCharacter.getNumericValue(cp)))
            .append(" # ")
            .append(UCharacter.getName(cp))
            .append("\n");
      }
    } else {
      // One line per range; the "..END" part is only written for multi-code-point ranges.
      builder.append(String.format(Locale.ROOT, "%04X", ranges.codepoint));
      if (ranges.codepointEnd > ranges.codepoint) {
        builder.append("..");
        builder.append(String.format(Locale.ROOT, "%04X", ranges.codepointEnd));
      }
      builder.append('>');
      builder.append(rightHandSide);
      builder.append("\n");
    }
  }
}
// If you don't need any file initialization or postprocessing, you only need this one routine public CheckCLDR handleCheck( String path, String fullPath, String value, Options options, List<CheckStatus> result) { // it helps performance to have a quick reject of most paths if (fullPath == null) return this; // skip paths that we don't have if (fullPath.indexOf("casing") < 0) return this; // pick up the casing attributes from the full path parts.set(fullPath); Case caseTest = Case.mixed; for (int i = 0; i < parts.size(); ++i) { String casingValue = parts.getAttributeValue(i, "casing"); if (casingValue == null) { continue; } caseTest = Case.forString(casingValue); if (caseTest == Case.verbatim) { return this; // we're done } } String newValue = value; switch (caseTest) { case lowercase_words: newValue = UCharacter.toLowerCase(uLocale, value); break; case titlecase_words: newValue = UCharacter.toTitleCase(uLocale, value, null); break; case titlecase_firstword: newValue = TitleCaseFirst(uLocale, value); break; default: break; } if (!newValue.equals(value)) { // the following is how you signal an error or warning (or add a demo....) result.add( new CheckStatus() .setCause(this) .setMainType(CheckStatus.errorType) .setSubtype(Subtype.incorrectCasing) // typically warningType or errorType .setMessage( "Casing incorrect: either should have casing=\"verbatim\" or be <{0}>", new Object[] {newValue})); // the message; can be MessageFormat with arguments } return this; }
/**
 * Trims the source string and collapses every run of whitespace inside it to the first
 * whitespace character of that run.
 *
 * @param module the calling module (not read here)
 * @param source the string to normalize; cast to {@code String}
 * @param args operation arguments (not read here)
 * @param evalEnv evaluation environment (not read here)
 * @return the normalized string
 */
public Object invoke(
    ModuleInstance module, Object source, Object[] args, QvtOperationalEvaluationEnv evalEnv) {
  final String trimmed = ((String) source).trim();
  final StringBuilder collapsed = new StringBuilder(trimmed.length());

  boolean previousWasWhitespace = false;
  for (char current : trimmed.toCharArray()) {
    final boolean whitespace = UCharacter.isWhitespace(current);
    // Drop a whitespace character only when it directly follows another one.
    if (!(whitespace && previousWasWhitespace)) {
      collapsed.append(current);
    }
    previousWasWhitespace = whitespace;
  }
  return collapsed.toString();
}
/**
 * Looks up the name of {@code codePoint} up to 10000 times, storing each result in
 * {@code actualName}, and stops early at the first result that differs from
 * {@code correctName}.
 */
public void run() {
  int remainingLookups = 10000;
  do {
    actualName = UCharacter.getName(codePoint);
    remainingLookups--;
    // The comparison comes first so that it runs after every lookup, including the last one.
  } while (correctName.equals(actualName) && remainingLookups > 0);
}
private static void getNFKCDataFilesFromIcuProject() throws IOException { URL icuTagsURL = new URL(ICU_SVN_TAG_URL + "/"); URL icuReleaseTagURL = new URL(icuTagsURL, ICU_RELEASE_TAG + "/"); URL norm2url = new URL(icuReleaseTagURL, ICU_DATA_NORM2_PATH + "/"); System.err.print("Downloading " + NFKC_TXT + " ... "); download(new URL(norm2url, NFKC_TXT), NFKC_TXT); System.err.println("done."); System.err.print("Downloading " + NFKC_CF_TXT + " ... "); download(new URL(norm2url, NFKC_CF_TXT), NFKC_CF_TXT); System.err.println("done."); System.err.print("Downloading " + NFKC_CF_TXT + " and making diacritic rules one-way ... "); URLConnection connection = openConnection(new URL(norm2url, NFC_TXT)); BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8")); Writer writer = new OutputStreamWriter(new FileOutputStream(NFC_TXT), "UTF-8"); try { String line; while (null != (line = reader.readLine())) { Matcher matcher = ROUND_TRIP_MAPPING_LINE_PATTERN.matcher(line); if (matcher.matches()) { final String leftHandSide = matcher.group(1); final String rightHandSide = matcher.group(2).trim(); List<String> diacritics = new ArrayList<String>(); for (String outputCodePoint : rightHandSide.split("\\s+")) { int ch = Integer.parseInt(outputCodePoint, 16); if (UCharacter.hasBinaryProperty(ch, UProperty.DIACRITIC) // gennorm2 fails if U+0653-U+0656 are included in round-trip mappings || (ch >= 0x653 && ch <= 0x656)) { diacritics.add(outputCodePoint); } } if (!diacritics.isEmpty()) { StringBuilder replacementLine = new StringBuilder(); replacementLine.append(leftHandSide).append(">").append(rightHandSide); replacementLine.append(" # one-way: diacritic"); if (diacritics.size() > 1) { replacementLine.append("s"); } for (String diacritic : diacritics) { replacementLine.append(" ").append(diacritic); } line = replacementLine.toString(); } } writer.write(line); writer.write("\n"); } } finally { reader.close(); writer.close(); } 
System.err.println("done."); }
/**
 * Tests whether the source string consists only of letters and digits and starts with a letter.
 *
 * <p>Returns {@code Boolean.FALSE} when the first argument is null or is the environment's
 * invalid-result marker. An empty source string yields {@code Boolean.TRUE} because no character
 * violates the rule.
 *
 * @param module the calling module (not read here)
 * @param source the string to test; cast to {@code String}
 * @param args operation arguments; only {@code args[0]} is inspected, for null/invalid
 * @param evalEnv evaluation environment used to obtain the invalid-result marker
 * @return {@code Boolean.TRUE} or {@code Boolean.FALSE}
 */
public Object invoke(
    ModuleInstance module, Object source, Object[] args, QvtOperationalEvaluationEnv evalEnv) {
  Object leftVal = args[0];
  // Either condition alone rejects the call. The previous "&&" required the argument to be null
  // AND identical to the invalid marker, which cannot hold unless the marker itself is null.
  if (leftVal == null || leftVal == CallHandlerAdapter.getInvalidResult(evalEnv)) {
    return Boolean.FALSE;
  }

  String self = (String) source;
  for (int i = 0; i < self.length(); i++) {
    char c = self.charAt(i);
    // The first character must be a letter; every character must be a letter or a digit.
    if ((i == 0 && !UCharacter.isLetter(c)) || !UCharacter.isLetterOrDigit(c)) {
      return Boolean.FALSE;
    }
  }
  return Boolean.TRUE;
}
/**
 * Title-cases the first segment of {@code value} as delimited by {@code breaker}, and leaves
 * the remainder of the string untouched.
 *
 * @param locale locale whose casing rules are applied
 * @param value the text to convert; returned as-is when empty
 * @return {@code value} with its first segment title-cased
 */
private String TitleCaseFirst(ULocale locale, String value) {
  if (value.length() == 0) {
    return value;
  }
  // Find the end of the first segment: the boundary after the start of the text.
  breaker.setText(value);
  breaker.first();
  int endOfFirstWord = breaker.next();
  // Use the caller-supplied locale; previously the parameter was ignored in favour of the
  // uLocale field.
  return UCharacter.toTitleCase(locale, value.substring(0, endOfFirstWord), breaker)
      + value.substring(endOfFirstWord);
}
/**
 * Validates a whitespace-separated list of key labels.
 *
 * <p>Each label must be unique, and must be at most one UTF-16 code unit or exactly one
 * surrogate pair.
 *
 * @param literal the attribute value to validate
 * @throws DatatypeException if a label is duplicated or is longer than a single character
 */
@Override
public void checkValid(CharSequence literal) throws DatatypeException {
  final String[] labels = literal.toString().split("\\s+");
  // Sorting puts equal labels next to each other, so duplicates show up as neighbours.
  Arrays.sort(labels);

  String previous = null;
  for (String label : labels) {
    if (previous != null && label.equals(previous)) {
      throw newDatatypeException("Duplicate key label. Each key label must be unique.");
    }

    final int units = label.length();
    // Two code units are acceptable only when they form a surrogate pair.
    final boolean surrogatePair =
        units == 2
            && UCharacter.isHighSurrogate(label.charAt(0))
            && UCharacter.isLowSurrogate(label.charAt(1));
    if (units >= 2 && !surrogatePair) {
      throw newDatatypeException(
          "Key label has multiple characters. Each key label must be a single character.");
    }
    previous = label;
  }
}
/** * Gets the character extended type * * @param ch character to be tested * @return extended type it is associated with */ private static int getType(int ch) { if (UCharacterUtility.isNonCharacter(ch)) { // not a character we return a invalid category count return NON_CHARACTER_; } int result = UCharacter.getType(ch); if (result == UCharacterCategory.SURROGATE) { if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { result = LEAD_SURROGATE_; } else { result = TRAIL_SURROGATE_; } } return result; }
// // Test multi-threaded parallel calls to UCharacter.getName(codePoint) // Regression test for ticket 6264. // public void TestUCharactersGetName() throws InterruptedException { List threads = new LinkedList(); for (int t = 0; t < 20; t++) { int codePoint = 47 + t; String correctName = UCharacter.getName(codePoint); GetNameThread thread = new GetNameThread(codePoint, correctName); thread.start(); threads.add(thread); } ListIterator i = threads.listIterator(); while (i.hasNext()) { GetNameThread thread = (GetNameThread) i.next(); thread.join(); if (!thread.correctName.equals(thread.actualName)) { errln("FAIL, expected \"" + thread.correctName + "\", got \"" + thread.actualName + "\""); } } }
// TODO: only bubblesort around runs of combining marks, instead of the entire text. private void ccReorder(char[] text, int start, int length) { boolean reordered; do { int prevCC = 0; reordered = false; for (int i = start; i < start + length; i++) { final char c = text[i]; final int cc = UCharacter.getCombiningClass(c); if (cc > 0 && cc < prevCC) { // swap text[i] = text[i - 1]; text[i - 1] = c; reordered = true; } else { prevCC = cc; } } } while (reordered == true); }
public static CasingType from(String s) { if (s == null || s.length() == 0) { return other; } int cp; // Look for the first meaningful character in the string to determine case. for (int i = 0; i < s.length(); i += Character.charCount(cp)) { cp = s.codePointAt(i); // used to skip the placeholders, but works better to have them be 'other' // if (cp == '{') { // if (placeholder.reset(s).region(i,s.length()).lookingAt()) { // i = placeholder.end() - 1; // skip // continue; // } // } int type = UCharacter.getType(cp); switch (type) { case UCharacter.LOWERCASE_LETTER: return lowercase; case UCharacter.UPPERCASE_LETTER: case UCharacter.TITLECASE_LETTER: return titlecase; // for other letters / numbers / symbols, return other case UCharacter.OTHER_LETTER: case UCharacter.DECIMAL_DIGIT_NUMBER: case UCharacter.LETTER_NUMBER: case UCharacter.OTHER_NUMBER: case UCharacter.MATH_SYMBOL: case UCharacter.CURRENCY_SYMBOL: case UCharacter.MODIFIER_SYMBOL: case UCharacter.OTHER_SYMBOL: return other; // ignore everything else (whitespace, punctuation, etc) and keep going } } return other; }
/** * Set an identifier to analyze. Afterwards, call methods like getScripts() * * @param identifier the identifier to analyze * @return self * @internal * @deprecated This API is ICU internal only. */ @Deprecated public IdentifierInfo setIdentifier(String identifier) { this.identifier = identifier; clear(); BitSet scriptsForCP = new BitSet(); int cp; for (int i = 0; i < identifier.length(); i += Character.charCount(i)) { cp = Character.codePointAt(identifier, i); // Store a representative character for each kind of decimal digit if (UCharacter.getType(cp) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) { // Just store the zero character as a representative for comparison. Unicode guarantees it // is cp - value numerics.add(cp - UCharacter.getNumericValue(cp)); } UScript.getScriptExtensions(cp, scriptsForCP); scriptsForCP.clear(UScript.COMMON); scriptsForCP.clear(UScript.INHERITED); // if (temp.cardinality() == 0) { // // HACK for older version of ICU // requiredScripts.set(UScript.getScript(cp)); // } else switch (scriptsForCP.cardinality()) { case 0: break; case 1: // Single script, record it. requiredScripts.or(scriptsForCP); break; default: if (!requiredScripts.intersects(scriptsForCP) && scriptSetSet.add(scriptsForCP)) { scriptsForCP = new BitSet(); } break; } } // Now make a final pass through to remove alternates that came before singles. // [Kana], [Kana Hira] => [Kana] // This is relatively infrequent, so doesn't have to be optimized. // We also compute any commonalities among the alternates. if (scriptSetSet.size() > 0) { commonAmongAlternates.set(0, UScript.CODE_LIMIT); for (Iterator<BitSet> it = scriptSetSet.iterator(); it.hasNext(); ) { final BitSet next = it.next(); // [Kana], [Kana Hira] => [Kana] if (requiredScripts.intersects(next)) { it.remove(); } else { // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] commonAmongAlternates.and(next); // get the intersection. 
for (BitSet other : scriptSetSet) { if (next != other && contains(next, other)) { it.remove(); break; } } } } } if (scriptSetSet.size() == 0) { commonAmongAlternates.clear(); } return this; }