public void TestJamo() throws IOException { { // CLDRTransforms.registerCldrTransforms(null, ".*(Jamo).*", out); String name = "Latin-ConjoiningJamo"; Transliterator fromLatin = Transliterator.getInstance(name); Transliterator toLatin = Transliterator.getInstance(name, Transliterator.REVERSE); UnicodeSet sourceSet = getRepresentativeHangul(); logln(sourceSet.size() + "\t" + sourceSet.toPattern(false)); Transliterator nfd = Transliterator.getInstance("nfd"); UnicodeSet multiply = new UnicodeSet(sourceSet); // for (UnicodeSetIterator it = new UnicodeSetIterator(sourceSet); it.next();) { // for (UnicodeSetIterator it2 = new UnicodeSetIterator(sourceSet); it2.next();) { // String source1 = it.getString() + it2.getString(); // try all combinations. // multiply.add(source1); // } // } // latin.addAll(toTarget.getSourceSet()) // .addAll(toTarget.getTargetSet()) // .addAll(fromTarget.getSourceSet()) // .addAll(fromTarget.getTargetSet()); // latin.retainAll(new UnicodeSet("[[:latin:][:common:][:inherited:]]")); // Transliterator.DEBUG = true; UnicodeSet specials = null; // new UnicodeSet("[{ch}]"); writeFile(name, multiply, nfd, toLatin, fromLatin, true, null, specials); } }
public File[] getJpegList(String userId, String slideName) { // ここ直してください。 String appRootPath = new PropertiesComponent().referProperties("appRootPath"); // String appRootPath = "C:/Users/tanese kenta/awaretweet/"; // 探索するパス String basePath = appRootPath + "slide/" + userId + "/" + slideName + "/"; File file = new File(basePath); File[] files = file.listFiles(); for (int i = 0; i < files.length; i++) { if (files[i].getName().startsWith(".")) { files[i].delete(); return null; } else { Path src = Paths.get(basePath + files[i].getName()); Transliterator transliterator = Transliterator.getInstance("Katakana-Latin"); String reName = transliterator.transliterate(files[i].getName()); Path srcRename = Paths.get(basePath + reName); try { Files.move(src, srcRename); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } File[] renameFiles = new File(basePath).listFiles(); Arrays.sort(renameFiles, new FileSort()); return renameFiles; }
public static void main(String[] args) { int count = 0; Enumeration<String> targets = Transliterator.getAvailableIDs(); while (targets.hasMoreElements()) { String s = (String) targets.nextElement(); System.out.println(s); count++; } System.out.println("number " + count); // System.exit(0); // trans = Transliterator.getInstance("Any-en_US; nfd; // [\u0301\u0302\u0304\u0306\u0307\u0308\u030c\u0328] remove; nfc"); // [:nonspacing mark:] // remove; nfc"); trans = Transliterator.getInstance("Any-Latin"); // [:nonspacing mark:] remove; nfc"); decomposed = Transliterator.getInstance("Any-Latin; nfd"); // [:nonspacing mark:] remove; nfc"); for (int row = 0; row < 256; row++) { String name = String.format("row%02x.trans", row); PrintWriter out = null; try { out = new PrintWriter(new FileWriter(name)); printRow(out, row); } catch (IOException e) { System.out.println("Could not open " + name + " for write"); } catch (UselessException e) { // System.out.println("Deleting " + name); File f = new File(name); f.delete(); } finally { Utils.closeFile(out); } } }
/** * Clean string. * * @param str the str * @return the string */ public static String cleanString(String str) { Transliterator accentsconverter = Transliterator.getInstance("Latin; NFD; [:Nonspacing Mark:] Remove; NFC;"); str = accentsconverter.transliterate(str); // the character ? seems to not be changed to d by the transliterate // function StringBuffer cleanedStr = new StringBuffer(str.trim()); // delete special character for (int i = 0; i < cleanedStr.length(); i++) { char c = cleanedStr.charAt(i); if (c == ' ') { if (i > 0 && cleanedStr.charAt(i - 1) == '-') { cleanedStr.deleteCharAt(i--); } else { c = '-'; cleanedStr.setCharAt(i, c); } continue; } if (i > 0 && !(Character.isLetterOrDigit(c) || c == '-')) { cleanedStr.deleteCharAt(i--); continue; } if (i > 0 && c == '-' && cleanedStr.charAt(i - 1) == '-') cleanedStr.deleteCharAt(i--); } return cleanedStr.toString().toLowerCase(); }
/**
 * Asserts that {@code source} survives {@code hl} followed by {@code lh}.
 *
 * <p>The assertion message also shows the {@code hl} output for the source and for the
 * round-tripped text when a space is inserted after every character, to help diagnose
 * failures.
 *
 * @param message prefix for the assertion message
 * @param source text to round-trip
 * @param lh transform applied second (back to the source script)
 * @param hl transform applied first
 */
private void assertRoundTripTransform(
        String message, String source, Transliterator lh, Transliterator hl) {
    final String forward = hl.transform(source);
    final String roundTripped = lh.transform(forward);

    final String spacedSource = source.replaceAll("(.)", "$1 ").trim();
    final String forwardOfSpacedSource = hl.transform(spacedSource);

    final String spacedRoundTrip = roundTripped.replaceAll("(.)", "$1 ").trim();
    final String forwardOfSpacedRoundTrip = hl.transform(spacedRoundTrip);

    final String label =
            message
                    + " "
                    + source
                    + " ["
                    + forward
                    + "/"
                    + forwardOfSpacedSource
                    + "/"
                    + forwardOfSpacedRoundTrip
                    + "]";
    assertEquals(label, source, roundTripped);
}
private String normalizeToken(final String searchToken) { if (TransliteratorManager.init(null)) { final Transliterator normalizer = normalizer(); return normalizer.transliterate(searchToken); } else { // Do our best since the Transliterators aren't up yet. return searchToken.toLowerCase(); } }
public void TestHangul2() { // CLDRTransforms.registerCldrTransforms(null, ".*(Hangul|Jamo).*", out); Transliterator lh = Transliterator.getInstance("Latin-Hangul"); Transliterator hl = lh.getInverse(); // assertRoundTripTransform("Transform", "\uAC0D\uD0C0", lh, hl); // assertRoundTripTransform("Transform", "\uAC0D\uB530", lh, hl); final UnicodeSet representativeHangul = getRepresentativeHangul(); for (UnicodeSetIterator it = new UnicodeSetIterator(representativeHangul); it.next(); ) { assertRoundTripTransform("Transform", it.getString(), lh, hl); } assertTransform("Transform", "츠", lh, "ch"); assertTransform("Transform", "아따", lh, hl, "atta", "a-tta"); assertTransform("Transform", "아빠", lh, hl, "appa", "a-ppa"); assertTransform("Transform", "아짜", lh, hl, "ajja", "a-jja"); assertTransform("Transform", "아까", lh, hl, "akka", "a-kka"); assertTransform("Transform", "아싸", lh, hl, "assa", "a-ssa"); assertTransform("Transform", "아차", lh, hl, "acha", "a-cha"); assertTransform("Transform", "악사", lh, hl, "agsa", "ag-sa"); assertTransform("Transform", "안자", lh, hl, "anja", "an-ja"); assertTransform("Transform", "안하", lh, hl, "anha", "an-ha"); assertTransform("Transform", "알가", lh, hl, "alga", "al-ga"); assertTransform("Transform", "알마", lh, hl, "alma", "al-ma"); assertTransform("Transform", "알바", lh, hl, "alba", "al-ba"); assertTransform("Transform", "알사", lh, hl, "alsa", "al-sa"); assertTransform("Transform", "알타", lh, hl, "alta", "al-ta"); assertTransform("Transform", "알파", lh, hl, "alpa", "al-pa"); assertTransform("Transform", "알하", lh, hl, "alha", "al-ha"); assertTransform("Transform", "압사", lh, hl, "absa", "ab-sa"); assertTransform("Transform", "안가", lh, hl, "anga", "an-ga"); assertTransform("Transform", "악싸", lh, hl, "agssa", "ag-ssa"); assertTransform("Transform", "안짜", lh, hl, "anjja", "an-jja"); assertTransform("Transform", "알싸", lh, hl, "alssa", "al-ssa"); assertTransform("Transform", "알따", lh, hl, "altta", "al-tta"); assertTransform("Transform", "알빠", lh, hl, 
"alppa", "al-ppa"); assertTransform("Transform", "압싸", lh, hl, "abssa", "ab-ssa"); assertTransform("Transform", "앆카", lh, hl, "akkka", "akk-ka"); assertTransform("Transform", "았사", lh, hl, "asssa", "ass-sa"); // 1. Latin->Hangul transliterator maps 'ch' to '킇' (splitting the sequence // into // 'c' and 'h' and inserting an implicit vowel 'ㅡ'). It'd be better to map a // *stand-alone* 'ch' to '츠' // // 2. As mentioned in http://www.unicode.org/cldr/transliteration_guidelines.html // (Korean section), // // - altta = alt-ta 앑타 should be ' al-tta 알따' // // - alppa = alp-pa 앒파 : should be 'al-ppa 알빠' }
/**
 * Walks every string of the {@code latin} set, converts it with {@code fromLatin}, and
 * reports each case whose converted form still contains characters from {@code latin}.
 *
 * @param out destination passed to {@code showItems}
 * @param fromLatin transliterator applied to each Latin source string
 * @param toLatin transliterator applied to the converted text
 * @return the sum of the values returned by {@code showItems}
 */
private static int checkLatin(
        PrintWriter out, Transliterator fromLatin, Transliterator toLatin) {
    int failures = 0;
    final UnicodeSetIterator cursor = new UnicodeSetIterator(latin);
    while (cursor.next()) {
        final String original = cursor.getString();
        final String converted = fromLatin.transliterate(original);
        if (!latin.containsSome(converted)) {
            continue; // fully converted, nothing to report
        }
        final String reverted = toLatin.transliterate(converted);
        // NOTE(review): toLatin is applied a second time to its own output here; confirm
        // whether fromLatin was intended.
        final String revertedAgain = toLatin.transliterate(reverted);
        failures += showItems(out, false, original, converted, reverted, revertedAgain);
    }
    return failures;
}
/**
 * Creates the factory from index settings.
 *
 * <p>Reads the transform id from the "id" setting (default "Null") and the direction from
 * the "dir" setting (default "forward"); any value other than "forward" selects the reverse
 * direction. The ICU transliterator is built once here.
 *
 * @param indexSettings passed to the superclass
 * @param environment not used by this constructor
 * @param name passed to the superclass
 * @param settings source of the "id" and "dir" values
 */
public IcuTransformTokenFilterFactory(
        IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.id = settings.get("id", "Null");

    final String direction = settings.get("dir", "forward");
    if ("forward".equals(direction)) {
        this.dir = Transliterator.FORWARD;
    } else {
        this.dir = Transliterator.REVERSE;
    }

    this.transliterator = Transliterator.getInstance(id, dir);
}
public void xTestTamil() throws IOException { { // CLDRTransforms.registerCldrTransforms(null, ".*(Tamil).*", out); String name = "Tamil-Devanagari"; Transliterator tamil_devanagari = Transliterator.getInstance(name); Transliterator devanagari_tamil = Transliterator.getInstance(name, Transliterator.REVERSE); writeFile( name, new UnicodeSet("[[:block=tamil:]-[ௗ]]"), null, tamil_devanagari, devanagari_tamil, false, null, null); } }
private static int checkString( PrintWriter out, String source1, Transliterator nfd, Transliterator fromLatin, Transliterator toLatin, int errorCount, String separator) { String source = nfd == null ? source1 : nfd.transliterate(source1); String to = toLatin.transliterate(source); String from = fromLatin.transliterate(to); if (separator == null) { final boolean bad = !source.equals(from); if (bad || verbose) { String backto = toLatin.transliterate(from); if (bad) { errorCount += 1; } showItems(out, false, source, to, from, backto, bad ? "FAIL" : null); } else { // showItems(out, source, to, from, "OK"); } } else { if (to.contains(separator)) { // check separators, only put in when needed String otherTo = to.replace("-", ""); String otherFrom = fromLatin.transliterate(otherTo); final boolean bad = otherFrom.equals(from); if (bad) { // String backto = toLatin.transliterate(from); errorCount += 1; showItems(out, false, source, to, from, otherTo, otherFrom, bad ? "FAIL" : null); } } } return errorCount; }
private static void printRow(PrintWriter out, int row) throws UselessException { out.println("#"); out.println("# This is a table for transliterating characters."); out.println("# It was created using icu4j"); out.println("#"); out.println("# All resulting strings that contained characters outside the"); out.println("# range of iso 8859-1 are commented out"); out.println("#"); out.println(); int count = 0; for (int i = 0; i < 256; i++) { char c = (char) ((row << 8) + i); String single = "" + c; String result = trans.transliterate(single); if (result.length() == 1 && result.charAt(0) == c) result = "?"; else count++; boolean inRange = true; for (char rc : result.toCharArray()) { if (rc > 0xff) { // System.out.printf("out of range result %c for row %d\n", rc, row); inRange = false; break; } } if (!inRange) { count--; out.print("#"); } out.format("U+%02x%02x %-12.12s # Character %s", row, i, result, single); // if (!inRange) { // String s = decomposed.transliterate(single); // out.format(", %s", s); // for (char rc : s.toCharArray()) { // out.format(" %04x", (int) rc); // } // } out.println(); } if (count == 0) throw new UselessException(); }
public void TestChinese() { // CLDRTransforms.registerCldrTransforms(null, ".*(Han|Pinyin).*", out); Transliterator hanLatin = Transliterator.getInstance("Han-Latin"); assertTransform("Transform", "zào Unicode", hanLatin, "造Unicode"); assertTransform("Transform", "zài chuàng zào Unicode zhī qián", hanLatin, "在創造Unicode之前"); }
/**
 * Prints the ID of every available transliterator, one per line.
 *
 * @param args unused
 */
public static void main(String[] args) {
    /*x ShowTransliterations.1 */
    for (Enumeration<String> ids = Transliterator.getAvailableIDs(); ids.hasMoreElements(); ) {
        System.out.println(ids.nextElement());
    }
    /*x*/
}
/**
 * Returns the normalizing transliterator, building it from {@code normalizerRules} on the
 * first call. Creation is deferred because it can be slow; the method is synchronized.
 *
 * @return the cached transliterator
 */
public synchronized Transliterator normalizer() {
    if (normalizer != null) {
        return normalizer;
    }
    normalizer = Transliterator.createFromRules("", normalizerRules, Transliterator.FORWARD);
    return normalizer;
}
/** Class for detecting and converting Zawgyi-encoded data. */ public class MyanmarZawgyiConverter { // For detecting if Myanmar text is encoded with Zawgyi vs. Unicode characters. private static final Pattern ZAWGYI_DETECT_PATTERN = PatternCache.get( // A regular expression matched if text is Zawgyi encoding. // Using the ranges 1033-1034 or 1060-1097 will report Shan, Karen, // etc. as Zawgyi. "[\u105a\u1060-\u1097]|" // Zawgyi characters outside Unicode range + "[\u1033\u1034]|" // These are Mon characters + "\u1031\u108f|" + "\u1031[\u103b-\u103e]|" // Medial right after \u1031 + "[\u102b-\u1030\u1032]\u1031|" // Vowel sign right after before \u1031 + " \u1031| \u103b|" // Unexpected characters after a space + "^\u1031|^\u103b|\u1038\u103b|\u1038\u1031|" + "[\u102d\u102e\u1032]\u103b|\u1039[^\u1000-\u1021]|\u1039$" + "|\u1004\u1039[\u1001-\u102a\u103f\u104e]" // Missing ASAT in Kinzi + "|\u1039[^u1000-\u102a\u103f\u104e]" // 1039 not before a consonant // Out of order medials + "|\u103c\u103b|\u103d\u103b" + "|\u103e\u103b|\u103d\u103c" + "|\u103e\u103c|\u103e\u103d" // Bad medial combos + "|\u103b\u103c" // Out of order vowel signs + "|[\u102f\u1030\u102b\u102c][\u102d\u102e\u1032]" + "|[\u102b\u102c][\u102f\u102c]" // Digit before diacritic + "|[\u1040-\u1049][\u102b-\u103e\u102b-\u1030\u1032\u1036\u1037\u1038\u103a]" // Single digit 0, 7 at start + "|^[\u1040\u1047][^\u1040-\u1049]" // Second 1039 with bad followers + "|[\u1000-\u102a\u103f\u104e]\u1039[\u101a\u101b\u101d\u101f\u1022-\u103f]" // Other bad combos. 
+ "|\u103a\u103e" + "|\u1036\u102b]" // multiple upper vowels + "|\u102d[\u102e\u1032]|\u102e[\u102d\u1032]|\u1032[\u102d\u102e]" // Multiple lower vowels + "|\u102f\u1030|\u1030\u102f" // Multiple A vowels + "|\u102b\u102c|\u102c\u102b" // Shan digits with vowels or medials or other signs + "|[\u1090-\u1099][\u102b-\u1030\u1032\u1037\u103a-\u103e]" // Isolated Shan digit + "|[\u1000-\u10f4][\u1090-\u1099][\u1000-\u104f]" + "|^[\u1090-\u1099][\u1000-\u102a\u103f\u104e\u104a\u104b]" + "|[\u1000-\u104f][\u1090-\u1099]$" // Diacritics with non-Burmese vowel signs + "|[\u105e-\u1060\u1062-\u1064\u1067-\u106d\u1071-\u1074\u1082-\u108d" + "\u108f\u109a-\u109d]" + "[\u102b-\u103e]" // Consonant 103a + some vowel signs + "|[\u1000-\u102a]\u103a[\u102d\u102e\u1032]" // 1031 after other vowel signs + "|[\u102b-\u1030\u1032\u1036-\u1038\u103a]\u1031" // Using Shan combining characters with other languages. + "|[\u1087-\u108d][\u106e-\u1070\u1072-\u1074]" // Non-Burmese diacritics at start, following space, or following sections + "|^[\u105e-\u1060\u1062-\u1064\u1067-\u106d\u1071-\u1074" + "\u1082-\u108d\u108f\u109a-\u109d]" + "|[\u0020\u104a\u104b][\u105e-\u1060\u1062-\u1064\u1067-\u106d" + "\u1071-\u1074\u1082-\u108d\u108f\u109a-\u109d]" // Wrong order with 1036 + "|[\u1036\u103a][\u102d-\u1030\u1032]" // Odd stacking + "|[\u1025\u100a]\u1039" // More mixing of non-Burmese languages + "|[\u108e-\u108f][\u1050-\u108d]" // Bad diacritic combos. 
+ "|\u102d-\u1030\u1032\u1036-\u1037]\u1039]" // Dot before subscripted consonant + "|[\u1000-\u102a\u103f\u104e]\u1037\u1039" // Odd subscript + vowel signs + "|[\u1000-\u102a\u103f\u104e]\u102c\u1039[\u1000-\u102a\u103f\u104e]" // Medials after vowels + "|[\u102b-\u1030\u1032][\u103b-\u103e]" // Medials + "|\u1032[\u103b-\u103e]" // Medial with 101b + "|\u101b\u103c" // Stacking too deeply: consonant 1039 consonant 1039 consonant + "|[\u1000-\u102a\u103f\u104e]\u1039[\u1000-\u102a\u103f\u104e]\u1039" + "[\u1000-\u102a\u103f\u104e]" // Stacking pattern consonant 1039 consonant 103a other vowel signs + "|[\u1000-\u102a\u103f\u104e]\u1039[\u1000-\u102a\u103f\u104e]" + "[\u102b\u1032\u103d]" // Odd stacking over u1021, u1019, and u1000 + "|[\u1000\u1005\u100f\u1010\u1012\u1014\u1015\u1019\u101a]\u1039\u1021" + "|[\u1000\u1010]\u1039\u1019" + "|\u1004\u1039\u1000" + "|\u1015\u1039[\u101a\u101e]" + "|\u1000\u1039\u1001\u1036" + "|\u1039\u1011\u1032" // Vowel sign in wrong order + "|\u1037\u1032" + "|\u1036\u103b" // Duplicated vowel + "|\u102f\u102f"); // Transliteration to convert Burmese text in Zawgyi-encoded string to // standard Unicode codepoints and ordering. static final Transform<String, String> zawgyiUnicodeTransliterator = // Transliteration rules, 07-Jan-2014. Transliterator.createFromRules( "zawgyi-unicode", // Modern Burmese digits & Unicode code points. 
"$nondigits = [^\u1040-\u1049];" + "$space = ' ';" + "$consonant = [\u1000-\u1021];" + "$vowelsign = [\u102B-\u1030\u1032];" + "$umedial = [\u103B-\u103E];" + "$vowelmedial = [\u102B-\u1030\u1032\u103B-\u103F];" + "$ukinzi = \u1004\u103A\u1039;" + "$zmedialra = [\u103B\u107E-\u1084];" // #### STAGE (1): CODEPOINT MAPPING FROM ZAWGYI TO UNICODE + "($consonant) \u103A \u1064 > $ukinzi $1 \u103B;" + "($consonant) \u1064 > $ukinzi $1;" + "\u1064 > $ukinzi;" + "($consonant) \u108b > $ukinzi $1 \u102D;" + "($consonant) \u108C > $ukinzi $1 \u102E;" + "($consonant) \u108D > $ukinzi $1 \u1036;" + "($consonant) \u103A \u1033 \u108B > $ukinzi $1 \u103B \u102D \u102F;" + "($consonant) \u103A \u108b > $ukinzi $1 \u103B \u102D ;" + "($consonant) \u103A \u108C \u1033 > $ukinzi $1 \u103B \u102E \u102F;" + "($consonant) \u103A \u108C > $ukinzi $1 \u103B \u102E ;" + "($consonant) \u103A \u108D > $ukinzi $1 \u103B \u1036 ;" + "($consonant) \u103A \u108e > $1 \u103B \u102D \u1036 ;" + "\u108B > $ukinzi \u102D ;" + "\u108C > $ukinzi \u102E ;" + "\u108D > $ukinzi \u1036 ;" + "\u106A ($vowelsign) \u1038 > \u1025 $1 \u1038 ;" + "\u106A > \u1009 ;" + "\u106B > \u100A ;" + "\u108F > \u1014 ;" + "\u1090 > \u101B ;" + "\u1086 > \u103F ;" + "\u103A > \u103B ;" + "\u107D > \u103B ;" + "\u103C \u108A > \u103D \u103E;" + "\u103C > \u103D ;" + "\u108A > \u103D \u103E ;" + "\u103D > \u103E ;" + "\u1087 > \u103E ;" + "\u1088 > \u103E \u102F ;" + "\u1089 > \u103E \u1030 ;" + "\u1039 > \u103A ;" + "\u1033 > \u102F ;" + "\u1034 > \u1030 ;" + "\u105A > \u102B \u103A ;" + "\u108E > \u102D \u1036 ;" + "\u1031 \u1094 ($consonant) \u103D > $1 \u103E \u1031 \u1037 ;" + "\u1094 > \u1037 ;" + "\u1095 > \u1037 ;" + "\u1025 \u1061 > \u1009 \u1039 \u1001;" + "\u1025 \u1062 > \u1009 \u1039 \u1002;" + "\u1025 \u1065 > \u1009 \u1039 \u1005;" + "\u1025 \u1068 > \u1009 \u1039 \u1007;" + "\u1025 \u1076 > \u1009 \u1039 \u1013;" + "\u1025 \u1078 > \u1009 \u1039 \u1015;" + "\u1025 \u107A > \u1009 \u1039 \u1017;" + "\u1025 
\u1079 > \u1009 \u1039 \u1016;" + "\u1060 > \u1039 \u1000 ;" + "\u1061 > \u1039 \u1001 ;" + "\u1062 > \u1039 \u1002 ;" + "\u1063 > \u1039 \u1003 ;" + "\u1065 > \u1039 \u1005 ;" + "\u1066 > \u1039 \u1006 ;" + "\u1067 > \u1039 \u1006 ;" + "\u1068 > \u1039 \u1007 ;" + "\u1069 > \u1039 \u1008 ;" + "\u106C > \u1039 \u100B ;" + "\u106D > \u1039 \u100C ;" + "\u1070 > \u1039 \u100F ;" + "\u1071 > \u1039 \u1010 ;" + "\u1072 > \u1039 \u1010 ;" + "\u1096 > \u1039 \u1010 \u103D;" + "\u1073 > \u1039 \u1011 ;" + "\u1074 > \u1039 \u1011 ;" + "\u1075 > \u1039 \u1012 ;" + "\u1076 > \u1039 \u1013 ;" + "\u1077 > \u1039 \u1014 ;" + "\u1078 > \u1039 \u1015 ;" + "\u1079 > \u1039 \u1016 ;" + "\u107A > \u1039 \u1017 ;" + "\u107B > \u1039 \u1018 ;" + "\u1093 > \u1039 \u1018 ;" + "\u107C > \u1039 \u1019 ;" + "\u1085 > \u1039 \u101C ;" + "\u106E > \u100D\u1039\u100D ;" + "\u106F > \u100D\u1039\u100E ;" + "\u1091 > \u100F\u1039\u100D ;" + "\u1092 > \u100B\u1039\u100C ;" + "\u1097 > \u100B\u1039\u100B ;" + "\u104E > \u104E\u1004\u103A\u1038 ;" // #### STAGE (2): POST REORDERING RULES FOR UNICODE RENDERING + "::Null;" + "\u1044 \u103a > | \u104E \u103A ;" + "($nondigits) \u1040 ([\u102B-\u103F]) > $1 \u101D $2;" + "\u1031 \u1040 ($nondigits) > \u1031 \u101D $1;" + "\u1025 \u103A > \u1009 \u103A;" + "\u1025 \u102E > \u1026;" + "\u1037\u103A > \u103A\u1037;" + "\u1036 ($umedial*) ($vowelsign+) > $1 $2 \u1036 ;" + "([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) > $2 $1;" + "\u103C ($consonant) > $1 \u103C;" // #### Stage 3 + "::Null;" + "([\u1031]+) $ukinzi ($consonant) > $ukinzi $2 $1;" + "([\u1031]+) ($consonant) ($umedial+) > $2 $3 $1;" + "([\u1031]+) ($consonant) } [^\u103B\u103C\u103D\u103E] > $2 $1;" + "\u103C \u103A \u1039 ($consonant) > \u103A \u1039 $1 \u103C;" + "\u1036 ($umedial+) > $1 \u1036;" // #### Stage 4 + "::Null;" + "([\u103C\u103D\u103E]+) \u103B > \u103B $1;" + "([\u103D\u103E]+) \u103C > \u103C $1;" + "\u103E\u103D > \u103D\u103E ;" + "([\u1031]+) ($vowelsign*) \u1039 
($consonant) > \u1039 $3 $1 $2;" + "($vowelsign+) \u1039 ($consonant) > \u1039 $2 $1;" + "($umedial*) ([\u1031]+) ($umedial*) > $1 $3 $2;" + "\u1037 ([\u102D-\u1030\u1032\u1036]) > $1 \u1037;" + "\u1037 ($umedial+) > $1 \u1037;" + "($vowelsign+) ($umedial+) > $2 $1;" + "($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant)> $1 \u103A $2 $3;" // #### Stage 5. More reorderings + "::Null;" + "([\u1031]+) ($umedial+) > $2 $1;" + "($vowelsign) ($umedial) > $2 $1;" + "([\u103C\u103D\u103E]) \u103B > \u103B $1;" + "([\u103D\u103E]) \u103C > \u103C $1;" + "\u103E\u103D > \u103D\u103E ;" + "\u1038 ([$vowelmedial]) > $1 \u1038;" + "\u1038 ([\u1036\u1037\u103A]) > $1 \u1038;" // ### Stage 6 + "::Null;" + "($consonant) \u103B \u103A > $1 \u103A \u103B;" + "([\u103C\u103D\u103E]) \u103B > \u103B $1;" + "([\u103D\u103E]) \u103C > \u103C $1;" + "\u103E\u103D > \u103D\u103E ;" + "([\u102D-\u1030\u1032]) \u103A ($consonant) \u103A > $1 $2 \u103A;" + "\u102F \u103A > \u102F;" + "\u102D \u102E > \u102E;" + "\u102F \u1030 > \u102F;" + "\u102B [\u102B]+ > \u102B;" + "\u102C [\u102C]+ > \u102C;" + "\u102D [\u102D]+ > \u102D;" + "\u102E [\u102E]+ > \u102E;" + "\u102F [\u102F]+ > \u102F;" + "\u1030 [\u1030]+ > \u1030;" + "\u1031 [\u1031]+ > \u1031;" + "\u1032 [\u1032]+ > \u1032;" + "\u103A [\u103A]+ > \u103A;" + "\u103B [\u103B]+ > \u103B;" + "\u103C [\u103C]+ > \u103C;" + "\u103D [\u103D]+ > \u103D;" + "\u103E [\u103E]+ > \u103E;" // Try to correctly render diacritics after a space. + "$space([\u102e\u1037\u103a]) > \u00A0 $1 ;", Transliterator.FORWARD); // TODO(ccorn): set a filter on this to restrict to range \u1000-\u109f ??? /** * Detects Zawgyi encoding in specified input. * * @param value the string to be tested * @return True if text is Zawgyi encoded. False if Unicode */ public static Boolean isZawgyiEncoded(String value) { Matcher matcher = ZAWGYI_DETECT_PATTERN.matcher(value); return matcher.find(); } /** * Converts Zawgyi-encoded string into Unicode equivalent. 
* * @param value the Zawgyi string to be converted * @return the Unicode string from converstion */ public static String convertZawgyiToUnicode(String value) { return zawgyiUnicodeTransliterator.transform(value); } /** * Normalizes Burmese characters in specified input, detecting and converting Zawgyi encoding to * Unicode form. * * @param value the string to be normalized * @return the normalized Unicode string */ public static String standardizeMyanmar(String value) { if (isZawgyiEncoded(value)) { // Call the converter to produce a Unicode result. return zawgyiUnicodeTransliterator.transform(value); } return value; // Unchanged since it was not Zawgyi. } }
/** Utility to generate the Tansliteration resource bundle files. */ public class ConvertTransforms extends CLDRConverterTool { private static final int HELP1 = 0, HELP2 = 1, SOURCEDIR = 2, DESTDIR = 3, MATCH = 4, SKIP_COMMENTS = 5, WRITE_INDEX = 6, VERBOSE = 7; private static final UOption[] options = { UOption.HELP_H(), UOption.HELP_QUESTION_MARK(), UOption.SOURCEDIR().setDefault(CLDRPaths.COMMON_DIRECTORY + "transforms/"), UOption.DESTDIR().setDefault(CLDRPaths.GEN_DIRECTORY + "icu-transforms/"), UOption.create("match", 'm', UOption.REQUIRES_ARG).setDefault(".*"), UOption.create("commentSkip", 'c', UOption.NO_ARG), UOption.create("writeIndex", 'x', UOption.NO_ARG), UOption.VERBOSE(), }; static final String HELP_TEXT1 = "Use the following options" + XPathParts.NEWLINE + "-h or -?\t for this message" + XPathParts.NEWLINE + "-" + options[SOURCEDIR].shortName + "\t source directory. Default = -s" + CldrUtility.getCanonicalName(CLDRPaths.MAIN_DIRECTORY) + XPathParts.NEWLINE + "\tExample:-sC:\\Unicode-CVS2\\cldr\\common\\gen\\source\\" + XPathParts.NEWLINE + "-" + options[DESTDIR].shortName + "\t destination directory. 
Default = -d" + CldrUtility.getCanonicalName(CLDRPaths.GEN_DIRECTORY + "main/") + XPathParts.NEWLINE + "-m<regex>\t to restrict the files to what matches <regex>" + XPathParts.NEWLINE // "--writeIndex / -x to write the index (trnsfiles.mk)"+ XPathParts.NEWLINE ; // TODO add options to set input and output directories, matching pattern public static void main(String[] args) throws Exception { ConvertTransforms ct = new ConvertTransforms(); ct.processArgs(args); } private boolean skipComments; private boolean writeIndex = false; private boolean verbose = false; int fileCount = 0; public void writeTransforms(String inputDirectory, String matchingPattern, String outputDirectory) throws IOException { System.out.println(new File(inputDirectory).getCanonicalPath()); Factory cldrFactory = Factory.make(inputDirectory, matchingPattern); Set<String> ids = cldrFactory.getAvailable(); PrintWriter index = BagFormatter.openUTF8Writer(outputDirectory, "root.txt"); doHeader(index, "//", "root.txt"); try { index.println("root {"); index.println(" RuleBasedTransliteratorIDs {"); // addAlias(index, "Latin", "el", "", "Latin", "Greek", "UNGEGN"); // addAlias(index, "el", "Latin", "", "Greek", "Latin", "UNGEGN"); // addAlias(index, "Latin", "Jamo", "", "Latin", "ConjoiningJamo", ""); addAlias(index, "Tone", "Digit", "", "Pinyin", "NumericPinyin", ""); addAlias(index, "Digit", "Tone", "", "NumericPinyin", "Pinyin", ""); // addAlias(index, "Simplified", "Traditional", "", "Hans", "Hant", ""); // addAlias(index, "Traditional", "Simplified", "", "Hant", "Hans", ""); for (String id : ids) { if (id.equals("All")) continue; try { convertFile(cldrFactory, id, outputDirectory, index); } catch (IOException e) { System.err.println("Failure in: " + id); throw e; } } index.println(" }"); index.println(" TransliteratorNamePattern {"); index.println(" // Format for the display name of a Transliterator."); index.println(" // This is the language-neutral form of this resource."); index.println(" 
\"{0,choice,0#|1#{1}|2#{1}-{2}}\" // Display name"); index.println(" }"); index.println(" // Transliterator display names"); index.println(" // This is the English form of this resource."); index.println(" \"%Translit%Hex\" { \"%Translit%Hex\" }"); index.println(" \"%Translit%UnicodeName\" { \"%Translit%UnicodeName\" }"); index.println(" \"%Translit%UnicodeChar\" { \"%Translit%UnicodeChar\" }"); index.println(" TransliterateLATIN{ "); index.println(" \"\","); index.println(" \"\""); index.println(" }"); index.println("}"); } finally { index.close(); } } public static PrintWriter makePrintWriter(ByteArrayOutputStream bytes) { try { OutputStreamWriter outStream = new OutputStreamWriter(bytes, "UTF-8"); BufferedWriter buff = new BufferedWriter(outStream, 4 * 1024); PrintWriter p = new PrintWriter(buff); return p; } catch (Exception e) { System.err.println("Error: Could not create OutputStreamWriter."); } return null; } private void showComments(PrintWriter toilet, String value) { String[] lines = value.trim().split("\\r\\n?|\\n"); for (String line : lines) { if (!line.startsWith("#")) { line = "# " + line; } toilet.println(line); } } private void convertFile( Factory cldrFactory, String id, String outputDirectory, PrintWriter index) throws IOException { PrintWriter output = null; String filename = null; CLDRFile cldrFile = cldrFactory.make(id, false); boolean first = true; for (Iterator<String> it = cldrFile.iterator("", cldrFile.getComparator()); it.hasNext(); ) { String path = it.next(); if (path.indexOf("/version") >= 0 || path.indexOf("/generation") >= 0) { continue; } String value = cldrFile.getStringValue(path); if (first) { String fullPath = cldrFile.getFullXPath(path); filename = addIndexInfo(index, fullPath, id); if (filename == null) return; // not a transform file! 
output = BagFormatter.openUTF8Writer(outputDirectory, filename); doHeader(output, "#", filename); first = false; } if (path.indexOf("/comment") >= 0) { if (!skipComments) { showComments(output, value); } } else if (path.indexOf("/tRule") >= 0) { value = fixup.transliterate(value); output.println(value); } else { throw new IllegalArgumentException("Unknown element: " + path + "\t " + value); } } output.close(); } public static final Transliterator fixup = Transliterator.getInstance("[:Mn:]any-hex/java"); public static String replaceUnquoted(String value, String toReplace, String replacement) { // quick exit in most cases if (value.indexOf(toReplace) < 0) return value; String updatedValue = ""; int segmentStart = 0; boolean inQuotes = false; boolean ignoreCharValue = false; int length = value.length(); for (int pos = 0; pos < length; ++pos) { char curChar = (char) 0; if (ignoreCharValue) { ignoreCharValue = false; } else { curChar = value.charAt(pos); } if (curChar == '\\') { // escape, ignore the value of the next char (actually the next UTF16 code unit, but that // works here) ignoreCharValue = true; } boolean isLastChar = (pos + 1 >= length); if (curChar == '\'' || isLastChar) { // quote, begin or end of a quoted literal (in which no replacement takes place) if (inQuotes) { // End of a quoted segment; guaranteed to include at least opening quote. // Just add the segment (including current char) to updatedValue. updatedValue = updatedValue + value.substring(segmentStart, pos + 1); segmentStart = pos + 1; } else { if (isLastChar) ++pos; if (pos > segmentStart) { // End of a nonempty unquoted segment; perform requested replacements and // then add segment to updatedValue. 
String currentSegment = value.substring(segmentStart, pos); updatedValue = updatedValue + currentSegment.replace(toReplace, replacement); segmentStart = pos; } } inQuotes = !inQuotes; } // else the char just becomes part of the current segment } return updatedValue; } static XPathParts parts = new XPathParts(); private String addIndexInfo(PrintWriter index, String path, String transID) { parts.set(path); Map<String, String> attributes = parts.findAttributes("transform"); if (attributes == null) return null; // error, not a transform file String source = attributes.get("source"); String target = attributes.get("target"); String variant = attributes.get("variant"); String direction = attributes.get("direction"); String alias = attributes.get("alias"); String backwardAlias = attributes.get("backwardAlias"); String visibility = attributes.get("visibility"); String status = "internal".equals(visibility) ? "internal" : "file"; fileCount++; String id = source + "-" + target; String rid = target + "-" + source; String filename = source + "_" + target; if (variant != null) { id += "/" + variant; rid += "/" + variant; filename += "_" + variant; } filename += ".txt"; if (direction.equals("both") || direction.equals("forward")) { if (verbose) { System.out.println(" " + id + " " + filename + " " + "FORWARD"); } if (alias != null) { for (String ali : alias.trim().split("\\s+")) { addAlias(index, ali, id); } } index.println(" " + id + " {"); index.println(" " + status + " {"); index.println(" resource:process(transliterator) {\"" + filename + "\"}"); index.println(" direction {\"FORWARD\"}"); index.println(" }"); index.println(" }"); } if (direction.equals("both") || direction.equals("backward")) { if (verbose) { System.out.println(" " + rid + " " + filename + " " + "REVERSE"); } if (backwardAlias != null) { for (String bali : backwardAlias.trim().split("\\s+")) { addAlias(index, bali, rid); } } index.println(" " + rid + " {"); index.println(" " + status + " {"); index.println(" 
resource:process(transliterator) {\"" + filename + "\"}"); index.println(" direction {\"REVERSE\"}"); index.println(" }"); index.println(" }"); } index.println(); return filename; } void addAlias( PrintWriter index, String aliasSource, String aliasTarget, String aliasVariant, String originalSource, String originalTarget, String originalVariant) { // Spacedhan-Han { // alias {"null"} // } addAlias( index, getName(aliasSource, aliasTarget, aliasVariant), getName(originalSource, originalTarget, originalVariant)); } private void addAlias(PrintWriter index, String alias, String original) { index.println(" " + alias + " {"); index.println(" alias" + " {\"" + original + "\"}"); index.println(" }"); } String getName(String source, String target, String variant) { String id = source + "-" + target; if (variant != null && variant.length() != 0) { id += "/" + variant; } return id; } private void doHeader(PrintWriter output, String quoteSymbol, String filename) { SimpleDateFormat sdf = new SimpleDateFormat("yyyy"); output.print('\uFEFF'); output.println( quoteSymbol + " ***************************************************************************"); output.println(quoteSymbol + " *"); output.println( quoteSymbol + " * Copyright (C) 2004-" + sdf.format(new Date()) + ", International Business Machines"); output.println( quoteSymbol + " * Corporation; Unicode, Inc.; and others. 
All Rights Reserved."); output.println(quoteSymbol + " *"); output.println( quoteSymbol + " ***************************************************************************"); output.println(quoteSymbol + " File: " + filename); output.println(quoteSymbol + " Generated from CLDR "); output.println(quoteSymbol + ""); } public void processArgs(String[] args) { UOption.parseArgs(args, options); if (options[HELP1].doesOccur || options[HELP2].doesOccur) { System.out.println(HELP_TEXT1); return; } String sourceDir = options[SOURCEDIR].value; // Utility.COMMON_DIRECTORY + "transforms/"; String targetDir = options[DESTDIR].value; // Utility.GEN_DIRECTORY + "main/"; String match = options[MATCH].value; skipComments = options[SKIP_COMMENTS].doesOccur; writeIndex = options[WRITE_INDEX].doesOccur; verbose = options[VERBOSE].doesOccur; try { if (writeIndex) { throw new InternalError("writeIndex not implemented."); } else { ElapsedTimer et = new ElapsedTimer(); writeTransforms(sourceDir, match, targetDir + File.separator); System.out.println("ConvertTransforms: wrote " + fileCount + " files in " + et); } } catch (IOException ex) { RuntimeException e = new RuntimeException(); e.initCause(ex.getCause()); throw e; } finally { System.out.println("DONE"); } } // fixData ONLY NEEDED TO FIX FILE PROBLEM /* * private void fixData(String inputDirectory, String matchingPattern, String outputDirectory) throws IOException { * File dir = new File(inputDirectory); * File[] files = dir.listFiles(); * for (int i = 0; i < files.length; ++i) { * if (files[i].isDirectory()) continue; * BufferedReader input = BagFormatter.openUTF8Reader("", files[i].getCanonicalPath()); * PrintWriter output = BagFormatter.openUTF8Writer("", outputDirectory + files[i].getName()); * while (true) { * String line = input.readLine(); * if (line == null) break; * if (line.indexOf("DOCTYPE") >= 0) { * line = line.replaceAll(" ldml ", " supplementalData "); * } * output.println(line); * } * input.close(); * output.close(); * } * 
} */ }