public void TestJamo() throws IOException { { // CLDRTransforms.registerCldrTransforms(null, ".*(Jamo).*", out); String name = "Latin-ConjoiningJamo"; Transliterator fromLatin = Transliterator.getInstance(name); Transliterator toLatin = Transliterator.getInstance(name, Transliterator.REVERSE); UnicodeSet sourceSet = getRepresentativeHangul(); logln(sourceSet.size() + "\t" + sourceSet.toPattern(false)); Transliterator nfd = Transliterator.getInstance("nfd"); UnicodeSet multiply = new UnicodeSet(sourceSet); // for (UnicodeSetIterator it = new UnicodeSetIterator(sourceSet); it.next();) { // for (UnicodeSetIterator it2 = new UnicodeSetIterator(sourceSet); it2.next();) { // String source1 = it.getString() + it2.getString(); // try all combinations. // multiply.add(source1); // } // } // latin.addAll(toTarget.getSourceSet()) // .addAll(toTarget.getTargetSet()) // .addAll(fromTarget.getSourceSet()) // .addAll(fromTarget.getTargetSet()); // latin.retainAll(new UnicodeSet("[[:latin:][:common:][:inherited:]]")); // Transliterator.DEBUG = true; UnicodeSet specials = null; // new UnicodeSet("[{ch}]"); writeFile(name, multiply, nfd, toLatin, fromLatin, true, null, specials); } }
/**
 * Renders {@code source} as HTML, wrapping each char in a {@code <span>} whose background
 * colour shows which of the {@code lead}, {@code vowel} or {@code trail} sets contains it.
 * The sets are checked in that order; a char in none of them gets a white background.
 *
 * @param source the text to decorate
 * @return the concatenated span markup, one span per char of {@code source}
 */
private static String pretty(String source) {
    StringBuilder html = new StringBuilder();
    for (char ch : source.toCharArray()) {
        final String color;
        if (lead.contains(ch)) {
            color = "FFcccc";
        } else if (vowel.contains(ch)) {
            color = "ccFFcc";
        } else if (trail.contains(ch)) {
            color = "ccccFF";
        } else {
            color = "FFFFFF";
        }
        html.append("<span style='background-color: #")
                .append(color)
                .append("'>")
                .append(ch)
                .append("</span>");
    }
    return html.toString();
}
/** * Converts the char set cset into a Unicode set uset. Equivalent to charSetToUSet. * * @param set Set of 256 bit flags corresponding to a set of chars. * @param uset USet to receive characters. Existing contents are deleted. */ private void convert(int set[], UnicodeSet uset) { uset.clear(); if (!initNameSetsLengths()) { return; } // build a char string with all chars that are used in character names for (char c = 255; c > 0; c--) { if (contains(set, c)) { uset.add(c); } } }
private static UnicodeSet getRepresentativeBoundaryHangul() { UnicodeSet resultToAddTo = new UnicodeSet(); // U+1100 ( ᄀ ) HANGUL CHOSEONG KIYEOK // U+1161 ( ᅡ ) HANGUL JUNGSEONG A UnicodeSet L = new UnicodeSet("[:hst=L:]"); UnicodeSet V = new UnicodeSet("[:hst=V:]"); UnicodeSet T = new UnicodeSet("[:hst=T:]"); String prefixLV = "\u1100\u1161"; String prefixL = "\u1100"; String suffixV = "\u1161"; String nullL = "\u110B"; UnicodeSet L0 = new UnicodeSet("[\u1100\u110B]"); // do all combinations of L0 + V + nullL + V for (UnicodeSetIterator iL0 = new UnicodeSetIterator(L0); iL0.next(); ) { for (UnicodeSetIterator iV = new UnicodeSetIterator(V); iV.next(); ) { for (UnicodeSetIterator iV2 = new UnicodeSetIterator(V); iV2.next(); ) { String sample = iL0.getString() + iV.getString() + nullL + iV2.getString(); String trial = Normalizer.compose(sample, false); if (trial.length() == 2) { resultToAddTo.add(trial); } } } } for (UnicodeSetIterator iL = new UnicodeSetIterator(L); iL.next(); ) { // do all combinations of "g" + V + L + "a" final String suffix = iL.getString() + suffixV; for (UnicodeSetIterator iV = new UnicodeSetIterator(V); iV.next(); ) { String sample = prefixL + iV.getString() + suffix; String trial = Normalizer.compose(sample, false); if (trial.length() == 2) { resultToAddTo.add(trial); } } // do all combinations of "ga" + T + L + "a" for (UnicodeSetIterator iT = new UnicodeSetIterator(T); iT.next(); ) { String sample = prefixLV + iT.getString() + suffix; String trial = Normalizer.compose(sample, false); if (trial.length() == 2) { resultToAddTo.add(trial); } } } return resultToAddTo; }
public static UnicodeSet getRepresentativeHangul() { UnicodeSet extraSamples = new UnicodeSet( "[\uCE20{\uAD6C\uB514}{\uAD73\uC774}{\uBB34\uB837}{\uBB3C\uC5FF}{\uC544\uAE4C}{\uC544\uB530}{\uC544\uBE60}{\uC544\uC2F8}{\uC544\uC9DC}{\uC544\uCC28}{\uC545\uC0AC}{\uC545\uC2F8}{\uC546\uCE74}{\uC548\uAC00}{\uC548\uC790}{\uC548\uC9DC}{\uC548\uD558}{\uC54C\uAC00}{\uC54C\uB530}{\uC54C\uB9C8}{\uC54C\uBC14}{\uC54C\uBE60}{\uC54C\uC0AC}{\uC54C\uC2F8}{\uC54C\uD0C0}{\uC54C\uD30C}{\uC54C\uD558}{\uC555\uC0AC}{\uC555\uC2F8}{\uC558\uC0AC}{\uC5C5\uC12F\uC501}{\uC5C6\uC5C8\uC2B5}]"); UnicodeSet sourceSet = new UnicodeSet(); addRepresentativeHangul(sourceSet, 2, false); addRepresentativeHangul(sourceSet, 3, false); addRepresentativeHangul(sourceSet, 2, true); addRepresentativeHangul(sourceSet, 3, true); // add the boundary cases; we want an example of each case of V + L and one example of each case // of T+L UnicodeSet more = getRepresentativeBoundaryHangul(); sourceSet.addAll(more); sourceSet.addAll(extraSamples); return sourceSet; }
private static void addRepresentativeHangul( UnicodeSet resultToAddTo, int leng, boolean noFirstConsonant) { UnicodeSet notYetSeen = new UnicodeSet(); for (char c = '\uAC00'; c < '\uD7AF'; ++c) { String charStr = String.valueOf(c); String decomp = Normalizer.decompose(charStr, false); if (decomp.length() != leng) { continue; // only take one length at a time } if (decomp.startsWith("ᄋ") != noFirstConsonant) { continue; } if (!notYetSeen.containsAll(decomp)) { resultToAddTo.add(c); notYetSeen.addAll(decomp); } } }
/**
 * Checks every element of the {@code latin} set for incomplete conversion by
 * {@code fromLatin}.
 *
 * <p>Each element is transliterated with {@code fromLatin}. If the result still contains any
 * char from {@code latin}, the full round trip is computed and reported through
 * {@code showItems}: source, converted text ({@code fromLatin}), the text converted back
 * ({@code toLatin}), and the text converted forward again ({@code fromLatin}).
 *
 * <p>The last step previously applied {@code toLatin} a second time, so the final column did
 * not show a return to the target script; it now uses {@code fromLatin}.
 *
 * @param out writer passed through to {@code showItems}
 * @param fromLatin transliterator applied to the Latin source
 * @param toLatin transliterator applied to go back towards Latin
 * @return the sum of the values returned by {@code showItems}
 */
private static int checkLatin(PrintWriter out, Transliterator fromLatin, Transliterator toLatin) {
    int errorCount = 0;
    for (UnicodeSetIterator it = new UnicodeSetIterator(latin); it.next(); ) {
        String source = it.getString();
        String to = fromLatin.transliterate(source);
        // Only report items whose conversion left some Latin behind.
        if (latin.containsSome(to)) {
            String from = toLatin.transliterate(to);
            // Complete the round trip in the forward direction again.
            String backto = fromLatin.transliterate(from);
            errorCount += showItems(out, false, source, to, from, backto);
        }
    }
    return errorCount;
}
/**
 * Advances to the next boundary, with a one-step lookahead that may pull the boundary back
 * by one position.
 *
 * <p>The boundary comes from {@code rules.next()}, shifted by {@code workingOffset}. If
 * {@code rules} has a further boundary, and the rule status is 0, and the char read from
 * {@code working.current()} is in {@code laoSet}, and {@code verifyPushBack(current, next)}
 * accepts, then the working text is re-anchored at {@code next - 1} and {@code next - 1} is
 * returned. Otherwise the lookahead is undone and the shifted boundary is returned.
 *
 * @return {@code BreakIterator.DONE} if {@code rules} is exhausted, otherwise the boundary
 *     as described above
 */
@Override
public int next() {
    // Remember the starting position; it is only used by verifyPushBack below.
    int current = current();
    int next = rules.next();
    // rules reports positions relative to the working text, so shift by workingOffset.
    if (next == BreakIterator.DONE) return next;
    else next += workingOffset;
    // Read before the lookahead below moves rules.
    // NOTE(review): presumably the char at the new boundary; confirm how working tracks rules.
    char c = working.current();
    int following = rules.next(); // lookahead
    if (following != BreakIterator.DONE) {
        // The shifted value is not read again in this method.
        following += workingOffset;
        // getRuleStatus() is queried after the lookahead step.
        // NOTE(review): so it presumably reflects the lookahead boundary; confirm.
        if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
            // Push back: restart the working text one position before the boundary found.
            workingOffset = next - 1;
            working.setText(
                    text.getText(),
                    text.getStart() + workingOffset,
                    text.getLength() - workingOffset);
            return next - 1;
        }
        rules.previous(); // undo the lookahead
    }
    // NOTE(review): when the lookahead returned DONE, rules.previous() is not called here;
    // confirm that rules needs no repositioning in that case.
    return next;
}
// Build the Lao script set once at class-initialization time, then compact and freeze it
// before publishing it to the shared laoSet field.
static {
    UnicodeSet lao = new UnicodeSet("[:Lao:]");
    lao.compact();
    lao.freeze();
    laoSet = lao;
}