public static UnicodeSet getRepresentativeHangul() { UnicodeSet extraSamples = new UnicodeSet( "[\uCE20{\uAD6C\uB514}{\uAD73\uC774}{\uBB34\uB837}{\uBB3C\uC5FF}{\uC544\uAE4C}{\uC544\uB530}{\uC544\uBE60}{\uC544\uC2F8}{\uC544\uC9DC}{\uC544\uCC28}{\uC545\uC0AC}{\uC545\uC2F8}{\uC546\uCE74}{\uC548\uAC00}{\uC548\uC790}{\uC548\uC9DC}{\uC548\uD558}{\uC54C\uAC00}{\uC54C\uB530}{\uC54C\uB9C8}{\uC54C\uBC14}{\uC54C\uBE60}{\uC54C\uC0AC}{\uC54C\uC2F8}{\uC54C\uD0C0}{\uC54C\uD30C}{\uC54C\uD558}{\uC555\uC0AC}{\uC555\uC2F8}{\uC558\uC0AC}{\uC5C5\uC12F\uC501}{\uC5C6\uC5C8\uC2B5}]"); UnicodeSet sourceSet = new UnicodeSet(); addRepresentativeHangul(sourceSet, 2, false); addRepresentativeHangul(sourceSet, 3, false); addRepresentativeHangul(sourceSet, 2, true); addRepresentativeHangul(sourceSet, 3, true); // add the boundary cases; we want an example of each case of V + L and one example of each case // of T+L UnicodeSet more = getRepresentativeBoundaryHangul(); sourceSet.addAll(more); sourceSet.addAll(extraSamples); return sourceSet; }
private static void addRepresentativeHangul( UnicodeSet resultToAddTo, int leng, boolean noFirstConsonant) { UnicodeSet notYetSeen = new UnicodeSet(); for (char c = '\uAC00'; c < '\uD7AF'; ++c) { String charStr = String.valueOf(c); String decomp = Normalizer.decompose(charStr, false); if (decomp.length() != leng) { continue; // only take one length at a time } if (decomp.startsWith("ᄋ") != noFirstConsonant) { continue; } if (!notYetSeen.containsAll(decomp)) { resultToAddTo.add(c); notYetSeen.addAll(decomp); } } }