public void TestJamo() throws IOException {
    // Builds both directions of the "Latin-ConjoiningJamo" transform, logs the sample set,
    // and hands everything to writeFile together with an "nfd" transliterator.
    final String transformId = "Latin-ConjoiningJamo";
    final Transliterator latinToJamo = Transliterator.getInstance(transformId);
    final Transliterator jamoToLatin =
        Transliterator.getInstance(transformId, Transliterator.REVERSE);

    final UnicodeSet samples = getRepresentativeHangul();
    logln(samples.size() + "\t" + samples.toPattern(false));

    final Transliterator nfd = Transliterator.getInstance("nfd");

    // The set passed on is a plain copy of the samples; no pairwise concatenations of
    // samples are generated here.
    final UnicodeSet testSet = new UnicodeSet(samples);

    // No special cases are supplied; the variable keeps the argument typed as a UnicodeSet.
    final UnicodeSet noSpecials = null;
    writeFile(transformId, testSet, nfd, jamoToLatin, latinToJamo, true, null, noSpecials);
  }
 /**
  * Wraps every char of {@code source} in an HTML {@code <span>} whose background colour depends
  * on which of the sets {@code lead}, {@code vowel} or {@code trail} contains it (checked in that
  * order); chars found in none of them get a white background.
  *
  * @param source text to decorate, processed one UTF-16 code unit at a time
  * @return the concatenated span markup; empty when {@code source} is empty
  */
 private static String pretty(String source) {
   final StringBuilder html = new StringBuilder();
   for (final char unit : source.toCharArray()) {
     final String background;
     if (lead.contains(unit)) {
       background = "FFcccc";
     } else if (vowel.contains(unit)) {
       background = "ccFFcc";
     } else if (trail.contains(unit)) {
       background = "ccccFF";
     } else {
       background = "FFFFFF";
     }
     html.append("<span style='background-color: #")
         .append(background)
         .append("'>")
         .append(unit)
         .append("</span>");
   }
   return html.toString();
 }
Beispiel #3
0
  /**
   * Replaces the contents of {@code uset} with the chars flagged in {@code set}. Described in the
   * original documentation as the equivalent of charSetToUSet.
   *
   * <p>{@code uset} is always cleared first. If {@code initNameSetsLengths()} reports failure,
   * nothing is added and {@code uset} stays empty.
   *
   * @param set Set of 256 bit flags corresponding to a set of chars.
   * @param uset USet to receive characters. Existing contents are deleted.
   */
  private void convert(int set[], UnicodeSet uset) {
    uset.clear();
    if (initNameSetsLengths()) {
      // Collect every char used in character names, walking from 255 down to 1.
      // The value 0 is never tested.
      char candidate = 255;
      while (candidate > 0) {
        if (contains(set, candidate)) {
          uset.add(candidate);
        }
        candidate--;
      }
    }
  }
  /**
   * Builds a set of two-code-unit strings obtained by composing jamo sequences from three
   * families of samples:
   *
   * <ul>
   *   <li>(U+1100 or U+110B) + V + U+110B + V, for every pair of V
   *   <li>U+1100 + V + L + U+1161, for every V and L
   *   <li>U+1100 U+1161 + T + L + U+1161, for every T and L
   * </ul>
   *
   * where L, V and T are the sets matching hst=L, hst=V and hst=T. A composed sample is kept only
   * when its length is exactly 2.
   *
   * @return a new set holding the retained composed strings
   */
  private static UnicodeSet getRepresentativeBoundaryHangul() {
    final UnicodeSet collected = new UnicodeSet();
    final UnicodeSet leads = new UnicodeSet("[:hst=L:]");
    final UnicodeSet vowels = new UnicodeSet("[:hst=V:]");
    final UnicodeSet trails = new UnicodeSet("[:hst=T:]");

    // U+1100 is HANGUL CHOSEONG KIYEOK, U+1161 is HANGUL JUNGSEONG A.
    final String kiyeokPlusA = "\u1100\u1161";
    final String kiyeok = "\u1100";
    final String vowelA = "\u1161";
    final String nullLead = "\u110B";

    final UnicodeSet firstLeads = new UnicodeSet("[\u1100\u110B]");

    // Every combination of firstLead + V + nullLead + V.
    final UnicodeSetIterator firstLeadIt = new UnicodeSetIterator(firstLeads);
    while (firstLeadIt.next()) {
      final String firstLead = firstLeadIt.getString();
      final UnicodeSetIterator vowelIt = new UnicodeSetIterator(vowels);
      while (vowelIt.next()) {
        final String head = firstLead + vowelIt.getString() + nullLead;
        final UnicodeSetIterator secondVowelIt = new UnicodeSetIterator(vowels);
        while (secondVowelIt.next()) {
          addIfComposedLengthIsTwo(collected, head + secondVowelIt.getString());
        }
      }
    }

    final UnicodeSetIterator leadIt = new UnicodeSetIterator(leads);
    while (leadIt.next()) {
      final String tail = leadIt.getString() + vowelA;

      // Every combination of kiyeok + V + L + vowelA.
      final UnicodeSetIterator vowelIt = new UnicodeSetIterator(vowels);
      while (vowelIt.next()) {
        addIfComposedLengthIsTwo(collected, kiyeok + vowelIt.getString() + tail);
      }

      // Every combination of kiyeokPlusA + T + L + vowelA.
      final UnicodeSetIterator trailIt = new UnicodeSetIterator(trails);
      while (trailIt.next()) {
        addIfComposedLengthIsTwo(collected, kiyeokPlusA + trailIt.getString() + tail);
      }
    }
    return collected;
  }

  /** Composes {@code sample} and adds the result to {@code target} only if its length is 2. */
  private static void addIfComposedLengthIsTwo(UnicodeSet target, String sample) {
    final String composed = Normalizer.compose(sample, false);
    if (composed.length() == 2) {
      target.add(composed);
    }
  }
  /**
   * Assembles the Hangul sample set: the output of {@code addRepresentativeHangul} for lengths 2
   * and 3 (first with {@code noFirstConsonant} false, then true), followed by the boundary cases
   * from {@code getRepresentativeBoundaryHangul()} and a fixed list of extra samples.
   *
   * @return a new set containing all of the above
   */
  public static UnicodeSet getRepresentativeHangul() {
    final UnicodeSet handPicked =
        new UnicodeSet(
            "[\uCE20{\uAD6C\uB514}{\uAD73\uC774}{\uBB34\uB837}{\uBB3C\uC5FF}{\uC544\uAE4C}{\uC544\uB530}{\uC544\uBE60}{\uC544\uC2F8}{\uC544\uC9DC}{\uC544\uCC28}{\uC545\uC0AC}{\uC545\uC2F8}{\uC546\uCE74}{\uC548\uAC00}{\uC548\uC790}{\uC548\uC9DC}{\uC548\uD558}{\uC54C\uAC00}{\uC54C\uB530}{\uC54C\uB9C8}{\uC54C\uBC14}{\uC54C\uBE60}{\uC54C\uC0AC}{\uC54C\uC2F8}{\uC54C\uD0C0}{\uC54C\uD30C}{\uC54C\uD558}{\uC555\uC0AC}{\uC555\uC2F8}{\uC558\uC0AC}{\uC5C5\uC12F\uC501}{\uC5C6\uC5C8\uC2B5}]");

    final UnicodeSet samples = new UnicodeSet();
    for (final boolean noFirstConsonant : new boolean[] {false, true}) {
      for (int length = 2; length <= 3; ++length) {
        addRepresentativeHangul(samples, length, noFirstConsonant);
      }
    }

    // Boundary cases: an example of each V + L case and of each T + L case.
    final UnicodeSet boundaryCases = getRepresentativeBoundaryHangul();
    samples.addAll(boundaryCases);
    samples.addAll(handPicked);
    return samples;
  }
 /**
  * Scans the chars from U+AC00 up to (but not including) U+D7AF and adds to {@code
  * resultToAddTo} each one whose decomposition contributes at least one code unit not
  * contributed by an earlier accepted char in this call.
  *
  * @param resultToAddTo set that receives the selected chars
  * @param leng only chars whose decomposition has exactly this length are considered
  * @param noFirstConsonant when true, only chars whose decomposition starts with "ᄋ" are
  *     considered; when false, only chars whose decomposition does not
  */
 private static void addRepresentativeHangul(
     UnicodeSet resultToAddTo, int leng, boolean noFirstConsonant) {
   // Code units of every decomposition accepted so far.
   final UnicodeSet covered = new UnicodeSet();
   for (char syllable = '\uAC00'; syllable < '\uD7AF'; ++syllable) {
     final String pieces = Normalizer.decompose(String.valueOf(syllable), false);
     final boolean eligible =
         pieces.length() == leng // only take one length at a time
             && pieces.startsWith("ᄋ") == noFirstConsonant;
     if (eligible && !covered.containsAll(pieces)) {
       resultToAddTo.add(syllable);
       covered.addAll(pieces);
     }
   }
 }
 /**
  * Runs every string of the {@code latin} set through {@code fromLatin}. Whenever the result
  * still contains something from {@code latin}, the item is passed to {@code showItems} and the
  * value it returns is added to the total.
  *
  * @param out writer forwarded to {@code showItems}
  * @param fromLatin transliterator applied to each source string
  * @param toLatin transliterator applied to the converted string
  * @return the sum of the values returned by {@code showItems}
  */
 private static int checkLatin(PrintWriter out, Transliterator fromLatin, Transliterator toLatin) {
   int total = 0;
   final UnicodeSetIterator cursor = new UnicodeSetIterator(latin);
   while (cursor.next()) {
     final String original = cursor.getString();
     final String converted = fromLatin.transliterate(original);
     if (!latin.containsSome(converted)) {
       continue;
     }
     // NOTE(review): toLatin is applied twice in a row here; confirm whether the second
     // pass was meant to go through fromLatin instead.
     final String reversed = toLatin.transliterate(converted);
     final String reversedAgain = toLatin.transliterate(reversed);
     total += showItems(out, false, original, converted, reversed, reversedAgain);
   }
   return total;
 }
  /**
   * Advances {@code rules} to its next boundary and returns it shifted by {@code workingOffset},
   * or returns {@code BreakIterator.DONE} unshifted when there is none.
   *
   * <p>Before returning, the boundary after that is peeked at. If it exists, the rule status is
   * 0, the current char of {@code working} is in {@code laoSet}, and {@code verifyPushBack}
   * accepts, then {@code working} is re-anchored one position before the boundary and that
   * earlier position is returned. Otherwise the peek is undone with {@code rules.previous()}.
   */
  @Override
  public int next() {
    final int start = current();
    int boundary = rules.next();
    if (boundary == BreakIterator.DONE) {
      return boundary;
    }
    boundary += workingOffset;

    final char ch = working.current();
    // Lookahead: when nothing follows, the boundary stands and rules is left where it is.
    if (rules.next() == BreakIterator.DONE) {
      return boundary;
    }

    final boolean pushBack =
        rules.getRuleStatus() == 0 && laoSet.contains(ch) && verifyPushBack(start, boundary);
    if (!pushBack) {
      rules.previous(); // undo the lookahead
      return boundary;
    }

    final int pulledBack = boundary - 1;
    workingOffset = pulledBack;
    working.setText(
        text.getText(), text.getStart() + pulledBack, text.getLength() - pulledBack);
    return pulledBack;
  }
 static {
   laoSet = new UnicodeSet("[:Lao:]");
   laoSet.compact();
   laoSet.freeze();
 }