/**
 * Runs the "Latin-ConjoiningJamo" transform pair over the representative Hangul set and hands
 * the results to {@code writeFile}.
 *
 * <p>The forward instance is passed as the "from Latin" transliterator and the REVERSE instance
 * as the "to Latin" one; an "nfd" transliterator is supplied alongside them. The source set is
 * copied unchanged (an all-pairs expansion of the set used to live here but is disabled).
 *
 * @throws IOException declared because of the {@code writeFile} call
 */
public void TestJamo() throws IOException {
    final String transformId = "Latin-ConjoiningJamo";
    final Transliterator latinToJamo = Transliterator.getInstance(transformId);
    final Transliterator jamoToLatin =
        Transliterator.getInstance(transformId, Transliterator.REVERSE);

    final UnicodeSet hangul = getRepresentativeHangul();
    logln(hangul.size() + "\t" + hangul.toPattern(false));

    final Transliterator decomposer = Transliterator.getInstance("nfd");
    final UnicodeSet testSet = new UnicodeSet(hangul);

    // No special-case strings are checked at the moment.
    final UnicodeSet noSpecials = null;
    writeFile(transformId, testSet, decomposer, jamoToLatin, latinToJamo, true, null, noSpecials);
  }
  /**
   * Returns the files of a slide directory, sorted with a {@code FileSort} comparator.
   *
   * <p>The directory is {@code <appRootPath>slide/<userId>/<slideName>/}, where
   * {@code appRootPath} is read from the application properties. Before listing, every entry is
   * renamed to the "Katakana-Latin" transliteration of its file name.
   *
   * <p>If an entry whose name starts with "." is encountered, it is deleted and the method
   * returns {@code null} immediately (behaviour kept from the original implementation).
   *
   * @param userId owner of the slide; used as a path segment
   * @param slideName name of the slide; used as a path segment
   * @return the sorted directory entries; an empty array when the directory does not exist or
   *     cannot be listed; {@code null} when a dot-file was found and removed
   */
  public File[] getJpegList(String userId, String slideName) {

    // TODO: please fix this (application root is looked up from the properties on every call).
    String appRootPath = new PropertiesComponent().referProperties("appRootPath");

    // Directory to scan.
    String basePath = appRootPath + "slide/" + userId + "/" + slideName + "/";

    File[] files = new File(basePath).listFiles();
    if (files == null) {
      // listFiles() yields null when the path is not a directory or an I/O error occurs.
      return new File[0];
    }

    // Created lazily so that it is built at most once, and only if there is a file to rename.
    Transliterator transliterator = null;
    for (File entry : files) {
      String originalName = entry.getName();
      if (originalName.startsWith(".")) {
        // NOTE(review): aborting with null after deleting one hidden file looks unintended
        // (a `continue` may have been meant) - kept as-is because callers may rely on it.
        entry.delete();
        return null;
      }

      if (transliterator == null) {
        transliterator = Transliterator.getInstance("Katakana-Latin");
      }
      Path src = Paths.get(basePath + originalName);
      Path srcRename = Paths.get(basePath + transliterator.transliterate(originalName));
      try {
        Files.move(src, srcRename);
      } catch (IOException e) {
        // Best effort: a file that cannot be renamed is listed under its old name.
        e.printStackTrace();
      }
    }

    File[] renameFiles = new File(basePath).listFiles();
    if (renameFiles == null) {
      return new File[0];
    }

    Arrays.sort(renameFiles, new FileSort());
    return renameFiles;
  }
Exemple #3
0
  /**
   * Prints every available transliterator ID, then writes one {@code rowXX.trans} table (XX =
   * 00..ff) per 256-character row into the current working directory.
   *
   * <p>Rows for which {@code printRow} signals {@code UselessException} have their file removed
   * again.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    int count = 0;
    Enumeration<String> targets = Transliterator.getAvailableIDs();
    while (targets.hasMoreElements()) {
      System.out.println(targets.nextElement());
      count++;
    }

    System.out.println("number " + count);

    trans = Transliterator.getInstance("Any-Latin");
    decomposed = Transliterator.getInstance("Any-Latin; nfd");

    for (int row = 0; row < 256; row++) {
      String name = String.format("row%02x.trans", row);
      // try-with-resources closes the writer BEFORE any catch clause runs, so the file is no
      // longer open when it is deleted below (deleting an open file fails on some platforms).
      // NOTE(review): this replaces the former Utils.closeFile(out) in a finally block;
      // presumably that helper only performed a null-safe close - confirm.
      try (PrintWriter out = new PrintWriter(new FileWriter(name))) {
        printRow(out, row);
      } catch (IOException e) {
        System.out.println("Could not open " + name + " for write");
      } catch (UselessException e) {
        // Nothing useful was written for this row: discard the file.
        new File(name).delete();
      }
    }
  }
Exemple #4
0
 /**
  * Converts a string into a lower-case, hyphen-separated token.
  *
  * <p>Steps: transliterate to Latin and strip non-spacing marks (accents), trim, turn each
  * space into a hyphen, drop characters that are neither letters, digits nor hyphens, and
  * collapse consecutive hyphens. The result is lower-cased with {@code Locale.ROOT} so that it
  * does not vary with the JVM's default locale (e.g. Turkish dotless i).
  *
  * <p>NOTE(review): the character at index 0 is never removed, even when it is not a letter or
  * digit, because the removal checks require {@code i > 0}. Kept as in the original; confirm
  * whether that is intended.
  *
  * @param str the string to clean; must not be {@code null}
  * @return the cleaned, lower-case string
  */
 public static String cleanString(String str) {
   Transliterator accentsconverter =
       Transliterator.getInstance("Latin; NFD; [:Nonspacing Mark:] Remove; NFC;");
   str = accentsconverter.transliterate(str);
   // Original note: one character (garbled to "?" in the source) is reportedly not mapped to
   // "d" by the transliteration above.
   StringBuilder cleanedStr = new StringBuilder(str.trim());
   // Single pass; whenever a character is deleted the index is stepped back so that the
   // character that slid into position i is examined on the next iteration.
   for (int i = 0; i < cleanedStr.length(); i++) {
     char c = cleanedStr.charAt(i);
     if (c == ' ') {
       if (i > 0 && cleanedStr.charAt(i - 1) == '-') {
         // A space right after a hyphen would yield a double hyphen: drop it.
         cleanedStr.deleteCharAt(i--);
       } else {
         cleanedStr.setCharAt(i, '-');
       }
       continue;
     }
     if (i > 0 && !(Character.isLetterOrDigit(c) || c == '-')) {
       cleanedStr.deleteCharAt(i--);
       continue;
     }
     if (i > 0 && c == '-' && cleanedStr.charAt(i - 1) == '-') {
       cleanedStr.deleteCharAt(i--);
     }
   }
   return cleanedStr.toString().toLowerCase(java.util.Locale.ROOT);
 }
 /**
  * Asserts that {@code source} survives a round trip: {@code hl} applied first, then
  * {@code lh} applied to that result.
  *
  * <p>The failure message also shows {@code hl} applied to the source and to the round-trip
  * result with a space inserted after every character; those two values are diagnostic only.
  *
  * @param message prefix for the assertion message
  * @param source text expected to be reproduced by the round trip
  * @param lh transliterator applied second (back towards the source)
  * @param hl transliterator applied first
  */
 private void assertRoundTripTransform(
     String message, String source, Transliterator lh, Transliterator hl) {
   final String forward = hl.transform(source);
   final String roundTripped = lh.transform(forward);

   final String spacedSource = source.replaceAll("(.)", "$1 ").trim();
   final String forwardOfSpacedSource = hl.transform(spacedSource);
   final String spacedRoundTrip = roundTripped.replaceAll("(.)", "$1 ").trim();
   final String forwardOfSpacedRoundTrip = hl.transform(spacedRoundTrip);

   final StringBuilder description = new StringBuilder();
   description
       .append(message)
       .append(" ")
       .append(source)
       .append(" [")
       .append(forward)
       .append("/")
       .append(forwardOfSpacedSource)
       .append("/")
       .append(forwardOfSpacedRoundTrip)
       .append("]");
   assertEquals(description.toString(), source, roundTripped);
 }
 /**
  * Normalizes a search token.
  *
  * @param searchToken the raw token
  * @return the token run through {@code normalizer()} when {@code TransliteratorManager.init}
  *     reports success, otherwise simply the lower-cased token
  */
 private String normalizeToken(final String searchToken) {
   if (!TransliteratorManager.init(null)) {
     // The transliterators are not available yet, so fall back to plain lower-casing.
     return searchToken.toLowerCase();
   }
   return normalizer().transliterate(searchToken);
 }
  /**
   * Checks the "Latin-Hangul" transform and its inverse: first a round trip over every string of
   * the representative Hangul set, then a fixed list of syllable pairs with their expected
   * romanizations.
   *
   * <p>Known issues recorded with this test:
   *
   * <ol>
   *   <li>The Latin-to-Hangul transliterator maps 'ch' to '킇' (splitting the sequence into 'c'
   *       and 'h' and inserting an implicit vowel 'ㅡ'). It would be better to map a
   *       <em>stand-alone</em> 'ch' to '츠'.
   *   <li>As mentioned in http://www.unicode.org/cldr/transliteration_guidelines.html (Korean
   *       section): altta = alt-ta 앑타 should be 'al-tta 알따', and alppa = alp-pa 앒파 should be
   *       'al-ppa 알빠'.
   * </ol>
   */
  public void TestHangul2() {
    final Transliterator lh = Transliterator.getInstance("Latin-Hangul");
    final Transliterator hl = lh.getInverse();

    final UnicodeSet representativeHangul = getRepresentativeHangul();
    final UnicodeSetIterator cursor = new UnicodeSetIterator(representativeHangul);
    while (cursor.next()) {
      assertRoundTripTransform("Transform", cursor.getString(), lh, hl);
    }

    assertTransform("Transform", "츠", lh, "ch");

    // Each row: Hangul text followed by the two Latin forms passed on to assertTransform.
    final String[][] syllableCases = {
      {"아따", "atta", "a-tta"},
      {"아빠", "appa", "a-ppa"},
      {"아짜", "ajja", "a-jja"},
      {"아까", "akka", "a-kka"},
      {"아싸", "assa", "a-ssa"},
      {"아차", "acha", "a-cha"},
      {"악사", "agsa", "ag-sa"},
      {"안자", "anja", "an-ja"},
      {"안하", "anha", "an-ha"},
      {"알가", "alga", "al-ga"},
      {"알마", "alma", "al-ma"},
      {"알바", "alba", "al-ba"},
      {"알사", "alsa", "al-sa"},
      {"알타", "alta", "al-ta"},
      {"알파", "alpa", "al-pa"},
      {"알하", "alha", "al-ha"},
      {"압사", "absa", "ab-sa"},
      {"안가", "anga", "an-ga"},
      {"악싸", "agssa", "ag-ssa"},
      {"안짜", "anjja", "an-jja"},
      {"알싸", "alssa", "al-ssa"},
      {"알따", "altta", "al-tta"},
      {"알빠", "alppa", "al-ppa"},
      {"압싸", "abssa", "ab-ssa"},
      {"앆카", "akkka", "akk-ka"},
      {"았사", "asssa", "ass-sa"},
    };
    for (String[] row : syllableCases) {
      assertTransform("Transform", row[0], lh, hl, row[1], row[2]);
    }
  }
 /**
  * Walks the {@code latin} set and reports every string whose {@code fromLatin} result still
  * contains characters from that set.
  *
  * @param out destination handed to {@code showItems}
  * @param fromLatin transliterator applied to each source string
  * @param toLatin transliterator applied to the intermediate results
  * @return the sum of the values returned by {@code showItems}
  */
 private static int checkLatin(PrintWriter out, Transliterator fromLatin, Transliterator toLatin) {
   int total = 0;
   final UnicodeSetIterator cursor = new UnicodeSetIterator(latin);
   while (cursor.next()) {
     final String source = cursor.getString();
     final String to = fromLatin.transliterate(source);
     if (!latin.containsSome(to)) {
       continue;
     }
     final String from = toLatin.transliterate(to);
     // NOTE(review): toLatin is applied a second time here; a round trip would presumably use
     // fromLatin - confirm which one is intended.
     final String backto = toLatin.transliterate(from);
     total += showItems(out, false, source, to, from, backto);
   }
   return total;
 }
 /**
  * Builds the factory from its settings.
  *
  * <p>Reads {@code id} (default "Null") and {@code dir} (default "forward"). Any {@code dir}
  * value other than exactly "forward" selects the reverse direction.
  *
  * @param indexSettings passed to the superclass
  * @param environment not used by this constructor
  * @param name passed to the superclass
  * @param settings source of the {@code id} and {@code dir} values
  */
 public IcuTransformTokenFilterFactory(
     IndexSettings indexSettings, Environment environment, String name, Settings settings) {
   super(indexSettings, name, settings);
   this.id = settings.get("id", "Null");
   final String direction = settings.get("dir", "forward");
   if ("forward".equals(direction)) {
     this.dir = Transliterator.FORWARD;
   } else {
     this.dir = Transliterator.REVERSE;
   }
   this.transliterator = Transliterator.getInstance(id, dir);
 }
 /**
  * Feeds the "Tamil-Devanagari" transform and its REVERSE counterpart to {@code writeFile},
  * using the Tamil block minus U+0BD7 as the source set.
  *
  * @throws IOException declared because of the {@code writeFile} call
  */
 public void xTestTamil() throws IOException {
   final String name = "Tamil-Devanagari";
   final Transliterator forward = Transliterator.getInstance(name);
   final Transliterator reverse = Transliterator.getInstance(name, Transliterator.REVERSE);
   final UnicodeSet tamilBlock = new UnicodeSet("[[:block=tamil:]-[ௗ]]");
   writeFile(name, tamilBlock, null, forward, reverse, false, null, null);
 }
 /**
  * Checks one source string against a transliterator pair and reports problems through
  * {@code showItems}.
  *
  * <p>Without a separator, the string (optionally passed through {@code nfd} first) must be
  * reproduced by {@code toLatin} followed by {@code fromLatin}. With a separator, the check is
  * that the separator is only emitted when needed: if removing every occurrence of it from the
  * Latin form still converts back to the same text, that is counted as a failure.
  *
  * @param out destination handed to {@code showItems}
  * @param source1 the raw source string
  * @param nfd optional transliterator applied to {@code source1} first; may be {@code null}
  * @param fromLatin converts the Latin form back
  * @param toLatin converts the source to its Latin form
  * @param errorCount running error total on entry
  * @param separator separator to verify, or {@code null} to run the round-trip check instead
  * @return {@code errorCount}, incremented by one if this string failed
  */
 private static int checkString(
     PrintWriter out,
     String source1,
     Transliterator nfd,
     Transliterator fromLatin,
     Transliterator toLatin,
     int errorCount,
     String separator) {
   String source = nfd == null ? source1 : nfd.transliterate(source1);
   String to = toLatin.transliterate(source);
   String from = fromLatin.transliterate(to);
   if (separator == null) {
     final boolean bad = !source.equals(from);
     if (bad || verbose) {
       String backto = toLatin.transliterate(from);
       if (bad) {
         errorCount += 1;
       }
       showItems(out, false, source, to, from, backto, bad ? "FAIL" : null);
     }
   } else if (to.contains(separator)) { // check separators, only put in when needed
     // Strip the separator that was actually asked for (this used to hard-code "-", which
     // disagreed with the contains() test above for any other separator).
     String otherTo = to.replace(separator, "");
     String otherFrom = fromLatin.transliterate(otherTo);
     if (otherFrom.equals(from)) {
       // Same result without the separator, so it was not needed.
       errorCount += 1;
       showItems(out, false, source, to, from, otherTo, otherFrom, "FAIL");
     }
   }
   return errorCount;
 }
Exemple #12
0
  /**
   * Writes the transliteration table for one 256-character row, i.e. the characters
   * {@code (row << 8) + 0} through {@code (row << 8) + 255}.
   *
   * <p>Each line has the form {@code U+rrii <result> # Character <c>}. A character that
   * {@code trans} leaves unchanged gets the result "?". A line whose result contains a character
   * above 0xff is prefixed with "#".
   *
   * @param out destination of the table
   * @param row the high byte of the characters to list
   * @throws UselessException if no character of the row was changed into a result that stays
   *     within 0x00..0xff
   */
  private static void printRow(PrintWriter out, int row) throws UselessException {
    final String[] header = {
      "#",
      "# This is a table for transliterating characters.",
      "# It was created using icu4j",
      "#",
      "# All resulting strings that contained characters outside the",
      "# range of iso 8859-1 are commented out",
      "#",
    };
    for (String headerLine : header) {
      out.println(headerLine);
    }
    out.println();

    int usable = 0;
    for (int low = 0; low < 256; low++) {
      final char c = (char) ((row << 8) + low);
      final String single = String.valueOf(c);
      String result = trans.transliterate(single);

      final boolean unchanged = result.length() == 1 && result.charAt(0) == c;
      if (unchanged) {
        result = "?";
      }

      boolean inRange = true;
      for (int k = 0; inRange && k < result.length(); k++) {
        inRange = result.charAt(k) <= 0xff;
      }

      // Only changed characters whose result fits into 0x00..0xff count as useful.
      if (inRange) {
        if (!unchanged) {
          usable++;
        }
      } else {
        out.print("#");
      }

      out.format("U+%02x%02x %-12.12s # Character %s", row, low, result, single);
      out.println();
    }

    if (usable == 0) {
      throw new UselessException();
    }
  }
 /** Checks two fixed Han/Latin string pairs against the "Han-Latin" transliterator. */
 public void TestChinese() {
   final Transliterator hanLatin = Transliterator.getInstance("Han-Latin");
   // Each row: the Latin text and the Han text, in the argument order of assertTransform.
   final String[][] pairs = {
     {"zào Unicode", "造Unicode"},
     {"zài chuàng zào Unicode zhī qián", "在創造Unicode之前"},
   };
   for (String[] pair : pairs) {
     assertTransform("Transform", pair[0], hanLatin, pair[1]);
   }
 }
 /**
  * Prints the ID of every available transliterator, one per line.
  *
  * @param args ignored
  */
 public static void main(String[] args) {
   /*x ShowTransliterations.1 */
   for (Enumeration<String> ids = Transliterator.getAvailableIDs(); ids.hasMoreElements(); ) {
     System.out.println(ids.nextElement());
   }
   /*x*/
 }
 /**
  * Returns the normalizing transliterator, building it from {@code normalizerRules} on the
  * first call. Creation is deferred because it can be slow.
  *
  * @return the cached transliterator
  */
 public synchronized Transliterator normalizer() {
   if (normalizer != null) {
     return normalizer;
   }
   normalizer = Transliterator.createFromRules("", normalizerRules, Transliterator.FORWARD);
   return normalizer;
 }
/**
 * Class for detecting and converting Zawgyi-encoded data.
 *
 * <p>Detection is a single regular expression made of many alternatives, each describing a
 * character sequence that is taken as evidence of Zawgyi encoding. Conversion is a rule-based
 * transliterator that first maps code points and then reorders them in several passes.
 */
public class MyanmarZawgyiConverter {
  // For detecting if Myanmar text is encoded with Zawgyi vs. Unicode characters.

  private static final Pattern ZAWGYI_DETECT_PATTERN =
      PatternCache.get(
          // A regular expression matched if text is Zawgyi encoding.
          // Using the ranges 1033-1034 or 1060-1097 will report Shan, Karen,
          // etc. as Zawgyi.
          "[\u105a\u1060-\u1097]|" // Zawgyi characters outside Unicode range
              + "[\u1033\u1034]|" // These are Mon characters
              + "\u1031\u108f|"
              + "\u1031[\u103b-\u103e]|" // Medial right after \u1031
              + "[\u102b-\u1030\u1032]\u1031|" // Vowel sign right before \u1031
              + " \u1031| \u103b|" // Unexpected characters after a space
              + "^\u1031|^\u103b|\u1038\u103b|\u1038\u1031|"
              + "[\u102d\u102e\u1032]\u103b|\u1039[^\u1000-\u1021]|\u1039$"
              + "|\u1004\u1039[\u1001-\u102a\u103f\u104e]" // Missing ASAT in Kinzi
              // 1039 not before a consonant. (The range start was written without its
              // backslash, which turned it into the literal characters 'u', '1', '0'.)
              + "|\u1039[^\u1000-\u102a\u103f\u104e]"
              // Out of order medials
              + "|\u103c\u103b|\u103d\u103b"
              + "|\u103e\u103b|\u103d\u103c"
              + "|\u103e\u103c|\u103e\u103d"
              // Bad medial combos
              + "|\u103b\u103c"
              // Out of order vowel signs
              + "|[\u102f\u1030\u102b\u102c][\u102d\u102e\u1032]"
              + "|[\u102b\u102c][\u102f\u102c]"
              // Digit before diacritic
              + "|[\u1040-\u1049][\u102b-\u103e\u102b-\u1030\u1032\u1036\u1037\u1038\u103a]"
              // Single digit 0, 7 at start
              + "|^[\u1040\u1047][^\u1040-\u1049]"
              // Second 1039 with bad followers
              + "|[\u1000-\u102a\u103f\u104e]\u1039[\u101a\u101b\u101d\u101f\u1022-\u103f]"
              // Other bad combos.
              + "|\u103a\u103e"
              // (A stray ']' used to follow here, so this alternative also required a literal
              // closing bracket in the text and could not match real Burmese input.)
              + "|\u1036\u102b"
              // multiple upper vowels
              + "|\u102d[\u102e\u1032]|\u102e[\u102d\u1032]|\u1032[\u102d\u102e]"
              // Multiple lower vowels
              + "|\u102f\u1030|\u1030\u102f"
              // Multiple A vowels
              + "|\u102b\u102c|\u102c\u102b"
              // Shan digits with vowels or medials or other signs
              + "|[\u1090-\u1099][\u102b-\u1030\u1032\u1037\u103a-\u103e]"
              // Isolated Shan digit
              + "|[\u1000-\u10f4][\u1090-\u1099][\u1000-\u104f]"
              + "|^[\u1090-\u1099][\u1000-\u102a\u103f\u104e\u104a\u104b]"
              + "|[\u1000-\u104f][\u1090-\u1099]$"
              // Diacritics with non-Burmese vowel signs
              + "|[\u105e-\u1060\u1062-\u1064\u1067-\u106d\u1071-\u1074\u1082-\u108d"
              + "\u108f\u109a-\u109d]"
              + "[\u102b-\u103e]"
              // Consonant 103a + some vowel signs
              + "|[\u1000-\u102a]\u103a[\u102d\u102e\u1032]"
              // 1031 after other vowel signs
              + "|[\u102b-\u1030\u1032\u1036-\u1038\u103a]\u1031"
              // Using Shan combining characters with other languages.
              + "|[\u1087-\u108d][\u106e-\u1070\u1072-\u1074]"
              // Non-Burmese diacritics at start, following space, or following sections
              + "|^[\u105e-\u1060\u1062-\u1064\u1067-\u106d\u1071-\u1074"
              + "\u1082-\u108d\u108f\u109a-\u109d]"
              + "|[\u0020\u104a\u104b][\u105e-\u1060\u1062-\u1064\u1067-\u106d"
              + "\u1071-\u1074\u1082-\u108d\u108f\u109a-\u109d]"
              // Wrong order with 1036
              + "|[\u1036\u103a][\u102d-\u1030\u1032]"
              // Odd stacking
              + "|[\u1025\u100a]\u1039"
              // More mixing of non-Burmese languages
              + "|[\u108e-\u108f][\u1050-\u108d]"
              // Bad diacritic combos. (The opening '[' was missing and a stray ']' trailed
              // the alternative, so it was read as one long literal sequence.)
              + "|[\u102d-\u1030\u1032\u1036-\u1037]\u1039"
              // Dot before subscripted consonant
              + "|[\u1000-\u102a\u103f\u104e]\u1037\u1039"
              // Odd subscript + vowel signs
              + "|[\u1000-\u102a\u103f\u104e]\u102c\u1039[\u1000-\u102a\u103f\u104e]"
              // Medials after vowels
              + "|[\u102b-\u1030\u1032][\u103b-\u103e]"
              // Medials
              + "|\u1032[\u103b-\u103e]"
              // Medial with 101b
              + "|\u101b\u103c"
              // Stacking too deeply: consonant 1039 consonant 1039 consonant
              + "|[\u1000-\u102a\u103f\u104e]\u1039[\u1000-\u102a\u103f\u104e]\u1039"
              + "[\u1000-\u102a\u103f\u104e]"
              // Stacking pattern consonant 1039 consonant 103a other vowel signs
              + "|[\u1000-\u102a\u103f\u104e]\u1039[\u1000-\u102a\u103f\u104e]"
              + "[\u102b\u1032\u103d]"
              // Odd stacking over u1021, u1019, and u1000
              + "|[\u1000\u1005\u100f\u1010\u1012\u1014\u1015\u1019\u101a]\u1039\u1021"
              + "|[\u1000\u1010]\u1039\u1019"
              + "|\u1004\u1039\u1000"
              + "|\u1015\u1039[\u101a\u101e]"
              + "|\u1000\u1039\u1001\u1036"
              + "|\u1039\u1011\u1032"
              // Vowel sign in wrong order
              + "|\u1037\u1032"
              + "|\u1036\u103b"
              // Duplicated vowel
              + "|\u102f\u102f");

  // Transliteration to convert Burmese text in Zawgyi-encoded string to
  // standard Unicode codepoints and ordering.
  static final Transform<String, String> zawgyiUnicodeTransliterator =
      // Transliteration rules, 07-Jan-2014.
      Transliterator.createFromRules(
          "zawgyi-unicode",
          // Modern Burmese digits & Unicode code points.
          "$nondigits = [^\u1040-\u1049];"
              + "$space = ' ';"
              + "$consonant = [\u1000-\u1021];"
              + "$vowelsign = [\u102B-\u1030\u1032];"
              + "$umedial = [\u103B-\u103E];"
              + "$vowelmedial = [\u102B-\u1030\u1032\u103B-\u103F];"
              + "$ukinzi = \u1004\u103A\u1039;"
              + "$zmedialra = [\u103B\u107E-\u1084];"
              // #### STAGE (1): CODEPOINT MAPPING FROM ZAWGYI TO UNICODE
              + "($consonant) \u103A \u1064 > $ukinzi $1 \u103B;"
              + "($consonant) \u1064 > $ukinzi $1;"
              + "\u1064 > $ukinzi;"
              + "($consonant) \u108b > $ukinzi $1 \u102D;"
              + "($consonant) \u108C > $ukinzi $1 \u102E;"
              + "($consonant) \u108D > $ukinzi $1 \u1036;"
              + "($consonant) \u103A \u1033 \u108B > $ukinzi $1 \u103B \u102D \u102F;"
              + "($consonant) \u103A \u108b > $ukinzi $1 \u103B \u102D ;"
              + "($consonant) \u103A \u108C \u1033 > $ukinzi $1 \u103B \u102E \u102F;"
              + "($consonant) \u103A \u108C > $ukinzi $1 \u103B \u102E ;"
              + "($consonant) \u103A \u108D > $ukinzi $1 \u103B \u1036 ;"
              + "($consonant) \u103A \u108e > $1 \u103B \u102D \u1036 ;"
              + "\u108B > $ukinzi \u102D ;"
              + "\u108C > $ukinzi \u102E ;"
              + "\u108D > $ukinzi \u1036 ;"
              + "\u106A ($vowelsign) \u1038 > \u1025 $1 \u1038 ;"
              + "\u106A > \u1009 ;"
              + "\u106B > \u100A ;"
              + "\u108F > \u1014 ;"
              + "\u1090 > \u101B ;"
              + "\u1086 > \u103F ;"
              + "\u103A > \u103B ;"
              + "\u107D > \u103B ;"
              + "\u103C \u108A > \u103D \u103E;"
              + "\u103C > \u103D ;"
              + "\u108A > \u103D \u103E ;"
              + "\u103D > \u103E ;"
              + "\u1087 > \u103E ;"
              + "\u1088 > \u103E \u102F ;"
              + "\u1089 > \u103E \u1030 ;"
              + "\u1039 > \u103A ;"
              + "\u1033 > \u102F ;"
              + "\u1034 > \u1030 ;"
              + "\u105A > \u102B \u103A ;"
              + "\u108E > \u102D \u1036 ;"
              + "\u1031 \u1094 ($consonant) \u103D > $1 \u103E \u1031 \u1037 ;"
              + "\u1094 > \u1037 ;"
              + "\u1095 > \u1037 ;"
              + "\u1025 \u1061 > \u1009 \u1039 \u1001;"
              + "\u1025 \u1062 > \u1009 \u1039 \u1002;"
              + "\u1025 \u1065 > \u1009 \u1039 \u1005;"
              + "\u1025 \u1068 > \u1009 \u1039 \u1007;"
              + "\u1025 \u1076 > \u1009 \u1039 \u1013;"
              + "\u1025 \u1078 > \u1009 \u1039 \u1015;"
              + "\u1025 \u107A > \u1009 \u1039 \u1017;"
              + "\u1025 \u1079 > \u1009 \u1039 \u1016;"
              + "\u1060 > \u1039 \u1000 ;"
              + "\u1061 > \u1039 \u1001 ;"
              + "\u1062 > \u1039 \u1002 ;"
              + "\u1063 > \u1039 \u1003 ;"
              + "\u1065 > \u1039 \u1005 ;"
              + "\u1066 > \u1039 \u1006 ;"
              + "\u1067 > \u1039 \u1006 ;"
              + "\u1068 > \u1039 \u1007 ;"
              + "\u1069 > \u1039 \u1008 ;"
              + "\u106C > \u1039 \u100B ;"
              + "\u106D > \u1039 \u100C ;"
              + "\u1070 > \u1039 \u100F ;"
              + "\u1071 > \u1039 \u1010 ;"
              + "\u1072 > \u1039 \u1010 ;"
              + "\u1096 > \u1039 \u1010 \u103D;"
              + "\u1073 > \u1039 \u1011 ;"
              + "\u1074 > \u1039 \u1011 ;"
              + "\u1075 > \u1039 \u1012 ;"
              + "\u1076 > \u1039 \u1013 ;"
              + "\u1077 > \u1039 \u1014 ;"
              + "\u1078 > \u1039 \u1015 ;"
              + "\u1079 > \u1039 \u1016 ;"
              + "\u107A > \u1039 \u1017 ;"
              + "\u107B > \u1039 \u1018 ;"
              + "\u1093 > \u1039 \u1018 ;"
              + "\u107C > \u1039 \u1019 ;"
              + "\u1085 > \u1039 \u101C ;"
              + "\u106E > \u100D\u1039\u100D ;"
              + "\u106F > \u100D\u1039\u100E ;"
              + "\u1091 > \u100F\u1039\u100D ;"
              + "\u1092 > \u100B\u1039\u100C ;"
              + "\u1097 > \u100B\u1039\u100B ;"
              + "\u104E > \u104E\u1004\u103A\u1038 ;"
              // #### STAGE (2): POST REORDERING RULES FOR UNICODE RENDERING
              + "::Null;"
              + "\u1044 \u103a > | \u104E \u103A ;"
              + "($nondigits) \u1040 ([\u102B-\u103F]) > $1 \u101D $2;"
              + "\u1031 \u1040 ($nondigits) > \u1031 \u101D $1;"
              + "\u1025 \u103A > \u1009 \u103A;"
              + "\u1025 \u102E > \u1026;"
              + "\u1037\u103A > \u103A\u1037;"
              + "\u1036 ($umedial*) ($vowelsign+) > $1 $2 \u1036 ;"
              + "([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) > $2 $1;"
              + "\u103C ($consonant) > $1 \u103C;"

              // #### Stage 3
              + "::Null;"
              + "([\u1031]+) $ukinzi ($consonant) > $ukinzi $2 $1;"
              + "([\u1031]+) ($consonant) ($umedial+) > $2 $3 $1;"
              + "([\u1031]+) ($consonant) } [^\u103B\u103C\u103D\u103E] > $2 $1;"
              + "\u103C \u103A \u1039 ($consonant) > \u103A \u1039 $1 \u103C;"
              + "\u1036 ($umedial+) > $1 \u1036;"
              // #### Stage 4
              + "::Null;"
              + "([\u103C\u103D\u103E]+) \u103B > \u103B $1;"
              + "([\u103D\u103E]+) \u103C > \u103C $1;"
              + "\u103E\u103D > \u103D\u103E ;"
              + "([\u1031]+) ($vowelsign*) \u1039 ($consonant) > \u1039 $3 $1 $2;"
              + "($vowelsign+) \u1039 ($consonant) > \u1039 $2 $1;"
              + "($umedial*) ([\u1031]+) ($umedial*) > $1 $3 $2;"
              + "\u1037 ([\u102D-\u1030\u1032\u1036]) > $1 \u1037;"
              + "\u1037 ($umedial+) > $1 \u1037;"
              + "($vowelsign+) ($umedial+) > $2 $1;"
              + "($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant)> $1 \u103A $2 $3;"
              // #### Stage 5.  More reorderings
              + "::Null;"
              + "([\u1031]+) ($umedial+) > $2 $1;"
              + "($vowelsign) ($umedial) > $2 $1;"
              + "([\u103C\u103D\u103E]) \u103B > \u103B $1;"
              + "([\u103D\u103E]) \u103C > \u103C $1;"
              + "\u103E\u103D > \u103D\u103E ;"
              + "\u1038 ([$vowelmedial]) > $1 \u1038;"
              + "\u1038 ([\u1036\u1037\u103A]) > $1 \u1038;"
              // ### Stage 6
              + "::Null;"
              + "($consonant) \u103B \u103A > $1 \u103A \u103B;"
              + "([\u103C\u103D\u103E]) \u103B > \u103B $1;"
              + "([\u103D\u103E]) \u103C > \u103C $1;"
              + "\u103E\u103D > \u103D\u103E ;"
              + "([\u102D-\u1030\u1032]) \u103A ($consonant) \u103A > $1 $2 \u103A;"
              + "\u102F \u103A > \u102F;"
              + "\u102D \u102E > \u102E;"
              + "\u102F \u1030 > \u102F;"
              + "\u102B [\u102B]+ > \u102B;"
              + "\u102C [\u102C]+ > \u102C;"
              + "\u102D [\u102D]+ > \u102D;"
              + "\u102E [\u102E]+ > \u102E;"
              + "\u102F [\u102F]+ > \u102F;"
              + "\u1030 [\u1030]+ > \u1030;"
              + "\u1031 [\u1031]+ > \u1031;"
              + "\u1032 [\u1032]+ > \u1032;"
              + "\u103A [\u103A]+ > \u103A;"
              + "\u103B [\u103B]+ > \u103B;"
              + "\u103C [\u103C]+ > \u103C;"
              + "\u103D [\u103D]+ > \u103D;"
              + "\u103E [\u103E]+ > \u103E;"
              // Try to correctly render diacritics after a space.
              + "$space([\u102e\u1037\u103a]) > \u00A0 $1 ;",
          Transliterator.FORWARD);

  // TODO(ccorn): set a filter on this to restrict to range \u1000-\u109f ???

  /**
   * Detects Zawgyi encoding in specified input.
   *
   * @param value the string to be tested; must not be {@code null}
   * @return {@code true} if any of the Zawgyi indicator sequences occurs in the text,
   *     {@code false} otherwise (the text is then treated as Unicode)
   */
  public static Boolean isZawgyiEncoded(String value) {
    Matcher matcher = ZAWGYI_DETECT_PATTERN.matcher(value);
    return matcher.find();
  }

  /**
   * Converts Zawgyi-encoded string into Unicode equivalent.
   *
   * <p>No detection is performed; the transliteration rules are applied unconditionally.
   *
   * @param value the Zawgyi string to be converted
   * @return the Unicode string from conversion
   */
  public static String convertZawgyiToUnicode(String value) {
    return zawgyiUnicodeTransliterator.transform(value);
  }

  /**
   * Normalizes Burmese characters in specified input, detecting and converting Zawgyi encoding to
   * Unicode form.
   *
   * @param value the string to be normalized
   * @return the converted string if the input was detected as Zawgyi, otherwise the input
   *     unchanged
   */
  public static String standardizeMyanmar(String value) {
    if (isZawgyiEncoded(value)) {
      // Call the converter to produce a Unicode result.
      return convertZawgyiToUnicode(value);
    }
    return value; // Unchanged since it was not Zawgyi.
  }
}
Exemple #17
0
/** Utility to generate the Tansliteration resource bundle files. */
public class ConvertTransforms extends CLDRConverterTool {

  // Indices into the options array below; each constant must equal the position of the
  // corresponding entry in that array.
  private static final int HELP1 = 0,
      HELP2 = 1,
      SOURCEDIR = 2,
      DESTDIR = 3,
      MATCH = 4,
      SKIP_COMMENTS = 5,
      WRITE_INDEX = 6,
      VERBOSE = 7;

  // Command-line options, listed in the order given by the index constants above.
  // Source and destination directories and the match regex carry defaults; the rest are flags.
  private static final UOption[] options = {
    UOption.HELP_H(),
    UOption.HELP_QUESTION_MARK(),
    UOption.SOURCEDIR().setDefault(CLDRPaths.COMMON_DIRECTORY + "transforms/"),
    UOption.DESTDIR().setDefault(CLDRPaths.GEN_DIRECTORY + "icu-transforms/"),
    UOption.create("match", 'm', UOption.REQUIRES_ARG).setDefault(".*"),
    UOption.create("commentSkip", 'c', UOption.NO_ARG),
    UOption.create("writeIndex", 'x', UOption.NO_ARG),
    UOption.VERBOSE(),
  };

  /**
   * Usage message printed for -h / -?.
   *
   * <p>The directory defaults shown here are built from the same expressions that are passed to
   * {@code setDefault} in {@code options}, so the message matches what the tool actually uses.
   */
  static final String HELP_TEXT1 =
      "Use the following options"
          + XPathParts.NEWLINE
          + "-h or -?\t for this message"
          + XPathParts.NEWLINE
          + "-"
          + options[SOURCEDIR].shortName
          + "\t source directory. Default = -s"
          + CldrUtility.getCanonicalName(CLDRPaths.COMMON_DIRECTORY + "transforms/")
          + XPathParts.NEWLINE
          + "\tExample:-sC:\\Unicode-CVS2\\cldr\\common\\gen\\source\\"
          + XPathParts.NEWLINE
          + "-"
          + options[DESTDIR].shortName
          + "\t destination directory. Default = -d"
          + CldrUtility.getCanonicalName(CLDRPaths.GEN_DIRECTORY + "icu-transforms/")
          + XPathParts.NEWLINE
          + "-m<regex>\t to restrict the files to what matches <regex>"
          + XPathParts.NEWLINE
          + "-"
          + options[SKIP_COMMENTS].shortName
          + "\t to leave comments out of the generated files"
          + XPathParts.NEWLINE
          + "-"
          + options[VERBOSE].shortName
          + "\t to print each transform ID and file name while converting"
          + XPathParts.NEWLINE
      // "--writeIndex / -x   to write the index (trnsfiles.mk)"+ XPathParts.NEWLINE
      ;

  // TODO add options to set input and output directories, matching pattern
  /**
   * Command-line entry point: creates a converter and lets it process the arguments.
   *
   * @param args command-line arguments, parsed by {@link #processArgs(String[])}
   * @throws Exception declared for callers; anything thrown by the converter propagates
   */
  public static void main(String[] args) throws Exception {
    new ConvertTransforms().processArgs(args);
  }

  // Set from the commentSkip option; when true, convertFile does not copy /comment values.
  private boolean skipComments;
  // Set from the writeIndex option; processArgs currently rejects it as not implemented.
  private boolean writeIndex = false;
  // Set from the verbose option; when true, addIndexInfo echoes each ID and file name to stdout.
  private boolean verbose = false;

  // Incremented by addIndexInfo for every transform element it handles; reported by processArgs.
  int fileCount = 0;

  /**
   * Converts every matching transform in {@code inputDirectory} and writes the index file
   * {@code root.txt} into {@code outputDirectory}.
   *
   * <p>The canonical input path is echoed to stdout first. The index receives a header, two fixed
   * Pinyin aliases, one entry per converted file (the id {@code "All"} is skipped) and a fixed
   * trailer.
   *
   * @param inputDirectory directory handed to {@code Factory.make}
   * @param matchingPattern pattern handed to {@code Factory.make} to select files
   * @param outputDirectory directory prefix for the index and for each converted file
   * @throws IOException if resolving the input path or converting a file fails; the failing id is
   *     printed to stderr before the exception is rethrown
   */
  public void writeTransforms(String inputDirectory, String matchingPattern, String outputDirectory)
      throws IOException {
    final String indexName = "root.txt";

    final File sourceDir = new File(inputDirectory);
    System.out.println(sourceDir.getCanonicalPath());

    final Factory transformFactory = Factory.make(inputDirectory, matchingPattern);
    final Set<String> available = transformFactory.getAvailable();

    final PrintWriter index = BagFormatter.openUTF8Writer(outputDirectory, indexName);
    doHeader(index, "//", indexName);
    try {
      index.println("root {");
      index.println("    RuleBasedTransliteratorIDs {");
      // addAlias(index, "Latin", "el", "", "Latin", "Greek", "UNGEGN");
      // addAlias(index, "el", "Latin", "", "Greek", "Latin", "UNGEGN");
      // addAlias(index, "Latin", "Jamo", "", "Latin", "ConjoiningJamo", "");
      addAlias(index, "Tone", "Digit", "", "Pinyin", "NumericPinyin", "");
      addAlias(index, "Digit", "Tone", "", "NumericPinyin", "Pinyin", "");
      // addAlias(index, "Simplified", "Traditional", "", "Hans", "Hant", "");
      // addAlias(index, "Traditional", "Simplified", "", "Hant", "Hans", "");

      for (final String id : available) {
        if (!id.equals("All")) {
          try {
            convertFile(transformFactory, id, outputDirectory, index);
          } catch (IOException failure) {
            System.err.println("Failure in: " + id);
            throw failure;
          }
        }
      }

      // Fixed tail of the index: closes the ID table and appends the display-name resources.
      final String[] trailer = {
        "    }",
        "    TransliteratorNamePattern {",
        "        // Format for the display name of a Transliterator.",
        "        // This is the language-neutral form of this resource.",
        "        \"{0,choice,0#|1#{1}|2#{1}-{2}}\" // Display name",
        "    }",
        "    // Transliterator display names",
        "    // This is the English form of this resource.",
        "    \"%Translit%Hex\"         { \"%Translit%Hex\" }",
        "    \"%Translit%UnicodeName\" { \"%Translit%UnicodeName\" }",
        "    \"%Translit%UnicodeChar\" { \"%Translit%UnicodeChar\" }",
        "    TransliterateLATIN{        ",
        "    \"\",",
        "    \"\"",
        "    }",
        "}",
      };
      for (final String line : trailer) {
        index.println(line);
      }
    } finally {
      index.close();
    }
  }

  /**
   * Wraps an in-memory byte stream in a buffered {@link PrintWriter} that encodes as UTF-8.
   *
   * <p>The charset is passed as a {@code Charset} object, so no checked encoding exception can
   * occur and no catch-all handler is needed. Text only reaches {@code bytes} after the returned
   * writer is flushed or closed.
   *
   * @param bytes the destination stream
   * @return a writer over {@code bytes}, or {@code null} (after printing an error to stderr) when
   *     {@code bytes} is {@code null}, which is what earlier versions did for that input
   */
  public static PrintWriter makePrintWriter(ByteArrayOutputStream bytes) {
    if (bytes == null) {
      // Kept for backward compatibility: callers previously received null here, not an exception.
      System.err.println("Error: Could not create OutputStreamWriter.");
      return null;
    }
    OutputStreamWriter utf8Writer =
        new OutputStreamWriter(bytes, java.nio.charset.Charset.forName("UTF-8"));
    return new PrintWriter(new BufferedWriter(utf8Writer, 4 * 1024));
  }

  /**
   * Writes a comment value to the output, one line at a time, as {@code #} comments.
   *
   * <p>The value is trimmed and split on CR, CRLF or LF. Lines that already start with
   * {@code #} are written as they are; every other line is prefixed with {@code "# "}.
   *
   * @param out destination writer
   * @param value raw comment text, possibly spanning several lines
   */
  private void showComments(PrintWriter out, String value) {
    for (String rawLine : value.trim().split("\\r\\n?|\\n")) {
      out.println(rawLine.startsWith("#") ? rawLine : "# " + rawLine);
    }
  }

  /**
   * Converts one CLDR transform file into a rule file and records it in the index.
   *
   * <p>Paths containing {@code /version} or {@code /generation} are skipped. The first remaining
   * path is used to register the transform via {@code addIndexInfo} and to open the output file;
   * after that, {@code /comment} values are copied as comments (unless comments are being
   * skipped) and {@code /tRule} values are written after passing through {@code fixup}.
   *
   * <p>The output writer is closed in a {@code finally} block, so it is released even when an
   * unknown element aborts the conversion. A file with no usable paths produces no output
   * and no longer ends in a {@code NullPointerException}.
   *
   * @param cldrFactory factory used to load the file identified by {@code id}
   * @param id identifier of the file to convert
   * @param outputDirectory directory prefix for the generated file
   * @param index writer for the index; receives the entry for this transform
   * @throws IOException if the output file cannot be opened
   * @throws IllegalArgumentException if a path is neither a comment nor a rule
   */
  private void convertFile(
      Factory cldrFactory, String id, String outputDirectory, PrintWriter index)
      throws IOException {
    PrintWriter output = null;
    CLDRFile cldrFile = cldrFactory.make(id, false);
    boolean first = true;
    try {
      for (Iterator<String> it = cldrFile.iterator("", cldrFile.getComparator());
          it.hasNext(); ) {
        String path = it.next();
        if (path.indexOf("/version") >= 0 || path.indexOf("/generation") >= 0) {
          continue;
        }
        String value = cldrFile.getStringValue(path);
        if (first) {
          // The first real path carries the transform attributes needed for the index entry.
          String fullPath = cldrFile.getFullXPath(path);
          String filename = addIndexInfo(index, fullPath, id);
          if (filename == null) {
            return; // not a transform file!
          }
          output = BagFormatter.openUTF8Writer(outputDirectory, filename);
          doHeader(output, "#", filename);
          first = false;
        }
        if (path.indexOf("/comment") >= 0) {
          if (!skipComments) {
            showComments(output, value);
          }
        } else if (path.indexOf("/tRule") >= 0) {
          output.println(fixup.transliterate(value));
        } else {
          throw new IllegalArgumentException("Unknown element: " + path + "\t " + value);
        }
      }
    } finally {
      // output stays null when nothing was opened (no usable paths, or not a transform file).
      if (output != null) {
        output.close();
      }
    }
  }

  // Applied by convertFile to every tRule value before it is written out.
  // NOTE(review): the ID reads as an Any-Hex/Java conversion restricted by a [:Mn:] filter, i.e.
  // it presumably rewrites nonspacing marks as Java-style hex escapes; confirm against ICU docs.
  public static final Transliterator fixup = Transliterator.getInstance("[:Mn:]any-hex/java");

  /**
   * Replaces every occurrence of {@code toReplace} with {@code replacement}, but only in the parts
   * of {@code value} that lie outside single-quoted literals.
   *
   * <p>A single quote toggles between quoted and unquoted text. A backslash makes the following
   * UTF-16 code unit inert, so an escaped quote or backslash does not toggle or escape anything.
   * Quoted segments, including their quotes, are copied verbatim. If the string ends while a
   * quote is still open, the remainder is copied verbatim as well.
   *
   * @param value the text to process
   * @param toReplace the substring to look for in unquoted text
   * @param replacement the text substituted for each occurrence
   * @return {@code value} itself when it does not contain {@code toReplace} at all; otherwise the
   *     rebuilt string
   */
  public static String replaceUnquoted(String value, String toReplace, String replacement) {
    // Fast path: the search string does not occur anywhere, quoted or not.
    if (!value.contains(toReplace)) {
      return value;
    }

    final int end = value.length();
    final StringBuilder result = new StringBuilder(end);
    int start = 0; // first index not yet copied into result
    boolean quoted = false; // inside a '...' literal
    boolean escaped = false; // previous code unit was an active backslash

    for (int i = 0; i < end; i++) {
      // An escaped code unit is read as NUL so that it can be neither a quote nor a backslash.
      final char c = escaped ? (char) 0 : value.charAt(i);
      escaped = (c == '\\');

      final boolean atEnd = (i == end - 1);
      if (c != '\'' && !atEnd) {
        continue; // ordinary character: stays part of the current segment
      }

      if (quoted) {
        // Closing quote (or end of input inside a literal): copy the segment untouched.
        result.append(value, start, i + 1);
        start = i + 1;
      } else {
        // Opening quote ends the unquoted segment before it; at end of input the final
        // character belongs to the segment too.
        final int stop = atEnd ? i + 1 : i;
        if (stop > start) {
          result.append(value.substring(start, stop).replace(toReplace, replacement));
          start = stop;
        }
      }
      quoted = !quoted;
    }
    return result.toString();
  }

  // Scratch XPath parser reused by addIndexInfo, which calls set(path) on it for each file.
  // It is static and mutable, so all instances share the same object.
  static XPathParts parts = new XPathParts();

  /**
   * Writes the index entries for one transform and returns the name of its rule file.
   *
   * <p>The attributes of the {@code transform} element in {@code path} give the source, target,
   * optional variant, direction, aliases and visibility. A forward entry is written for
   * direction {@code both} or {@code forward}, a reverse entry for {@code both} or
   * {@code backward}; both refer to the same file. The entry status is {@code internal} when the
   * visibility attribute says so, otherwise {@code file}. {@code fileCount} is incremented once.
   *
   * @param index writer for the index
   * @param path full XPath of an element of the transform file
   * @param transID id of the file being converted (not used here)
   * @return {@code source_target[_variant].txt}, or {@code null} if {@code path} has no
   *     {@code transform} element
   */
  private String addIndexInfo(PrintWriter index, String path, String transID) {
    parts.set(path);
    final Map<String, String> attrs = parts.findAttributes("transform");
    if (attrs == null) {
      return null; // error, not a transform file
    }

    final String source = attrs.get("source");
    final String target = attrs.get("target");
    final String variant = attrs.get("variant");
    final String direction = attrs.get("direction");
    final String status = "internal".equals(attrs.get("visibility")) ? "internal" : "file";

    fileCount++;

    final String idSuffix = (variant == null) ? "" : "/" + variant;
    final String fileSuffix = (variant == null) ? "" : "_" + variant;
    final String forwardId = source + "-" + target + idSuffix;
    final String reverseId = target + "-" + source + idSuffix;
    final String filename = source + "_" + target + fileSuffix + ".txt";

    final boolean bothWays = direction.equals("both");
    if (bothWays || direction.equals("forward")) {
      writeIndexEntry(index, forwardId, attrs.get("alias"), filename, status, "FORWARD");
    }
    if (bothWays || direction.equals("backward")) {
      writeIndexEntry(index, reverseId, attrs.get("backwardAlias"), filename, status, "REVERSE");
    }
    index.println();
    return filename;
  }

  /** Writes the aliases (if any) and the index entry for one direction of a transform. */
  private void writeIndexEntry(
      PrintWriter index,
      String entryId,
      String aliases,
      String filename,
      String status,
      String directionLabel) {
    if (verbose) {
      System.out.println("    " + entryId + "    " + filename + "    " + directionLabel);
    }
    if (aliases != null) {
      // The attribute holds a whitespace-separated list of alias ids.
      for (String aliasId : aliases.trim().split("\\s+")) {
        addAlias(index, aliasId, entryId);
      }
    }
    index.println("        " + entryId + " {");
    index.println("            " + status + " {");
    index.println("                resource:process(transliterator) {\"" + filename + "\"}");
    index.println("                direction {\"" + directionLabel + "\"}");
    index.println("            }");
    index.println("        }");
  }

  /**
   * Writes an alias entry whose alias and original ids are each built from a
   * source/target/variant triple via {@code getName}.
   *
   * @param index writer for the index
   * @param aliasSource source part of the alias id
   * @param aliasTarget target part of the alias id
   * @param aliasVariant variant part of the alias id; may be empty
   * @param originalSource source part of the id being aliased
   * @param originalTarget target part of the id being aliased
   * @param originalVariant variant part of the id being aliased; may be empty
   */
  void addAlias(
      PrintWriter index,
      String aliasSource,
      String aliasTarget,
      String aliasVariant,
      String originalSource,
      String originalTarget,
      String originalVariant) {
    final String aliasId = getName(aliasSource, aliasTarget, aliasVariant);
    final String originalId = getName(originalSource, originalTarget, originalVariant);
    addAlias(index, aliasId, originalId);
  }

  /**
   * Writes a three-line alias block to the index: the alias id, an {@code alias} entry naming the
   * original id in double quotes, and the closing brace.
   *
   * @param index writer for the index
   * @param alias id under which the transform is additionally reachable
   * @param original id the alias refers to
   */
  private void addAlias(PrintWriter index, String alias, String original) {
    final String indent = "        ";
    index.println(indent + alias + " {");
    index.println("            alias {\"" + original + "\"}");
    index.println(indent + "}");
  }

  /**
   * Builds a transform id of the form {@code source-target} or {@code source-target/variant}.
   *
   * @param source source part of the id
   * @param target target part of the id
   * @param variant variant part; {@code null} or empty means no variant suffix
   * @return the assembled id
   */
  String getName(String source, String target, String variant) {
    final String base = source + "-" + target;
    if (variant == null || variant.isEmpty()) {
      return base;
    }
    return base + "/" + variant;
  }

  /**
   * Writes the standard file header: a byte-order mark, the copyright banner and the file name.
   *
   * <p>The year in the banner is formatted with {@code Locale.ROOT} so that it does not vary with
   * the default locale of the machine running the tool (which may select a different calendar or
   * digit set).
   *
   * @param output writer for the file being generated
   * @param quoteSymbol comment prefix placed at the start of every header line
   * @param filename name reported in the {@code File:} line
   */
  private void doHeader(PrintWriter output, String quoteSymbol, String filename) {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy", java.util.Locale.ROOT);
    String banner =
        quoteSymbol
            + " ***************************************************************************";
    // U+FEFF first, so the file starts with a byte-order mark.
    output.print('\uFEFF');
    output.println(banner);
    output.println(quoteSymbol + " *");
    output.println(
        quoteSymbol
            + " *  Copyright (C) 2004-"
            + sdf.format(new Date())
            + ", International Business Machines");
    output.println(
        quoteSymbol + " *  Corporation; Unicode, Inc.; and others.  All Rights Reserved.");
    output.println(quoteSymbol + " *");
    output.println(banner);
    output.println(quoteSymbol + " File: " + filename);
    output.println(quoteSymbol + " Generated from CLDR ");
    output.println(quoteSymbol + "");
  }

  /**
   * Parses the command line and runs the conversion.
   *
   * <p>With a help option the usage text is printed and nothing else happens. Otherwise the
   * source directory, destination directory, match pattern and flags are read from the parsed
   * options and {@code writeTransforms} is run; the number of files and the elapsed time are
   * printed afterwards. {@code DONE} is printed at the end whether or not the run succeeded.
   *
   * @param args command-line arguments
   * @throws InternalError if the writeIndex option is given (not implemented)
   * @throws RuntimeException wrapping the {@code IOException} if the conversion fails; the
   *     original exception is preserved as the cause
   */
  public void processArgs(String[] args) {
    UOption.parseArgs(args, options);
    if (options[HELP1].doesOccur || options[HELP2].doesOccur) {
      System.out.println(HELP_TEXT1);
      return;
    }

    String sourceDir = options[SOURCEDIR].value; // Utility.COMMON_DIRECTORY + "transforms/";
    String targetDir = options[DESTDIR].value; // Utility.GEN_DIRECTORY + "main/";
    String match = options[MATCH].value;
    skipComments = options[SKIP_COMMENTS].doesOccur;
    writeIndex = options[WRITE_INDEX].doesOccur;
    verbose = options[VERBOSE].doesOccur;

    try {
      if (writeIndex) {
        throw new InternalError("writeIndex not implemented.");
      } else {
        ElapsedTimer et = new ElapsedTimer();
        writeTransforms(sourceDir, match, targetDir + File.separator);
        System.out.println("ConvertTransforms: wrote " + fileCount + " files in " + et);
      }
    } catch (IOException ex) {
      // Chain the IOException itself: its own cause is usually null, which used to leave the
      // rethrown exception without any message or stack trace pointing at the failure.
      throw new RuntimeException(
          "Failed to convert transforms from " + sourceDir + " to " + targetDir, ex);
    } finally {
      System.out.println("DONE");
    }
  }

  // fixData ONLY NEEDED TO FIX FILE PROBLEM
  /*
   * private void fixData(String inputDirectory, String matchingPattern, String outputDirectory) throws IOException {
   * File dir = new File(inputDirectory);
   * File[] files = dir.listFiles();
   * for (int i = 0; i < files.length; ++i) {
   * if (files[i].isDirectory()) continue;
   * BufferedReader input = BagFormatter.openUTF8Reader("", files[i].getCanonicalPath());
   * PrintWriter output = BagFormatter.openUTF8Writer("", outputDirectory + files[i].getName());
   * while (true) {
   * String line = input.readLine();
   * if (line == null) break;
   * if (line.indexOf("DOCTYPE") >= 0) {
   * line = line.replaceAll(" ldml ", " supplementalData ");
   * }
   * output.println(line);
   * }
   * input.close();
   * output.close();
   * }
   * }
   */

}