Пример #1
0
/**
 * Class for detecting and converting Zawgyi-encoded data.
 *
 * <p>Detection is a single regular expression made of many alternatives, each describing a
 * character or character sequence that is not expected in well-formed Unicode Burmese. Conversion
 * is a rule-based transliterator that first remaps Zawgyi code points and then reorders the result
 * in several passes.
 */
public class MyanmarZawgyiConverter {
  // For detecting if Myanmar text is encoded with Zawgyi vs. Unicode characters.

  /**
   * Matches (via {@code find()}) any text that contains at least one sequence considered to be
   * evidence of Zawgyi encoding. The alternatives are independent; a single hit is enough.
   */
  private static final Pattern ZAWGYI_DETECT_PATTERN =
      PatternCache.get(
          // A regular expression matched if text is Zawgyi encoding.
          // Using the ranges 1033-1034 or 1060-1097 will report Shan, Karen,
          // etc. as Zawgyi.
          "[\u105a\u1060-\u1097]|" // Zawgyi characters outside Unicode range
              + "[\u1033\u1034]|" // These are Mon characters
              + "\u1031\u108f|"
              + "\u1031[\u103b-\u103e]|" // Medial right after \u1031
              + "[\u102b-\u1030\u1032]\u1031|" // Vowel sign right before \u1031
              + " \u1031| \u103b|" // Unexpected characters after a space
              + "^\u1031|^\u103b|\u1038\u103b|\u1038\u1031|"
              + "[\u102d\u102e\u1032]\u103b|\u1039[^\u1000-\u1021]|\u1039$"
              + "|\u1004\u1039[\u1001-\u102a\u103f\u104e]" // Missing ASAT in Kinzi
              // The range must start at the escaped code point U+1000, not at the
              // literal characters 'u', '1', '0', '0', '0'.
              + "|\u1039[^\u1000-\u102a\u103f\u104e]" // 1039 not before a consonant
              // Out of order medials
              + "|\u103c\u103b|\u103d\u103b"
              + "|\u103e\u103b|\u103d\u103c"
              + "|\u103e\u103c|\u103e\u103d"
              // Bad medial combos
              + "|\u103b\u103c"
              // Out of order vowel signs
              + "|[\u102f\u1030\u102b\u102c][\u102d\u102e\u1032]"
              + "|[\u102b\u102c][\u102f\u102c]"
              // Digit before diacritic
              + "|[\u1040-\u1049][\u102b-\u103e\u102b-\u1030\u1032\u1036\u1037\u1038\u103a]"
              // Single digit 0, 7 at start
              + "|^[\u1040\u1047][^\u1040-\u1049]"
              // Second 1039 with bad followers
              + "|[\u1000-\u102a\u103f\u104e]\u1039[\u101a\u101b\u101d\u101f\u1022-\u103f]"
              // Other bad combos.
              + "|\u103a\u103e"
              // No trailing bracket here: an unmatched ']' is a literal in Java regex
              // and would make this alternative require a real ']' in the text.
              + "|\u1036\u102b"
              // multiple upper vowels
              + "|\u102d[\u102e\u1032]|\u102e[\u102d\u1032]|\u1032[\u102d\u102e]"
              // Multiple lower vowels
              + "|\u102f\u1030|\u1030\u102f"
              // Multiple A vowels
              + "|\u102b\u102c|\u102c\u102b"
              // Shan digits with vowels or medials or other signs
              + "|[\u1090-\u1099][\u102b-\u1030\u1032\u1037\u103a-\u103e]"
              // Isolated Shan digit
              + "|[\u1000-\u10f4][\u1090-\u1099][\u1000-\u104f]"
              + "|^[\u1090-\u1099][\u1000-\u102a\u103f\u104e\u104a\u104b]"
              + "|[\u1000-\u104f][\u1090-\u1099]$"
              // Diacritics with non-Burmese vowel signs
              + "|[\u105e-\u1060\u1062-\u1064\u1067-\u106d\u1071-\u1074\u1082-\u108d"
              + "\u108f\u109a-\u109d]"
              + "[\u102b-\u103e]"
              // Consonant 103a + some vowel signs
              + "|[\u1000-\u102a]\u103a[\u102d\u102e\u1032]"
              // 1031 after other vowel signs
              + "|[\u102b-\u1030\u1032\u1036-\u1038\u103a]\u1031"
              // Using Shan combining characters with other languages.
              + "|[\u1087-\u108d][\u106e-\u1070\u1072-\u1074]"
              // Non-Burmese diacritics at start, following space, or following sections
              + "|^[\u105e-\u1060\u1062-\u1064\u1067-\u106d\u1071-\u1074"
              + "\u1082-\u108d\u108f\u109a-\u109d]"
              + "|[\u0020\u104a\u104b][\u105e-\u1060\u1062-\u1064\u1067-\u106d"
              + "\u1071-\u1074\u1082-\u108d\u108f\u109a-\u109d]"
              // Wrong order with 1036
              + "|[\u1036\u103a][\u102d-\u1030\u1032]"
              // Odd stacking
              + "|[\u1025\u100a]\u1039"
              // More mixing of non-Burmese languages
              + "|[\u108e-\u108f][\u1050-\u108d]"
              // Bad diacritic combos: a vowel sign or dot directly followed by 1039.
              // This must be a bracketed character class; without the opening '[' the
              // alternative only matches one literal nine-character sequence.
              + "|[\u102d-\u1030\u1032\u1036-\u1037]\u1039"
              // Dot before subscripted consonant
              + "|[\u1000-\u102a\u103f\u104e]\u1037\u1039"
              // Odd subscript + vowel signs
              + "|[\u1000-\u102a\u103f\u104e]\u102c\u1039[\u1000-\u102a\u103f\u104e]"
              // Medials after vowels
              + "|[\u102b-\u1030\u1032][\u103b-\u103e]"
              // Medials
              + "|\u1032[\u103b-\u103e]"
              // Medial with 101b
              + "|\u101b\u103c"
              // Stacking too deeply: consonant 1039 consonant 1039 consonant
              + "|[\u1000-\u102a\u103f\u104e]\u1039[\u1000-\u102a\u103f\u104e]\u1039"
              + "[\u1000-\u102a\u103f\u104e]"
              // Stacking pattern consonant 1039 consonant 103a other vowel signs
              + "|[\u1000-\u102a\u103f\u104e]\u1039[\u1000-\u102a\u103f\u104e]"
              + "[\u102b\u1032\u103d]"
              // Odd stacking over u1021, u1019, and u1000
              + "|[\u1000\u1005\u100f\u1010\u1012\u1014\u1015\u1019\u101a]\u1039\u1021"
              + "|[\u1000\u1010]\u1039\u1019"
              + "|\u1004\u1039\u1000"
              + "|\u1015\u1039[\u101a\u101e]"
              + "|\u1000\u1039\u1001\u1036"
              + "|\u1039\u1011\u1032"
              // Vowel sign in wrong order
              + "|\u1037\u1032"
              + "|\u1036\u103b"
              // Duplicated vowel
              + "|\u102f\u102f");

  // Transliteration to convert Burmese text in Zawgyi-encoded string to
  // standard Unicode codepoints and ordering.
  // The rules run as a sequence of passes separated by "::Null;": stage 1 remaps
  // code points, the later stages reorder and de-duplicate the remapped text.
  static final Transform<String, String> zawgyiUnicodeTransliterator =
      // Transliteration rules, 07-Jan-2014.
      Transliterator.createFromRules(
          "zawgyi-unicode",
          // Modern Burmese digits & Unicode code points.
          "$nondigits = [^\u1040-\u1049];"
              + "$space = ' ';"
              + "$consonant = [\u1000-\u1021];"
              + "$vowelsign = [\u102B-\u1030\u1032];"
              + "$umedial = [\u103B-\u103E];"
              + "$vowelmedial = [\u102B-\u1030\u1032\u103B-\u103F];"
              + "$ukinzi = \u1004\u103A\u1039;"
              + "$zmedialra = [\u103B\u107E-\u1084];"
              // #### STAGE (1): CODEPOINT MAPPING FROM ZAWGYI TO UNICODE
              + "($consonant) \u103A \u1064 > $ukinzi $1 \u103B;"
              + "($consonant) \u1064 > $ukinzi $1;"
              + "\u1064 > $ukinzi;"
              + "($consonant) \u108b > $ukinzi $1 \u102D;"
              + "($consonant) \u108C > $ukinzi $1 \u102E;"
              + "($consonant) \u108D > $ukinzi $1 \u1036;"
              + "($consonant) \u103A \u1033 \u108B > $ukinzi $1 \u103B \u102D \u102F;"
              + "($consonant) \u103A \u108b > $ukinzi $1 \u103B \u102D ;"
              + "($consonant) \u103A \u108C \u1033 > $ukinzi $1 \u103B \u102E \u102F;"
              + "($consonant) \u103A \u108C > $ukinzi $1 \u103B \u102E ;"
              + "($consonant) \u103A \u108D > $ukinzi $1 \u103B \u1036 ;"
              + "($consonant) \u103A \u108e > $1 \u103B \u102D \u1036 ;"
              + "\u108B > $ukinzi \u102D ;"
              + "\u108C > $ukinzi \u102E ;"
              + "\u108D > $ukinzi \u1036 ;"
              + "\u106A ($vowelsign) \u1038 > \u1025 $1 \u1038 ;"
              + "\u106A > \u1009 ;"
              + "\u106B > \u100A ;"
              + "\u108F > \u1014 ;"
              + "\u1090 > \u101B ;"
              + "\u1086 > \u103F ;"
              + "\u103A > \u103B ;"
              + "\u107D > \u103B ;"
              + "\u103C \u108A > \u103D \u103E;"
              + "\u103C > \u103D ;"
              + "\u108A > \u103D \u103E ;"
              + "\u103D > \u103E ;"
              + "\u1087 > \u103E ;"
              + "\u1088 > \u103E \u102F ;"
              + "\u1089 > \u103E \u1030 ;"
              + "\u1039 > \u103A ;"
              + "\u1033 > \u102F ;"
              + "\u1034 > \u1030 ;"
              + "\u105A > \u102B \u103A ;"
              + "\u108E > \u102D \u1036 ;"
              + "\u1031 \u1094 ($consonant) \u103D > $1 \u103E \u1031 \u1037 ;"
              + "\u1094 > \u1037 ;"
              + "\u1095 > \u1037 ;"
              + "\u1025 \u1061 > \u1009 \u1039 \u1001;"
              + "\u1025 \u1062 > \u1009 \u1039 \u1002;"
              + "\u1025 \u1065 > \u1009 \u1039 \u1005;"
              + "\u1025 \u1068 > \u1009 \u1039 \u1007;"
              + "\u1025 \u1076 > \u1009 \u1039 \u1013;"
              + "\u1025 \u1078 > \u1009 \u1039 \u1015;"
              + "\u1025 \u107A > \u1009 \u1039 \u1017;"
              + "\u1025 \u1079 > \u1009 \u1039 \u1016;"
              + "\u1060 > \u1039 \u1000 ;"
              + "\u1061 > \u1039 \u1001 ;"
              + "\u1062 > \u1039 \u1002 ;"
              + "\u1063 > \u1039 \u1003 ;"
              + "\u1065 > \u1039 \u1005 ;"
              + "\u1066 > \u1039 \u1006 ;"
              + "\u1067 > \u1039 \u1006 ;"
              + "\u1068 > \u1039 \u1007 ;"
              + "\u1069 > \u1039 \u1008 ;"
              + "\u106C > \u1039 \u100B ;"
              + "\u106D > \u1039 \u100C ;"
              + "\u1070 > \u1039 \u100F ;"
              + "\u1071 > \u1039 \u1010 ;"
              + "\u1072 > \u1039 \u1010 ;"
              + "\u1096 > \u1039 \u1010 \u103D;"
              + "\u1073 > \u1039 \u1011 ;"
              + "\u1074 > \u1039 \u1011 ;"
              + "\u1075 > \u1039 \u1012 ;"
              + "\u1076 > \u1039 \u1013 ;"
              + "\u1077 > \u1039 \u1014 ;"
              + "\u1078 > \u1039 \u1015 ;"
              + "\u1079 > \u1039 \u1016 ;"
              + "\u107A > \u1039 \u1017 ;"
              + "\u107B > \u1039 \u1018 ;"
              + "\u1093 > \u1039 \u1018 ;"
              + "\u107C > \u1039 \u1019 ;"
              + "\u1085 > \u1039 \u101C ;"
              + "\u106E > \u100D\u1039\u100D ;"
              + "\u106F > \u100D\u1039\u100E ;"
              + "\u1091 > \u100F\u1039\u100D ;"
              + "\u1092 > \u100B\u1039\u100C ;"
              + "\u1097 > \u100B\u1039\u100B ;"
              + "\u104E > \u104E\u1004\u103A\u1038 ;"
              // #### STAGE (2): POST REORDERING RULES FOR UNICODE RENDERING
              + "::Null;"
              + "\u1044 \u103a > | \u104E \u103A ;"
              + "($nondigits) \u1040 ([\u102B-\u103F]) > $1 \u101D $2;"
              + "\u1031 \u1040 ($nondigits) > \u1031 \u101D $1;"
              + "\u1025 \u103A > \u1009 \u103A;"
              + "\u1025 \u102E > \u1026;"
              + "\u1037\u103A > \u103A\u1037;"
              + "\u1036 ($umedial*) ($vowelsign+) > $1 $2 \u1036 ;"
              + "([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) > $2 $1;"
              + "\u103C ($consonant) > $1 \u103C;"

              // #### Stage 3
              + "::Null;"
              + "([\u1031]+) $ukinzi ($consonant) > $ukinzi $2 $1;"
              + "([\u1031]+) ($consonant) ($umedial+) > $2 $3 $1;"
              + "([\u1031]+) ($consonant) } [^\u103B\u103C\u103D\u103E] > $2 $1;"
              + "\u103C \u103A \u1039 ($consonant) > \u103A \u1039 $1 \u103C;"
              + "\u1036 ($umedial+) > $1 \u1036;"
              // #### Stage 4
              + "::Null;"
              + "([\u103C\u103D\u103E]+) \u103B > \u103B $1;"
              + "([\u103D\u103E]+) \u103C > \u103C $1;"
              + "\u103E\u103D > \u103D\u103E ;"
              + "([\u1031]+) ($vowelsign*) \u1039 ($consonant) > \u1039 $3 $1 $2;"
              + "($vowelsign+) \u1039 ($consonant) > \u1039 $2 $1;"
              + "($umedial*) ([\u1031]+) ($umedial*) > $1 $3 $2;"
              + "\u1037 ([\u102D-\u1030\u1032\u1036]) > $1 \u1037;"
              + "\u1037 ($umedial+) > $1 \u1037;"
              + "($vowelsign+) ($umedial+) > $2 $1;"
              + "($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant)> $1 \u103A $2 $3;"
              // #### Stage 5.  More reorderings
              + "::Null;"
              + "([\u1031]+) ($umedial+) > $2 $1;"
              + "($vowelsign) ($umedial) > $2 $1;"
              + "([\u103C\u103D\u103E]) \u103B > \u103B $1;"
              + "([\u103D\u103E]) \u103C > \u103C $1;"
              + "\u103E\u103D > \u103D\u103E ;"
              + "\u1038 ([$vowelmedial]) > $1 \u1038;"
              + "\u1038 ([\u1036\u1037\u103A]) > $1 \u1038;"
              // ### Stage 6
              + "::Null;"
              + "($consonant) \u103B \u103A > $1 \u103A \u103B;"
              + "([\u103C\u103D\u103E]) \u103B > \u103B $1;"
              + "([\u103D\u103E]) \u103C > \u103C $1;"
              + "\u103E\u103D > \u103D\u103E ;"
              + "([\u102D-\u1030\u1032]) \u103A ($consonant) \u103A > $1 $2 \u103A;"
              + "\u102F \u103A > \u102F;"
              + "\u102D \u102E > \u102E;"
              + "\u102F \u1030 > \u102F;"
              + "\u102B [\u102B]+ > \u102B;"
              + "\u102C [\u102C]+ > \u102C;"
              + "\u102D [\u102D]+ > \u102D;"
              + "\u102E [\u102E]+ > \u102E;"
              + "\u102F [\u102F]+ > \u102F;"
              + "\u1030 [\u1030]+ > \u1030;"
              + "\u1031 [\u1031]+ > \u1031;"
              + "\u1032 [\u1032]+ > \u1032;"
              + "\u103A [\u103A]+ > \u103A;"
              + "\u103B [\u103B]+ > \u103B;"
              + "\u103C [\u103C]+ > \u103C;"
              + "\u103D [\u103D]+ > \u103D;"
              + "\u103E [\u103E]+ > \u103E;"
              // Try to correctly render diacritics after a space.
              + "$space([\u102e\u1037\u103a]) > \u00A0 $1 ;",
          Transliterator.FORWARD);

  // TODO(ccorn): set a filter on this to restrict to range \u1000-\u109f ???

  /**
   * Detects Zawgyi encoding in specified input.
   *
   * <p>The input is reported as Zawgyi as soon as any one of the detection alternatives is found
   * anywhere in it.
   *
   * @param value the string to be tested; must not be null
   * @return True if text is Zawgyi encoded. False if Unicode
   */
  public static Boolean isZawgyiEncoded(String value) {
    return ZAWGYI_DETECT_PATTERN.matcher(value).find();
  }

  /**
   * Converts Zawgyi-encoded string into Unicode equivalent.
   *
   * <p>The conversion is applied unconditionally; no check is made that the input really is
   * Zawgyi. Use {@link #standardizeMyanmar(String)} to convert only when Zawgyi is detected.
   *
   * @param value the Zawgyi string to be converted
   * @return the Unicode string from conversion
   */
  public static String convertZawgyiToUnicode(String value) {
    return zawgyiUnicodeTransliterator.transform(value);
  }

  /**
   * Normalizes Burmese characters in specified input, detecting and converting Zawgyi encoding to
   * Unicode form.
   *
   * @param value the string to be normalized
   * @return the converted string if Zawgyi was detected, otherwise {@code value} itself
   */
  public static String standardizeMyanmar(String value) {
    if (isZawgyiEncoded(value)) {
      // Call the converter to produce a Unicode result.
      return convertZawgyiToUnicode(value);
    }
    return value; // Unchanged since it was not Zawgyi.
  }
}
Пример #2
0
/**
 * Holder for the parsed contents of a Unicode locale extension (the subtags that follow
 * {@code -u-}): a set of attributes plus a sorted map from each two-letter key to its list of
 * type subtags.
 *
 * <p>An instance starts empty and is filled by {@link #parse(String)}. After parsing, the
 * collections are wrapped as unmodifiable, so an instance is meant to be parsed once.
 */
public class UExtension {
  static SupplementalDataInfo data =
      SupplementalDataInfo.getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY);

  // Subtag separator: hyphen or underscore.
  static Pattern SEP = PatternCache.get("[-_]");
  static Pattern SPACE = PatternCache.get("\\s");
  // Shape every subtag must have before it is classified as key, type or attribute.
  static Pattern ALPHANUM = PatternCache.get("[0-9A-Za-z]{2,8}");
  // Shape required of a "vt" type when validating: 4-6 hex digit code points.
  static Pattern CODEPOINTS =
      PatternCache.get("(10|[0-9A-Fa-f])?[0-9A-Fa-f]{4}(\\s(10|[0-9A-Fa-f])?[0-9A-Fa-f]{4})*");
  static Relation<String, String> validKeyTypes = data.getBcp47Keys();

  private boolean validating = false;
  private SortedMap<String, List<String>> keyTypes = new TreeMap<String, List<String>>();
  private Set<String> attributes = new TreeSet<String>();

  /**
   * Returns the keys seen so far, in sorted order.
   *
   * @return a view of the key set of the key-to-types map
   */
  public Set<String> getKeys() {
    return keyTypes.keySet();
  }

  /**
   * Returns the type subtags recorded for a key.
   *
   * @param key a (lower-cased) two-letter key
   * @return the types for {@code key}, or {@code null} if the key was not seen
   */
  public List<String> getTypes(String key) {
    return keyTypes.get(key);
  }

  /**
   * Returns the attributes, i.e. the subtags that appeared before the first key.
   *
   * @return the attribute set
   */
  public Set<String> getAttributes() {
    return attributes;
  }

  /**
   * @return whether {@link #parse(String)} checks keys and types against the known BCP 47 data
   */
  public boolean isValidating() {
    return validating;
  }

  /**
   * Turns validation on or off for subsequent parsing.
   *
   * @param validating true to reject unknown keys, unknown types and any attribute
   * @return this object, for chaining
   */
  public UExtension setValidating(boolean validating) {
    this.validating = validating;
    return this;
  }

  /**
   * Parses the subtags after the -u-
   *
   * <p>Subtags are separated by {@code -} or {@code _} and are lower-cased before use. Subtags
   * before the first two-letter subtag are attributes; each two-letter subtag starts a new key and
   * the longer subtags that follow it are that key's types. Every key needs at least one type, and
   * only the key {@code vt} may have more than one.
   *
   * <p>On success the attribute set, the key map and each type list are replaced by unmodifiable
   * wrappers, so a later call that needs to add entries fails with
   * {@link UnsupportedOperationException}.
   *
   * @param source the subtag sequence to parse
   * @return this object, for chaining
   * @throws IllegalArgumentException if a subtag is not 2-8 ASCII alphanumerics, a key is
   *     repeated, a key has an illegal number of types, or (when validating) an attribute is
   *     present or a key or type is not recognized
   */
  public UExtension parse(String source) {
    // the subtags that are up to the first two letter are attributes
    String key = null;
    List<String> list = null;
    Set<String> validSubtypes = null;
    Matcher alphanum = ALPHANUM.matcher("");

    for (String subtag : SEP.split(source)) {
      if (!alphanum.reset(subtag).matches()) {
        throw new IllegalArgumentException(
            "Invalid subtag contents, must be [0-9 A-Z a-z]{2,8}: " + subtag);
      }
      subtag = subtag.toLowerCase(Locale.ENGLISH); // normalize
      if (subtag.length() == 2) { // key
        if (list != null) { // check size of previous list
          checkSubtypeCount(key, list);
        }
        key = subtag;
        if (validating) {
          validSubtypes = validKeyTypes.getAll(key);
          if (validSubtypes == null) {
            throw new IllegalArgumentException("Invalid key: " + key);
          }
        }
        list = keyTypes.get(key);
        if (list != null) {
          throw new IllegalArgumentException("Multiple keys with same value: " + subtag);
        }
        list = new ArrayList<String>();
        keyTypes.put(key, list);
      } else { // add subtype
        if (key == null) {
          if (validating) {
            throw new IllegalArgumentException("No attributes currently valid: " + subtag);
          }
          attributes.add(subtag);
          // Move on to the next subtag. Leaving the loop here would discard every
          // remaining attribute and all of the key/type pairs.
          continue;
        }
        if (validating) {
          if (key.equals("vt")) {
            if (!CODEPOINTS.matcher(subtag).matches()) {
              throw new IllegalArgumentException("Illegal subtypes: " + key + "-" + subtag);
            }
          } else if (!validSubtypes.contains(subtag)) {
            throw new IllegalArgumentException("Illegal subtypes: " + key + "-" + subtag);
          }
        }
        list.add(subtag);
      }
    }
    // The in-loop check only runs when the next key arrives, so the final key
    // has to be checked here.
    if (list != null) {
      checkSubtypeCount(key, list);
    }
    // protect
    attributes = Collections.unmodifiableSet(attributes);
    // Replacing the value of an existing key is not a structural change, so it is
    // safe while iterating over the key set.
    for (String key2 : keyTypes.keySet()) {
      list = keyTypes.get(key2);
      keyTypes.put(key2, Collections.unmodifiableList(list));
    }
    keyTypes = Collections.unmodifiableSortedMap(keyTypes);
    return this;
  }

  /** Rejects a key that has no types, or more than one type unless the key is "vt". */
  private static void checkSubtypeCount(String key, List<String> types) {
    if (types.size() == 0 || !key.equals("vt") && types.size() > 1) {
      throw new IllegalArgumentException(
          "Illegal number of subtypes for: " + key + "\t" + types);
    }
  }

  /** Returns a debugging representation showing the attributes and the key-to-types map. */
  @Override
  public String toString() {
    return "{attributes=" + attributes + ", keyTypes=" + keyTypes + "}";
  }
}