示例#1
0
文件: GC.java 项目: tibetan-nlp/ttt
 /**
  * Renders this grapheme cluster as a compact, XML-like tag meant for human readers (it is not
  * guaranteed to be well-formed XML).
  */
 public String toString() {
   final StringBuilder tag = new StringBuilder("<GC valid=");
   tag.append(valid);
   tag.append(" pretty=\"");
   // Show the NFTHDL string in its "pretty" form, as produced by UnicodeUtils.
   final String prettyNfthdl = UnicodeUtils.unicodeStringToPrettyString(getNfthdl());
   tag.append(prettyNfthdl);
   tag.append("\"/>");
   return tag.toString();
 }
示例#2
0
文件: GC.java 项目: tibetan-nlp/ttt
 /**
  * Sets the NFTHDL-decomposed Unicode representing this grapheme cluster and recomputes {@code
  * valid} from it.
  *
  * <p>The cluster is valid only when {@code nfthdl} is non-empty and matches {@code
  * validGcRegex} in its entirety. When {@code debug} is on, the new value is echoed to standard
  * output first.
  *
  * @param nfthdl the NFTHDL string for this cluster; must not be null, expected to be non-empty
  */
 private void setNfthdl(String nfthdl) {
   if (debug) {
     System.out.println("debug: GC is " + UnicodeUtils.unicodeStringToPrettyString(nfthdl));
   }
   this.nfthdl = nfthdl;
   ThdlDebug.verify(nfthdl.length() > 0); // TODO(dchandler): assert only
   // An empty string is never a valid grapheme cluster, regardless of what the regex accepts.
   // (Previously the "empty => invalid" result was immediately overwritten by the regex match.)
   valid = nfthdl.length() > 0 && validGcRegex.matcher(nfthdl).matches();
 }
示例#3
0
文件: GC.java 项目: tibetan-nlp/ttt
 /**
  * Returns EWTS that is valid but not beautiful. It's better suited for consumption by computer
  * programs than by humans, though it'll do in a pinch. (Humans like to see [rnams] instead of
  * [r+namasa].)
  *
  * <p>The NFTHDL string is walked one code unit at a time; each one is mapped to its THDL Wylie
  * via {@code UnicodeCodepointToThdlWylie}, with a one-character lookahead that merges a few
  * two-codepoint sequences into a single EWTS token.
  *
  * @return null if this grapheme cluster has no valid EWTS representation or valid-but-ugly EWTS
  *     otherwise
  */
 public StringBuffer getEwtsForComputers() {
   if (!valid) {
     return null;
   }
   StringBuffer sb = new StringBuffer();
   // We use ch after the loop.  Initialization is not really
   // needed; it's just to avoid compiler errors.
   char ch = 'X';
   boolean seenVowel = false;
   String lastEwts = "";
   boolean added_aVOWEL = false;
   for (int i = 0; i < nfthdl.length(); i++) {
     ch = nfthdl.charAt(i);
     String ewts = UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(ch);
     if (i + 1 < nfthdl.length()) { // lookahead
       // Each branch below consumes the following character too (++i) and emits one
       // combined EWTS token for the pair.
       //
       // Even computers want to see kI because the spec
       // isn't (or at least hasn't always been) crystal
       // clear that kA+i is equivalent to kI.
       if (('\u0f55' == ch || '\u0fa5' == ch) && '\u0f39' == nfthdl.charAt(i + 1)) {
         // U+0F55/U+0FA5 (PHA, full or subjoined) followed by U+0F39.
         ++i;
         ewts = "f"; // TODO(dchandler): hard-coded EWTS
       } else if (('\u0f56' == ch || '\u0fa6' == ch) && '\u0f39' == nfthdl.charAt(i + 1)) {
         // U+0F56/U+0FA6 (BA, full or subjoined) followed by U+0F39.
         ++i;
         ewts = "v"; // TODO(dchandler): hard-coded EWTS
       } else if ('\u0f71' == ch && '\u0f72' == nfthdl.charAt(i + 1)) {
         ++i;
         ewts = THDLWylieConstants.I_VOWEL;
         // NOTE: we could normalize to 0f73 and 0f75 when
         // possible in NFTHDL.  That's closer to EWTS and
         // would avoid these two special cases.
       } else if ('\u0f71' == ch && '\u0f74' == nfthdl.charAt(i + 1)) {
         ++i;
         ewts = THDLWylieConstants.U_VOWEL;
       }
     }
     // A Tibetan-range character without any EWTS mapping makes the whole cluster
     // unrepresentable.
     if (null == ewts && UnicodeUtils.isInTibetanRange(ch)) {
       return null;
     }
     // Subjoined consonants, and any vowel after the first, are joined with the stacking key.
     if (UnicodeUtils.isSubjoinedConsonant(ch) || (seenVowel && isVowel(ch))) {
       sb.append(THDLWylieConstants.WYLIE_SANSKRIT_STACKING_KEY);
     }
     if (isWowelRequiringPrecedingVowel(ch) && !seenVowel) {
       // Insert the implicit vowel at most once per cluster.
       if (!added_aVOWEL) {
         added_aVOWEL = true;
         sb.append(THDLWylieConstants.WYLIE_aVOWEL); // paM, no pM
       }
     }
     if (isVowel(ch)) {
       seenVowel = true;
       // Compare by content, not by reference: the mapping helper is not guaranteed to
       // return the interned literal "a". When the previous token was "a", drop the last
       // emitted character so this vowel replaces it rather than following it.
       if ("a".equals(lastEwts)) {
         sb.deleteCharAt(sb.length() - 1);
       }
     }
     sb.append(ewts);
     lastEwts = ewts;
   }
   // If the cluster ends on a consonant (or on U+0F39) other than U+0F68, spell out the
   // trailing vowel explicitly.
   if ((UnicodeUtils.isNonSubjoinedConsonant(ch)
           || UnicodeUtils.isSubjoinedConsonant(ch)
           || '\u0f39' == ch)
       && '\u0f68' != ch) {
     ThdlDebug.verify(!added_aVOWEL);
     sb.append(THDLWylieConstants.WYLIE_aVOWEL);
   }
   return sb;
 }