예제 #1
0
 /**
  * Determines the extended type of a code point.
  *
  * <p>Non-characters yield {@code NON_CHARACTER_}. Code points whose general category is
  * {@code UCharacterCategory.SURROGATE} are further split into {@code LEAD_SURROGATE_} and
  * {@code TRAIL_SURROGATE_}. Every other code point yields the value reported by
  * {@code UCharacter.getType}.
  *
  * @param ch code point to classify
  * @return the extended type associated with {@code ch}
  */
 private static int getType(int ch) {
   // Non-characters get their own pseudo-category instead of a regular general category.
   if (UCharacterUtility.isNonCharacter(ch)) {
     return NON_CHARACTER_;
   }
   final int category = UCharacter.getType(ch);
   if (category != UCharacterCategory.SURROGATE) {
     return category;
   }
   // Surrogates up to and including LEAD_SURROGATE_MAX_VALUE are leads; the rest are trails.
   return ch <= UTF16.LEAD_SURROGATE_MAX_VALUE ? LEAD_SURROGATE_ : TRAIL_SURROGATE_;
 }
예제 #2
0
    /**
     * Classifies the casing of a string from its first meaningful code point.
     *
     * <p>The string is scanned code point by code point. The first lowercase letter yields
     * {@code lowercase}; the first uppercase or titlecase letter yields {@code titlecase}; the
     * first other letter, number, or symbol yields {@code other}. Code points of any other
     * general category (whitespace, punctuation, etc.) are skipped.
     *
     * @param s the string to classify; may be {@code null}
     * @return the casing type, or {@code other} if {@code s} is null, empty, or has no
     *     meaningful code point
     */
    public static CasingType from(String s) {
      if (s == null || s.isEmpty()) {
        return other;
      }
      int index = 0;
      while (index < s.length()) {
        final int codePoint = s.codePointAt(index);
        index += Character.charCount(codePoint);
        // Placeholders are deliberately not skipped: an earlier version skipped them, but
        // treating them like ordinary text (ending up as 'other') worked better.
        switch (UCharacter.getType(codePoint)) {
          case UCharacter.LOWERCASE_LETTER:
            return lowercase;

          case UCharacter.UPPERCASE_LETTER:
          case UCharacter.TITLECASE_LETTER:
            return titlecase;

          // Remaining letters, numbers, and symbols all classify as 'other'.
          case UCharacter.OTHER_LETTER:
          case UCharacter.DECIMAL_DIGIT_NUMBER:
          case UCharacter.LETTER_NUMBER:
          case UCharacter.OTHER_NUMBER:
          case UCharacter.MATH_SYMBOL:
          case UCharacter.CURRENCY_SYMBOL:
          case UCharacter.MODIFIER_SYMBOL:
          case UCharacter.OTHER_SYMBOL:
            return other;

          default:
            // Not meaningful for casing (whitespace, punctuation, etc.); keep scanning.
            break;
        }
      }
      return other;
    }
예제 #3
0
 /**
  * Set an identifier to analyze. Afterwards, call methods like getScripts()
  *
  * <p>The identifier is walked one code point at a time. For each code point this records:
  *
  * <ul>
  *   <li>for decimal digits, the zero digit of the digit's numbering system (in {@code
  *       numerics});
  *   <li>its script extensions, excluding Common and Inherited: a single script goes into
  *       {@code requiredScripts}; a set of several alternatives goes into {@code scriptSetSet}
  *       unless one of them is already required.
  * </ul>
  *
  * <p>A final pass prunes alternate sets that are already satisfied by a required script or
  * that are made redundant by another alternate set, and computes {@code
  * commonAmongAlternates}.
  *
  * @param identifier the identifier to analyze; must not be {@code null}
  * @return self
  * @internal
  * @deprecated This API is ICU internal only.
  */
 @Deprecated
 public IdentifierInfo setIdentifier(String identifier) {
   this.identifier = identifier;
   clear();
   BitSet scriptsForCP = new BitSet();
   int cp;
   // Advance by the width of the code point just read. (Passing the index to charCount, as an
   // earlier revision did, made supplementary characters get visited a second time at their
   // trail surrogate.)
   for (int i = 0; i < identifier.length(); i += Character.charCount(cp)) {
     cp = Character.codePointAt(identifier, i);
     // Store a representative character for each kind of decimal digit
     if (UCharacter.getType(cp) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
       // Just store the zero character as a representative for comparison. Unicode guarantees it
       // is cp - value
       numerics.add(cp - UCharacter.getNumericValue(cp));
     }
     UScript.getScriptExtensions(cp, scriptsForCP);
     // Common and Inherited never constrain which scripts an identifier uses.
     scriptsForCP.clear(UScript.COMMON);
     scriptsForCP.clear(UScript.INHERITED);
     switch (scriptsForCP.cardinality()) {
       case 0:
         break;
       case 1:
         // Single script, record it.
         requiredScripts.or(scriptsForCP);
         break;
       default:
         // Several alternative scripts: keep the set only if none of them is already required.
         // Once the set has been stored, allocate a fresh BitSet so later iterations do not
         // mutate the stored instance.
         if (!requiredScripts.intersects(scriptsForCP) && scriptSetSet.add(scriptsForCP)) {
           scriptsForCP = new BitSet();
         }
         break;
     }
   }
   // Now make a final pass through to remove alternates that came before singles.
   // [Kana], [Kana Hira] => [Kana]
   // This is relatively infrequent, so doesn't have to be optimized.
   // We also compute any commonalities among the alternates.
   if (scriptSetSet.size() > 0) {
     commonAmongAlternates.set(0, UScript.CODE_LIMIT);
     for (Iterator<BitSet> it = scriptSetSet.iterator(); it.hasNext(); ) {
       final BitSet next = it.next();
       // [Kana], [Kana Hira] => [Kana]
       if (requiredScripts.intersects(next)) {
         it.remove();
       } else {
         // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
         commonAmongAlternates.and(next); // get the intersection.
         for (BitSet other : scriptSetSet) {
           if (next != other && contains(next, other)) {
             // Removal goes through the outer iterator; the inner iteration is abandoned
             // immediately afterwards via break.
             it.remove();
             break;
           }
         }
       }
     }
   }
   if (scriptSetSet.size() == 0) {
     commonAmongAlternates.clear();
   }
   return this;
 }