コード例 #1
0
 /**
  * Guesses readings for a word by applying a fixed sequence of heuristics,
  * each backed by a lookup in the dictionary returned by {@code getDictionary()}.
  *
  * <p>The heuristics are tried in this order, and the first one whose guard
  * applies decides the result (except the "-ment" rule, which falls through
  * when it finds no matching reading):
  * <ol>
  *   <li>Word ends with "ment": the lowercased word minus that suffix is looked
  *       up; if any reading has a POS tag matching {@code ADJ_PART_FS}, a single
  *       token tagged "RG" with the lowercased word as lemma is returned.</li>
  *   <li>Word matches {@code PREFIXES_FOR_VERBS}: group 2 (lowercased) is looked
  *       up; every reading whose POS tag matches {@code VERB} is copied, with
  *       the lowercased group 1 prepended to its lemma.</li>
  *   <li>Word starts with "ex": the lowercased remainder is looked up; every
  *       reading whose POS tag matches {@code NOUN} is copied, with "ex"
  *       prepended to its lemma.</li>
  *   <li>Word contains U+0140 or U+013F: the lowercased word with U+0140
  *       replaced by "l·" is looked up and its readings are returned for the
  *       original word.</li>
  * </ol>
  *
  * <p>All lowercasing uses {@code conversionLocale} so that lookup keys and
  * lemmas do not depend on the JVM's default locale.
  *
  * @param word the token text to analyze
  * @return the guessed readings (possibly an empty list when rule 2 or 3
  *         applied but found nothing), or {@code null} when no rule applied
  * @throws RuntimeException if the dictionary cannot be loaded
  */
 @Override
 public List<AnalyzedToken> additionalTags(String word) {
   final IStemmer dictLookup;
   try {
     dictLookup = new DictionaryLookup(getDictionary());
   } catch (IOException e) {
     throw new RuntimeException("Could not load Catalan dictionary from " + getFileName(), e);
   }
   final List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();

   // Any well-formed adverb with suffix -ment is tagged as an adverb (RG):
   // feminine singular adjective or feminine singular participle + -ment.
   if (word.endsWith("ment")) {
     final String lowerWord = word.toLowerCase(conversionLocale);
     final String possibleAdj = lowerWord.replaceAll("^(.+)ment$", "$1");
     final List<AnalyzedToken> taggerTokens =
         asAnalyzedTokenList(possibleAdj, dictLookup.lookup(possibleAdj));
     for (AnalyzedToken taggerToken : taggerTokens) {
       final String posTag = taggerToken.getPOSTag();
       if (posTag != null && ADJ_PART_FS.matcher(posTag).matches()) {
         // One matching reading is enough; the adverb gets a single RG token.
         additionalTaggedTokens.add(new AnalyzedToken(word, "RG", lowerWord));
         return additionalTaggedTokens;
       }
     }
     // No suitable stem found: fall through to the remaining heuristics.
   }

   // Any well-formed verb with prefixes is tagged as a verb copying the original tags.
   final Matcher matcher = PREFIXES_FOR_VERBS.matcher(word);
   if (matcher.matches()) {
     // Use conversionLocale (not the JVM default) so the lookup key and lemma
     // are lowercased the same way as in the other branches.
     final String prefix = matcher.group(1).toLowerCase(conversionLocale);
     final String possibleVerb = matcher.group(2).toLowerCase(conversionLocale);
     final List<AnalyzedToken> taggerTokens =
         asAnalyzedTokenList(possibleVerb, dictLookup.lookup(possibleVerb));
     for (AnalyzedToken taggerToken : taggerTokens) {
       final String posTag = taggerToken.getPOSTag();
       if (posTag != null && VERB.matcher(posTag).matches()) {
         final String lemma = prefix.concat(taggerToken.getLemma());
         additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
       }
     }
     // NOTE(review): returns here even when the list is empty, so the rules
     // below are never tried for such words - confirm this is intended.
     return additionalTaggedTokens;
   }

   // Any well-formed noun with prefix ex- is tagged as a noun copying the original tags.
   if (word.startsWith("ex")) {
     final String lowerWord = word.toLowerCase(conversionLocale);
     final String possibleNoun = lowerWord.replaceAll("^ex(.+)$", "$1");
     final List<AnalyzedToken> taggerTokens =
         asAnalyzedTokenList(possibleNoun, dictLookup.lookup(possibleNoun));
     for (AnalyzedToken taggerToken : taggerTokens) {
       final String posTag = taggerToken.getPOSTag();
       if (posTag != null && NOUN.matcher(posTag).matches()) {
         final String lemma = "ex".concat(taggerToken.getLemma());
         additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
       }
     }
     // NOTE(review): returns here even when the list is empty, so the
     // ela-geminada rule below is never reached for words starting with "ex"
     // - confirm this is intended.
     return additionalTaggedTokens;
   }

   // Interpret deprecated characters of "ela geminada":
   // U+013F LATIN CAPITAL LETTER L WITH MIDDLE DOT
   // U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT
   if (word.contains("\u0140") || word.contains("\u013f")) {
     final String lowerWord = word.toLowerCase(conversionLocale);
     // Only U+0140 is replaced, on the already-lowercased word. Literal
     // replacement; no regex is needed.
     final String possibleWord = lowerWord.replace("\u0140", "l·");
     return asAnalyzedTokenList(word, dictLookup.lookup(possibleWord));
   }

   // No heuristic applied.
   return null;
 }