예제 #1
0
  /**
   * Produces one {@link AnalyzedTokenReadings} per input token.
   *
   * <p>For every token the readings are collected in this order:
   * <ol>
   *   <li>the manual tagger entries for the token as written; only if there are none,
   *       the dictionary entries for the token as written;</li>
   *   <li>if the token is neither already lowercase nor mixed-case, the manual tagger
   *       entries for its lowercase form; only if there are none, the dictionary entries
   *       for its lowercase form;</li>
   *   <li>if nothing was found so far and the token is not mixed-case, whatever
   *       {@code additionalTags} yields;</li>
   *   <li>if there is still nothing, a single reading with {@code null} tag and lemma.</li>
   * </ol>
   * The start position of each reading is the sum of the lengths of the preceding tokens.
   *
   * @param sentenceTokens the tokens of one sentence, in order
   * @return the readings, one element per input token, in the same order
   * @throws IOException declared because the initialization and dictionary access may fail
   */
  @Override
  public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException {
    initializeIfRequired();

    final List<AnalyzedTokenReadings> readingsPerToken = new ArrayList<>();
    final IStemmer stemmer = new DictionaryLookup(getDictionary());
    int offset = 0;

    for (final String token : sentenceTokens) {
      final List<AnalyzedToken> readings = new ArrayList<>();
      final String lowered = token.toLowerCase(conversionLocale);
      final boolean alreadyLowercase = token.equals(lowered);
      final boolean mixedCase = StringTools.isMixedCase(token);
      // Both manual lookups are done up front, regardless of whether the lowercase one is used.
      final List<AnalyzedToken> manualExact =
          manualTagsAsAnalyzedTokenList(token, manualTagger.lookup(token));
      final List<AnalyzedToken> manualLowered =
          manualTagsAsAnalyzedTokenList(token, manualTagger.lookup(lowered));

      // Token as written: manual entries take precedence over the dictionary.
      addTokens(manualExact, readings);
      if (manualExact.isEmpty()) {
        final List<AnalyzedToken> fromDictionary = asAnalyzedTokenList(token, stemmer.lookup(token));
        addTokens(fromDictionary, readings);
      }

      // All-uppercase or start-uppercase tokens (but not mixed-case ones) additionally
      // receive the tags of their lowercase form, again preferring manual entries.
      if (!(alreadyLowercase || mixedCase)) {
        addTokens(manualLowered, readings);
        if (manualLowered.isEmpty()) {
          final List<AnalyzedToken> fromDictionary = asAnalyzedTokenList(token, stemmer.lookup(lowered));
          addTokens(fromDictionary, readings);
        }
      }

      // Last resort before giving up: prefix/suffix based guessing.
      if (!mixedCase && readings.isEmpty()) {
        addTokens(additionalTags(token), readings);
      }

      // Unknown token: keep it with neither tag nor lemma.
      if (readings.isEmpty()) {
        readings.add(new AnalyzedToken(token, null, null));
      }

      readingsPerToken.add(new AnalyzedTokenReadings(readings, offset));
      offset += token.length();
    }

    return readingsPerToken;
  }
예제 #2
0
 /**
  * Guesses readings for a word by applying a series of heuristics, in this order:
  * <ol>
  *   <li><b>-ment adverbs</b>: if the lowercase word ends in "ment" and the part before
  *       it has a dictionary tag matching {@code ADJ_PART_FS}, a single "RG" reading is
  *       returned with the lowercase word as lemma;</li>
  *   <li><b>prefixed verbs</b>: if the word matches {@code PREFIXES_FOR_VERBS} and its
  *       second group has dictionary tags matching {@code VERB}, those tags are copied
  *       and the lemma is the lowercase prefix plus the base lemma;</li>
  *   <li><b>ex- nouns</b>: if the lowercase word starts with "ex" and the remainder has
  *       dictionary tags matching {@code NOUN}, those tags are copied and the lemma is
  *       "ex" plus the base lemma;</li>
  *   <li><b>ela geminada</b>: if the word contains U+013F or U+0140, the lowercase word
  *       with U+0140 replaced by "l·" is looked up and that result is returned as is.</li>
  * </ol>
  * A heuristic that applies but produces no reading does not stop the search; the next
  * heuristic is tried.
  *
  * @param word the token to analyze, as written in the text
  * @return the guessed readings, or {@code null} if no heuristic produced a result
  * @throws RuntimeException if the dictionary cannot be loaded
  */
 @Override
 public List<AnalyzedToken> additionalTags(String word) {
   final IStemmer dictLookup;
   try {
     dictLookup = new DictionaryLookup(getDictionary());
   } catch (IOException e) {
     throw new RuntimeException("Could not load Catalan dictionary from " + getFileName(), e);
   }
   // All dictionary lookups below use lowercase forms, so the suffix/prefix checks are
   // made on the lowercase form too; otherwise capitalized words would skip them.
   final String lowerWord = word.toLowerCase(conversionLocale);
   final List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();

   // Any well-formed adverb with suffix -ment is tagged as an adverb (RG):
   // feminine singular adjective or feminine singular participle + -ment
   if (lowerWord.endsWith("ment")) {
     final String possibleAdj = lowerWord.replaceAll("^(.+)ment$", "$1");
     final List<AnalyzedToken> taggerTokens =
         asAnalyzedTokenList(possibleAdj, dictLookup.lookup(possibleAdj));
     for (AnalyzedToken taggerToken : taggerTokens) {
       final String posTag = taggerToken.getPOSTag();
       if (posTag != null && ADJ_PART_FS.matcher(posTag).matches()) {
         // One adverb reading is enough, whatever the number of matching base tags.
         additionalTaggedTokens.add(new AnalyzedToken(word, "RG", lowerWord));
         return additionalTaggedTokens;
       }
     }
   }

   // Any well-formed verb with prefixes is tagged as a verb copying the original tags
   final Matcher matcher = PREFIXES_FOR_VERBS.matcher(word);
   if (matcher.matches()) {
     final String prefix = matcher.group(1).toLowerCase(conversionLocale);
     final String possibleVerb = matcher.group(2).toLowerCase(conversionLocale);
     final List<AnalyzedToken> taggerTokens =
         asAnalyzedTokenList(possibleVerb, dictLookup.lookup(possibleVerb));
     for (AnalyzedToken taggerToken : taggerTokens) {
       final String posTag = taggerToken.getPOSTag();
       if (posTag != null && VERB.matcher(posTag).matches()) {
         final String lemma = prefix.concat(taggerToken.getLemma());
         additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
       }
     }
     // Only stop here when a verb reading was found; otherwise try the next heuristics.
     if (!additionalTaggedTokens.isEmpty()) {
       return additionalTaggedTokens;
     }
   }

   // Any well-formed noun with prefix ex- is tagged as a noun copying the original tags
   if (lowerWord.startsWith("ex")) {
     final String possibleNoun = lowerWord.replaceAll("^ex(.+)$", "$1");
     final List<AnalyzedToken> taggerTokens =
         asAnalyzedTokenList(possibleNoun, dictLookup.lookup(possibleNoun));
     for (AnalyzedToken taggerToken : taggerTokens) {
       final String posTag = taggerToken.getPOSTag();
       if (posTag != null && NOUN.matcher(posTag).matches()) {
         final String lemma = "ex".concat(taggerToken.getLemma());
         additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
       }
     }
     // Words that merely begin with "ex" (and may contain an ela geminada character)
     // must still reach the last heuristic, so return only when a noun was found.
     if (!additionalTaggedTokens.isEmpty()) {
       return additionalTaggedTokens;
     }
   }

   // Interpret deprecated characters of "ela geminada"
   // U+013F LATIN CAPITAL LETTER L WITH MIDDLE DOT
   // U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT
   if (word.contains("\u0140") || word.contains("\u013f")) {
     // Only U+0140 is replaced because the lookup form is derived from the lowercase word.
     final String possibleWord = lowerWord.replace("\u0140", "l·");
     return asAnalyzedTokenList(word, dictLookup.lookup(possibleWord));
   }

   // No heuristic produced a reading.
   return null;
 }