Esempio n. 1
0
  /**
   * Synthesizes the forms of a verb that agree with the expected person and number.
   *
   * <p>The first reading of {@code verb} whose POS tag starts with {@code "VER:"} is used as the
   * basis for synthesis; if no reading qualifies, an empty placeholder token is passed to the
   * synthesizer instead. Readings without a POS tag are skipped.
   *
   * @param verb the token whose readings are searched for a verb reading
   * @param expectedVerbPOS the person/number part of the POS tag (person:number) the forms must
   *     match
   * @param toUppercase true when the suggestions should be capitalized
   * @return the matching forms, sorted and without duplicates
   * @throws RuntimeException wrapping the {@link IOException} thrown while synthesizing
   */
  private List<String> getVerbSuggestions(
      final AnalyzedTokenReadings verb, final String expectedVerbPOS, final boolean toUppercase) {
    // find the first verb reading
    AnalyzedToken verbToken = new AnalyzedToken("", "", "");
    for (AnalyzedToken token : verb.getReadings()) {
      final String posTag = token.getPOSTag();
      // a reading may carry no POS tag at all; it cannot be a verb reading
      if (posTag != null && posTag.startsWith("VER:")) {
        verbToken = token;
        break;
      }
    }

    try {
      final String[] synthesized =
          german.getSynthesizer().synthesize(verbToken, "VER.*:" + expectedVerbPOS + ".*", true);
      // Capitalize BEFORE collecting into the set: forms that differ only in the case of their
      // first letter become equal after capitalization and must be merged into one suggestion.
      final Set<String> suggestionSet = new HashSet<>();
      for (String form : synthesized) {
        suggestionSet.add(toUppercase ? StringTools.uppercaseFirstChar(form) : form);
      }
      final List<String> suggestions = new ArrayList<>(suggestionSet);
      Collections.sort(suggestions);
      return suggestions;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
 /**
  * Tells whether at least one reading of the given token carries a POS tag that
  * {@code isGoodPosTag} accepts.
  *
  * @param tokenReadings the token whose readings are inspected
  * @return true if any reading has an accepted POS tag, false otherwise
  */
 @Override
 protected boolean isTagged(AnalyzedTokenReadings tokenReadings) {
   boolean tagged = false;
   for (AnalyzedToken reading : tokenReadings.getReadings()) {
     if (isGoodPosTag(reading.getPOSTag())) {
       // one accepted tag is enough; no need to look at the remaining readings
       tagged = true;
       break;
     }
   }
   return tagged;
 }
  /**
   * Checks whether a token is a verb form that can only be read as the given person and number.
   *
   * <p>Returns {@code false} right away for an empty token, for a token that starts with an
   * uppercase letter unless its start position is 0, and for a token for which
   * {@code hasPartialPosTag("VER")} is false. Otherwise every reading must contain
   * {@code ":" + person + ":" + number} in its POS tag; readings without a POS tag and readings
   * whose tag contains {@code "_END"} are ignored.
   *
   * @param tokenReadings the token (with all its readings) to check
   * @param person the person part of the POS tag to match
   * @param number the number part of the POS tag to match
   * @return true if the token matches {@code person} and {@code number} and no other
   *     person/number
   */
  private boolean hasUnambiguouslyPersonAndNumber(
      final AnalyzedTokenReadings tokenReadings, final String person, final String number) {
    final String surface = tokenReadings.getToken();
    if (surface.isEmpty()) {
      return false;
    }
    if (Character.isUpperCase(surface.charAt(0)) && tokenReadings.getStartPos() != 0) {
      return false;
    }
    if (!tokenReadings.hasPartialPosTag("VER")) {
      return false;
    }

    final String expected = ":" + person + ":" + number;
    for (AnalyzedToken analyzedToken : tokenReadings) {
      final String postag = analyzedToken.getPOSTag();
      // ignore untagged readings (null tag) as well as SENT_END and PARA_END
      if (postag == null || postag.contains("_END")) {
        continue;
      }
      if (!postag.contains(expected)) {
        return false;
      }
    } // for each reading

    return true;
  }
Esempio n. 4
0
 /**
  * Derives additional readings for a word by looking up a related word in the dictionary.
  *
  * <p>The rules are tried in this order:
  * <ol>
  *   <li>The word ends with "ment": the lower-cased word without that suffix is looked up; if
  *       one of its POS tags matches {@code ADJ_PART_FS}, a single "RG" reading is returned.
  *       If none matches, the remaining rules are tried.
  *   <li>The word matches {@code PREFIXES_FOR_VERBS}: group 2 is looked up and every reading
  *       whose POS tag matches {@code VERB} is copied with group 1 prepended to the lemma. The
  *       (possibly empty) list is returned.
  *   <li>The word starts with "ex": the rest of the lower-cased word is looked up and every
  *       reading whose POS tag matches {@code NOUN} is copied with "ex" prepended to the lemma.
  *       The (possibly empty) list is returned.
  *   <li>The word contains U+0140 or U+013F: the lower-cased word with U+0140 replaced by "l·"
  *       is looked up and the resulting readings are returned for the original word.
  * </ol>
  *
  * <p>All case conversion uses {@code conversionLocale}, so the result does not depend on the
  * JVM default locale.
  *
  * @param word the word to find additional readings for
  * @return the derived readings, or {@code null} when none of the rules applies
  * @throws RuntimeException if creating the dictionary lookup throws an {@link IOException}
  */
 @Override
 public List<AnalyzedToken> additionalTags(String word) {
   final IStemmer dictLookup;
   try {
     dictLookup = new DictionaryLookup(getDictionary());
   } catch (IOException e) {
     throw new RuntimeException("Could not load Catalan dictionary from " + getFileName(), e);
   }
   final List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();
   // Any well-formed adverb with suffix -ment is tagged as an adverb (RG)
   // Feminine singular adjective or feminine singular participle + -ment
   if (word.endsWith("ment")) {
     final String lowerWord = word.toLowerCase(conversionLocale);
     final String possibleAdj = lowerWord.replaceAll("^(.+)ment$", "$1");
     final List<AnalyzedToken> taggerTokens =
         asAnalyzedTokenList(possibleAdj, dictLookup.lookup(possibleAdj));
     for (AnalyzedToken taggerToken : taggerTokens) {
       final String posTag = taggerToken.getPOSTag();
       if (posTag != null && ADJ_PART_FS.matcher(posTag).matches()) {
         // one matching base form is enough to accept the adverb
         additionalTaggedTokens.add(new AnalyzedToken(word, "RG", lowerWord));
         return additionalTaggedTokens;
       }
     }
   }
   // Any well-formed verb with prefixes is tagged as a verb copying the original tags
   final Matcher matcher = PREFIXES_FOR_VERBS.matcher(word);
   if (matcher.matches()) {
     // use conversionLocale like every other branch instead of the JVM default locale
     final String prefix = matcher.group(1).toLowerCase(conversionLocale);
     final String possibleVerb = matcher.group(2).toLowerCase(conversionLocale);
     final List<AnalyzedToken> taggerTokens =
         asAnalyzedTokenList(possibleVerb, dictLookup.lookup(possibleVerb));
     for (AnalyzedToken taggerToken : taggerTokens) {
       final String posTag = taggerToken.getPOSTag();
       if (posTag != null && VERB.matcher(posTag).matches()) {
         final String lemma = prefix.concat(taggerToken.getLemma());
         additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
       }
     }
     return additionalTaggedTokens;
   }
   // Any well-formed noun with prefix ex- is tagged as a noun copying the original tags
   if (word.startsWith("ex")) {
     final String lowerWord = word.toLowerCase(conversionLocale);
     final String possibleNoun = lowerWord.replaceAll("^ex(.+)$", "$1");
     final List<AnalyzedToken> taggerTokens =
         asAnalyzedTokenList(possibleNoun, dictLookup.lookup(possibleNoun));
     for (AnalyzedToken taggerToken : taggerTokens) {
       final String posTag = taggerToken.getPOSTag();
       if (posTag != null && NOUN.matcher(posTag).matches()) {
         final String lemma = "ex".concat(taggerToken.getLemma());
         additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
       }
     }
     return additionalTaggedTokens;
   }
   // Interpret deprecated characters of "ela geminada"
   // U+013F LATIN CAPITAL LETTER L WITH MIDDLE DOT
   // U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT
   if (word.contains("\u0140") || word.contains("\u013f")) {
     // lower-casing turns U+013F into U+0140, so a single replacement covers both characters
     final String lowerWord = word.toLowerCase(conversionLocale);
     final String possibleWord = lowerWord.replace("\u0140", "l·");
     return asAnalyzedTokenList(word, dictLookup.lookup(possibleWord));
   }
   return null;
 }