Java IStemmer Examples

Programming Language: Java

Namespace/Package Name: morfologik.stemming

Class/Type: IStemmer

Examples at hotexamples.com: 2

Java IStemmer - 2 examples found. These are the top-rated real-world Java examples of morfologik.stemming.IStemmer extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

lookup(2)

Example #1

Show file

File: CatalanTagger.java Project: julian-d/languagetool

  @Override
  public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException {
    initializeIfRequired();

    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    final IStemmer dictLookup = new DictionaryLookup(getDictionary());

    for (String word : sentenceTokens) {
      final List<AnalyzedToken> l = new ArrayList<>();
      final String lowerWord = word.toLowerCase(conversionLocale);
      final boolean isLowercase = word.equals(lowerWord);
      final boolean isMixedCase = StringTools.isMixedCase(word);
      List<AnalyzedToken> manualTaggerTokens =
          manualTagsAsAnalyzedTokenList(word, manualTagger.lookup(word));
      List<AnalyzedToken> manualLowerTaggerTokens =
          manualTagsAsAnalyzedTokenList(word, manualTagger.lookup(lowerWord));

      // normal case, manual tagger
      addTokens(manualTaggerTokens, l);
      // normal case, tagger dictionary
      if (manualTaggerTokens.isEmpty()) {
        addTokens(asAnalyzedTokenList(word, dictLookup.lookup(word)), l);
      }
      // tag non-lowercase words (alluppercase or startuppercase but not mixedcase)
      // with lowercase word tags
      if (!isLowercase && !isMixedCase) {
        // manual tagger
        addTokens(manualLowerTaggerTokens, l);
        // tagger dictionary
        if (manualLowerTaggerTokens.isEmpty()) {
          addTokens(asAnalyzedTokenList(word, dictLookup.lookup(lowerWord)), l);
        }
      }
      // additional tagging with prefixes
      if (l.isEmpty() && !isMixedCase) {
        addTokens(additionalTags(word), l);
      }

      if (l.isEmpty()) {
        l.add(new AnalyzedToken(word, null, null));
      }

      tokenReadings.add(new AnalyzedTokenReadings(l, pos));
      pos += word.length();
    }

    return tokenReadings;
  }

Example #2

Show file

File: CatalanTagger.java Project: julian-d/languagetool

 @Override
 public List<AnalyzedToken> additionalTags(String word) {
   final IStemmer dictLookup;
   try {
     dictLookup = new DictionaryLookup(getDictionary());
   } catch (IOException e) {
     throw new RuntimeException("Could not load Catalan dictionary from " + getFileName(), e);
   }
   List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();
   // Any well-formed adverb with suffix -ment is tagged as an adverb (RG)
   // Adjectiu femení singular o participi femení singular + -ment
   if (word.endsWith("ment")) {
     final String lowerWord = word.toLowerCase(conversionLocale);
     final String possibleAdj = lowerWord.replaceAll("^(.+)ment$", "$1");
     List<AnalyzedToken> taggerTokens;
     taggerTokens = asAnalyzedTokenList(possibleAdj, dictLookup.lookup(possibleAdj));
     for (AnalyzedToken taggerToken : taggerTokens) {
       final String posTag = taggerToken.getPOSTag();
       if (posTag != null) {
         final Matcher m = ADJ_PART_FS.matcher(posTag);
         if (m.matches()) {
           additionalTaggedTokens.add(new AnalyzedToken(word, "RG", lowerWord));
           return additionalTaggedTokens;
         }
       }
     }
   }
   // Any well-formed verb with prefixes is tagged as a verb copying the original tags
   Matcher matcher = PREFIXES_FOR_VERBS.matcher(word);
   if (matcher.matches()) {
     final String possibleVerb = matcher.group(2).toLowerCase();
     List<AnalyzedToken> taggerTokens;
     taggerTokens = asAnalyzedTokenList(possibleVerb, dictLookup.lookup(possibleVerb));
     for (AnalyzedToken taggerToken : taggerTokens) {
       final String posTag = taggerToken.getPOSTag();
       if (posTag != null) {
         final Matcher m = VERB.matcher(posTag);
         if (m.matches()) {
           String lemma = matcher.group(1).toLowerCase().concat(taggerToken.getLemma());
           additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
         }
       }
     }
     return additionalTaggedTokens;
   }
   // Any well-formed noun with prefix ex- is tagged as a noun copying the original tags
   if (word.startsWith("ex")) {
     final String lowerWord = word.toLowerCase(conversionLocale);
     final String possibleNoun = lowerWord.replaceAll("^ex(.+)$", "$1");
     List<AnalyzedToken> taggerTokens;
     taggerTokens = asAnalyzedTokenList(possibleNoun, dictLookup.lookup(possibleNoun));
     for (AnalyzedToken taggerToken : taggerTokens) {
       final String posTag = taggerToken.getPOSTag();
       if (posTag != null) {
         final Matcher m = NOUN.matcher(posTag);
         if (m.matches()) {
           String lemma = "ex".concat(taggerToken.getLemma());
           additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
         }
       }
     }
     return additionalTaggedTokens;
   }
   // Interpret deprecated characters of "ela geminada"
   // U+013F LATIN CAPITAL LETTER L WITH MIDDLE DOT
   // U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT
   if (word.contains("\u0140") || word.contains("\u013f")) {
     final String lowerWord = word.toLowerCase(conversionLocale);
     final String possibleWord = lowerWord.replaceAll("\u0140", "l·");
     List<AnalyzedToken> taggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(possibleWord));
     return taggerTokens;
   }
   return null;
 }