/**
 * Tries to derive tags for a word by applying a fixed sequence of
 * morphological rules and looking the derived base form up in the dictionary.
 *
 * <p>Rules, in order of precedence:
 * <ol>
 *   <li>adverbs in <i>-ment</i> whose stem is tagged as matching {@code ADJ_PART_FS};</li>
 *   <li>words matching {@code PREFIXES_FOR_VERBS} whose second group is tagged as matching
 *       {@code VERB};</li>
 *   <li>words starting with <i>ex</i> whose remainder is tagged as matching {@code NOUN};</li>
 *   <li>words written with the deprecated "ela geminada" characters U+013F / U+0140.</li>
 * </ol>
 * The first rule that yields at least one token wins. A prefix rule that matches the word
 * but finds nothing in the dictionary no longer blocks the later rules.
 *
 * @param word the surface form to analyse
 * @return the derived tokens; an empty list when a prefix rule applied to the word but no
 *         rule produced a reading (or the "ela geminada" lookup found nothing); {@code null}
 *         when no rule applied at all
 * @throws RuntimeException if the dictionary cannot be loaded
 */
@Override
public List<AnalyzedToken> additionalTags(String word) {
  final IStemmer dictLookup;
  try {
    dictLookup = new DictionaryLookup(getDictionary());
  } catch (IOException e) {
    throw new RuntimeException("Could not load Catalan dictionary from " + getFileName(), e);
  }
  final List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();
  // Remembers that a prefix rule matched the word's shape, so that the method keeps
  // returning an empty list (rather than null) for such words when nothing is found.
  boolean prefixRuleTried = false;

  // Any well-formed adverb with suffix -ment is tagged as an adverb (RG):
  // feminine singular adjective or feminine singular participle + -ment.
  if (word.endsWith("ment")) {
    final String lowerWord = word.toLowerCase(conversionLocale);
    final String possibleAdj = lowerWord.replaceAll("^(.+)ment$", "$1");
    for (AnalyzedToken taggerToken : asAnalyzedTokenList(possibleAdj, dictLookup.lookup(possibleAdj))) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null && ADJ_PART_FS.matcher(posTag).matches()) {
        // One adverb reading is enough; the lower-cased word itself is the lemma.
        additionalTaggedTokens.add(new AnalyzedToken(word, "RG", lowerWord));
        return additionalTaggedTokens;
      }
    }
  }

  // Any well-formed verb with prefixes is tagged as a verb copying the original tags.
  final Matcher matcher = PREFIXES_FOR_VERBS.matcher(word);
  if (matcher.matches()) {
    prefixRuleTried = true;
    // Use the tagger's conversion locale, as every other rule does, so the lookup key
    // does not depend on the JVM default locale.
    final String possibleVerb = matcher.group(2).toLowerCase(conversionLocale);
    for (AnalyzedToken taggerToken : asAnalyzedTokenList(possibleVerb, dictLookup.lookup(possibleVerb))) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null && VERB.matcher(posTag).matches()) {
        final String lemma = matcher.group(1).toLowerCase(conversionLocale).concat(taggerToken.getLemma());
        additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
      }
    }
    if (!additionalTaggedTokens.isEmpty()) {
      return additionalTaggedTokens;
    }
  }

  // Any well-formed noun with prefix ex- is tagged as a noun copying the original tags.
  if (word.startsWith("ex")) {
    prefixRuleTried = true;
    final String lowerWord = word.toLowerCase(conversionLocale);
    final String possibleNoun = lowerWord.replaceAll("^ex(.+)$", "$1");
    for (AnalyzedToken taggerToken : asAnalyzedTokenList(possibleNoun, dictLookup.lookup(possibleNoun))) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null && NOUN.matcher(posTag).matches()) {
        final String lemma = "ex".concat(taggerToken.getLemma());
        additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
      }
    }
    if (!additionalTaggedTokens.isEmpty()) {
      return additionalTaggedTokens;
    }
  }

  // Interpret deprecated characters of "ela geminada":
  //   U+013F LATIN CAPITAL LETTER L WITH MIDDLE DOT
  //   U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT
  if (word.contains("\u0140") || word.contains("\u013f")) {
    final String lowerWord = word.toLowerCase(conversionLocale);
    // Only U+0140 is replaced here, on the already lower-cased word.
    final String possibleWord = lowerWord.replace("\u0140", "l·");
    return asAnalyzedTokenList(word, dictLookup.lookup(possibleWord));
  }

  return prefixRuleTried ? additionalTaggedTokens : null;
}