@Override public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException { initializeIfRequired(); final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>(); int pos = 0; final IStemmer dictLookup = new DictionaryLookup(getDictionary()); for (String word : sentenceTokens) { final List<AnalyzedToken> l = new ArrayList<>(); final String lowerWord = word.toLowerCase(conversionLocale); final boolean isLowercase = word.equals(lowerWord); final boolean isMixedCase = StringTools.isMixedCase(word); List<AnalyzedToken> manualTaggerTokens = manualTagsAsAnalyzedTokenList(word, manualTagger.lookup(word)); List<AnalyzedToken> manualLowerTaggerTokens = manualTagsAsAnalyzedTokenList(word, manualTagger.lookup(lowerWord)); // normal case, manual tagger addTokens(manualTaggerTokens, l); // normal case, tagger dictionary if (manualTaggerTokens.isEmpty()) { addTokens(asAnalyzedTokenList(word, dictLookup.lookup(word)), l); } // tag non-lowercase words (alluppercase or startuppercase but not mixedcase) // with lowercase word tags if (!isLowercase && !isMixedCase) { // manual tagger addTokens(manualLowerTaggerTokens, l); // tagger dictionary if (manualLowerTaggerTokens.isEmpty()) { addTokens(asAnalyzedTokenList(word, dictLookup.lookup(lowerWord)), l); } } // additional tagging with prefixes if (l.isEmpty() && !isMixedCase) { addTokens(additionalTags(word), l); } if (l.isEmpty()) { l.add(new AnalyzedToken(word, null, null)); } tokenReadings.add(new AnalyzedTokenReadings(l, pos)); pos += word.length(); } return tokenReadings; }
/**
 * Derives readings for a word from its suffix or prefix by looking up the remaining
 * stem in the dictionary. {@code tag} in this class calls it as a fallback when no
 * other reading was found.
 *
 * <p>The rules are tried in order and the first rule whose guard applies decides the
 * result:
 * <ol>
 *   <li>word ends with {@code "ment"} and the stem has a tag matching
 *       {@code ADJ_PART_FS}: a single adverb reading ({@code "RG"}) is returned;
 *       otherwise the next rules are tried;</li>
 *   <li>word matches {@code PREFIXES_FOR_VERBS}: every {@code VERB} reading of the
 *       unprefixed form is copied with the prefix prepended to the lemma (the list may
 *       be empty);</li>
 *   <li>word starts with {@code "ex"}: every {@code NOUN} reading of the remainder is
 *       copied with {@code "ex"} prepended to the lemma (the list may be empty);</li>
 *   <li>word contains U+013F or U+0140: the readings of the word with that character
 *       replaced by {@code "l·"} are returned.</li>
 * </ol>
 *
 * @param word the token to analyze
 * @return the derived readings (possibly empty), or {@code null} if no rule applied
 * @throws RuntimeException if the dictionary cannot be loaded
 */
@Override
public List<AnalyzedToken> additionalTags(String word) {
  final IStemmer dictLookup;
  try {
    dictLookup = new DictionaryLookup(getDictionary());
  } catch (IOException e) {
    throw new RuntimeException("Could not load Catalan dictionary from " + getFileName(), e);
  }
  final List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();

  // Any well-formed adverb with suffix -ment is tagged as an adverb (RG):
  // feminine singular adjective or feminine singular participle + -ment.
  if (word.endsWith("ment")) {
    final String lowerWord = word.toLowerCase(conversionLocale);
    final String possibleAdj = lowerWord.replaceAll("^(.+)ment$", "$1");
    final List<AnalyzedToken> taggerTokens =
        asAnalyzedTokenList(possibleAdj, dictLookup.lookup(possibleAdj));
    for (AnalyzedToken taggerToken : taggerTokens) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null && ADJ_PART_FS.matcher(posTag).matches()) {
        // One adverb reading is enough; the lemma is the lowercased word itself.
        additionalTaggedTokens.add(new AnalyzedToken(word, "RG", lowerWord));
        return additionalTaggedTokens;
      }
    }
  }

  // Any well-formed verb with prefixes is tagged as a verb copying the original tags.
  final Matcher matcher = PREFIXES_FOR_VERBS.matcher(word);
  if (matcher.matches()) {
    // Lowercase with conversionLocale, like every other case conversion in this
    // tagger; the no-arg toLowerCase() depends on the JVM default locale (e.g. a
    // Turkish default maps 'I' to dotless 'ı' and breaks the lookup and the lemma).
    final String prefix = matcher.group(1).toLowerCase(conversionLocale);
    final String possibleVerb = matcher.group(2).toLowerCase(conversionLocale);
    final List<AnalyzedToken> taggerTokens =
        asAnalyzedTokenList(possibleVerb, dictLookup.lookup(possibleVerb));
    for (AnalyzedToken taggerToken : taggerTokens) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null && VERB.matcher(posTag).matches()) {
        final String lemma = prefix.concat(taggerToken.getLemma());
        additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
      }
    }
    // Returned even when empty: a prefix match ends the search.
    return additionalTaggedTokens;
  }

  // Any well-formed noun with prefix ex- is tagged as a noun copying the original tags.
  if (word.startsWith("ex")) {
    final String lowerWord = word.toLowerCase(conversionLocale);
    final String possibleNoun = lowerWord.replaceAll("^ex(.+)$", "$1");
    final List<AnalyzedToken> taggerTokens =
        asAnalyzedTokenList(possibleNoun, dictLookup.lookup(possibleNoun));
    for (AnalyzedToken taggerToken : taggerTokens) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null && NOUN.matcher(posTag).matches()) {
        final String lemma = "ex".concat(taggerToken.getLemma());
        additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
      }
    }
    // Returned even when empty: an "ex" prefix ends the search.
    return additionalTaggedTokens;
  }

  // Interpret deprecated characters of "ela geminada":
  // U+013F LATIN CAPITAL LETTER L WITH MIDDLE DOT
  // U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT
  if (word.contains("\u0140") || word.contains("\u013f")) {
    // The word is lowercased before the replacement, so only the small form
    // (U+0140) is replaced explicitly.
    final String lowerWord = word.toLowerCase(conversionLocale);
    final String possibleWord = lowerWord.replace("\u0140", "l·");
    return asAnalyzedTokenList(word, dictLookup.lookup(possibleWord));
  }

  // No rule applied.
  return null;
}