/** * @return a list of forms of @param verb which match @param expectedVerbPOS (person:number) * @param toUppercase true when the suggestions should be capitalized */ private List<String> getVerbSuggestions( final AnalyzedTokenReadings verb, final String expectedVerbPOS, final boolean toUppercase) { // find the first verb reading AnalyzedToken verbToken = new AnalyzedToken("", "", ""); for (AnalyzedToken token : verb.getReadings()) { if (token.getPOSTag().startsWith("VER:")) { verbToken = token; break; } } try { String[] synthesized = german.getSynthesizer().synthesize(verbToken, "VER.*:" + expectedVerbPOS + ".*", true); // remove duplicates Set<String> suggestionSet = new HashSet<>(); suggestionSet.addAll(Arrays.asList(synthesized)); List<String> suggestions = new ArrayList<>(); suggestions.addAll(suggestionSet); if (toUppercase) { for (int i = 0; i < suggestions.size(); ++i) { suggestions.set(i, StringTools.uppercaseFirstChar(suggestions.get(i))); } } Collections.sort(suggestions); return suggestions; } catch (IOException e) { throw new RuntimeException(e); } }
/**
 * Reports whether at least one reading of the token carries a POS tag that
 * {@code isGoodPosTag} accepts.
 *
 * @param tokenReadings the readings to inspect
 * @return true as soon as one reading has an accepted POS tag, false if none has
 */
@Override
protected boolean isTagged(AnalyzedTokenReadings tokenReadings) {
  boolean tagged = false;
  for (AnalyzedToken reading : tokenReadings.getReadings()) {
    if (isGoodPosTag(reading.getPOSTag())) {
      // one accepted tag is enough; the remaining readings need not be checked
      tagged = true;
      break;
    }
  }
  return tagged;
}
/** * @return true if the verb @param token (if it is a verb) matches @param person and @param * number, and matches no other person/number */ private boolean hasUnambiguouslyPersonAndNumber( final AnalyzedTokenReadings tokenReadings, final String person, final String number) { if (tokenReadings.getToken().length() == 0 || (Character.isUpperCase(tokenReadings.getToken().charAt(0)) && !(tokenReadings.getStartPos() == 0)) || !tokenReadings.hasPartialPosTag("VER")) return false; for (AnalyzedToken analyzedToken : tokenReadings) { final String postag = analyzedToken.getPOSTag(); if (postag.contains("_END")) // ignore SENT_END and PARA_END continue; if (!postag.contains(":" + person + ":" + number)) return false; } // for each reading return true; }
/**
 * Derives tags for a word from its shape by looking up a related form in the dictionary.
 *
 * <p>The rules are tried in this order, and the first one whose shape test succeeds decides the
 * result:
 * <ol>
 *   <li>word ending in "ment": tagged {@code RG} if the part before "ment" has a reading whose
 *       POS tag matches {@code ADJ_PART_FS}; otherwise the next rules are tried;</li>
 *   <li>word matching {@code PREFIXES_FOR_VERBS}: one token per reading of the second group
 *       whose POS tag matches {@code VERB}, with the lowercased prefix prepended to the lemma;</li>
 *   <li>word starting with "ex": one token per reading of the remainder whose POS tag matches
 *       {@code NOUN}, with "ex" prepended to the lemma;</li>
 *   <li>word containing U+013F or U+0140: the readings of the word with the deprecated
 *       character replaced by "l" followed by a middle dot.</li>
 * </ol>
 *
 * @param word the word to derive tags for
 * @return the derived tokens (possibly an empty list for rules 2 and 3), or {@code null} when
 *     no rule applies
 * @throws RuntimeException if the dictionary cannot be loaded
 */
@Override
public List<AnalyzedToken> additionalTags(String word) {
  final IStemmer dictLookup;
  try {
    dictLookup = new DictionaryLookup(getDictionary());
  } catch (IOException e) {
    throw new RuntimeException("Could not load Catalan dictionary from " + getFileName(), e);
  }
  final List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();

  // Any well-formed adverb with suffix -ment is tagged as an adverb (RG)
  // Feminine singular adjective or feminine singular participle + -ment
  if (word.endsWith("ment")) {
    final String lowerWord = word.toLowerCase(conversionLocale);
    final String possibleAdj = lowerWord.replaceAll("^(.+)ment$", "$1");
    final List<AnalyzedToken> taggerTokens =
        asAnalyzedTokenList(possibleAdj, dictLookup.lookup(possibleAdj));
    for (AnalyzedToken taggerToken : taggerTokens) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null && ADJ_PART_FS.matcher(posTag).matches()) {
        // a single matching reading is enough to tag the word as an adverb
        additionalTaggedTokens.add(new AnalyzedToken(word, "RG", lowerWord));
        return additionalTaggedTokens;
      }
    }
  }

  // Any well-formed verb with prefixes is tagged as a verb copying the original tags
  final Matcher matcher = PREFIXES_FOR_VERBS.matcher(word);
  if (matcher.matches()) {
    // lowercase with the tagger's locale, like the other rules, not the JVM default locale
    final String prefix = matcher.group(1).toLowerCase(conversionLocale);
    final String possibleVerb = matcher.group(2).toLowerCase(conversionLocale);
    final List<AnalyzedToken> taggerTokens =
        asAnalyzedTokenList(possibleVerb, dictLookup.lookup(possibleVerb));
    for (AnalyzedToken taggerToken : taggerTokens) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null && VERB.matcher(posTag).matches()) {
        final String lemma = prefix.concat(taggerToken.getLemma());
        additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
      }
    }
    // NOTE(review): returns even when no verb reading was found, so a word with this shape
    // never reaches the rules below - confirm this is intended
    return additionalTaggedTokens;
  }

  // Any well-formed noun with prefix ex- is tagged as a noun copying the original tags
  if (word.startsWith("ex")) {
    final String lowerWord = word.toLowerCase(conversionLocale);
    final String possibleNoun = lowerWord.replaceAll("^ex(.+)$", "$1");
    final List<AnalyzedToken> taggerTokens =
        asAnalyzedTokenList(possibleNoun, dictLookup.lookup(possibleNoun));
    for (AnalyzedToken taggerToken : taggerTokens) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null && NOUN.matcher(posTag).matches()) {
        final String lemma = "ex".concat(taggerToken.getLemma());
        additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
      }
    }
    // NOTE(review): as above, this returns even when the list is empty, so the
    // "ela geminada" rule is never tried for words starting with "ex" - confirm intended
    return additionalTaggedTokens;
  }

  // Interpret deprecated characters of "ela geminada"
  // U+013F LATIN CAPITAL LETTER L WITH MIDDLE DOT
  // U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT
  if (word.contains("\u0140") || word.contains("\u013f")) {
    // only the small letter is replaced: the word is lowercased first
    final String lowerWord = word.toLowerCase(conversionLocale);
    final String possibleWord = lowerWord.replace("\u0140", "l·");
    return asAnalyzedTokenList(word, dictLookup.lookup(possibleWord));
  }

  return null;
}