/** Returns true iff the token at the given position should be ignored by the spell checker. */
protected boolean ignoreToken(AnalyzedTokenReadings[] tokens, int idx) throws IOException {
  // Collect the plain surface strings so ignoreWord() can inspect the whole context.
  final List<String> words = new ArrayList<>(tokens.length);
  for (final AnalyzedTokenReadings reading : tokens) {
    words.add(reading.getToken());
  }
  return ignoreWord(words, idx);
}
/**
 * Checks whether one of the two given tokens is a finite verb matching the given person and
 * number. If a finite verb is found, it is stored in {@code finiteVerb} as a side effect.
 *
 * @return false only if a finite verb was found among the two tokens and neither of them
 *     matches the given person and number; "und" and "," always yield true
 */
private boolean verbDoesMatchPersonAndNumber(
    final AnalyzedTokenReadings token1,
    final AnalyzedTokenReadings token2,
    final String person,
    final String number) {
  final String word1 = token1.getToken();
  final String word2 = token2.getToken();
  // Coordinations ("und") and commas make the subject ambiguous — never flag these.
  if (",".equals(word1) || "und".equals(word1) || ",".equals(word2) || "und".equals(word2)) {
    return true;
  }
  final String wantedTag = ":" + person + ":" + number;
  boolean foundFiniteVerb = false;
  if (isFiniteVerb(token1)) {
    foundFiniteVerb = true;
    finiteVerb = token1;  // remember the verb for later use by the caller
    if (token1.hasPartialPosTag(wantedTag)) {
      return true;
    }
  }
  if (isFiniteVerb(token2)) {
    foundFiniteVerb = true;
    finiteVerb = token2;
    if (token2.hasPartialPosTag(wantedTag)) {
      return true;
    }
  }
  // No finite verb at all -> nothing that could disagree.
  return !foundFiniteVerb;
}
/** Creates a rule match reporting a person/number disagreement at the given verb token. */
private RuleMatch ruleMatchWrongVerb(final AnalyzedTokenReadings token) {
  final int start = token.getStartPos();
  final int end = start + token.getToken().length();
  final String msg =
      "Möglicherweise fehlende grammatische Übereinstimmung zwischen Subjekt und Prädikat ("
          + token.getToken()
          + ") bezüglich Person oder Numerus (Einzahl, Mehrzahl - Beispiel: "
          + "'Max bist' statt 'Max ist').";
  return new RuleMatch(this, start, end, msg);
}
/**
 * Returns the lemma of the first reading of {@code word} that is tagged as a third person
 * singular verb, or {@code null} if no such reading exists.
 */
@Nullable
private String baseForThirdPersonSingularVerb(String word) throws IOException {
  for (final AnalyzedTokenReadings reading : tagger.tag(Collections.singletonList(word))) {
    if (reading.hasPartialPosTag("VER:3:SIN:")) {
      return reading.getReadings().get(0).getLemma();
    }
  }
  return null;
}
/**
 * Joins the POS tags of all readings of the given token with '+'.
 *
 * @return the concatenated POS tags, or an empty string for a whitespace token
 */
private static String getPOS(final AnalyzedTokenReadings atr) {
  // The whitespace check does not depend on the loop index — hoist it out of the loop
  // instead of re-evaluating it on every iteration.
  if (atr.isWhitespace()) {
    return "";
  }
  final StringBuilder sb = new StringBuilder();
  final int readNum = atr.getReadingsLength();
  for (int i = 0; i < readNum; i++) {
    sb.append(atr.getAnalyzedToken(i).getPOSTag());
    if (i != readNum - 1) {
      sb.append('+');
    }
  }
  return sb.toString();
}
/**
 * Flags a sentence whose first word repeats the first word of the preceding sentence(s).
 * State is carried across calls in the {@code lastToken}/{@code beforeLastToken} fields,
 * so this rule is order-dependent over the sentences of a text.
 */
@Override
public RuleMatch[] match(final AnalyzedSentence text) {
  final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
  final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
  if (tokens.length > 3) {
    // NOTE(review): tokens[1] is treated as the first real word — presumably tokens[0] is a
    // sentence-start marker; confirm against getTokensWithoutWhitespace().
    final AnalyzedTokenReadings analyzedToken = tokens[1];
    final String token = analyzedToken.getToken();
    // avoid "..." etc. to be matched:
    boolean isWord = true;
    if (token.length() == 1) {
      final char c = token.charAt(0);
      if (!Character.isLetter(c)) {
        isWord = false;
      }
    }
    // Only report when the same word started the previous sentence and none of the first
    // three tokens is a known exception.
    if (isWord
        && lastToken.equals(token)
        && !isException(token)
        && !isException(tokens[2].getToken())
        && !isException(tokens[3].getToken())) {
      final String shortMsg;
      if (isAdverb(analyzedToken)) {
        // repeated sentence-initial adverb (two sentences in a row suffice)
        shortMsg = messages.getString("desc_repetition_beginning_adv");
      } else if (beforeLastToken.equals(token)) {
        // same non-adverb word started the last three sentences
        shortMsg = messages.getString("desc_repetition_beginning_word");
      } else {
        shortMsg = "";
      }
      if (!shortMsg.equals("")) {
        final String msg = shortMsg + " " + messages.getString("desc_repetition_beginning_thesaurus");
        final int startPos = analyzedToken.getStartPos();
        final int endPos = startPos + token.length();
        final RuleMatch ruleMatch = new RuleMatch(this, startPos, endPos, msg, shortMsg);
        ruleMatches.add(ruleMatch);
      }
    }
    // shift the history window: remember the sentence-initial words for the next call
    beforeLastToken = lastToken;
    lastToken = token;
  }
  // TODO should we ignore repetitions involving multiple paragraphs?
  // if (tokens[tokens.length - 1].isParaEnd()) beforeLastToken = "";
  return toRuleMatchArray(ruleMatches);
}
/** * @return true if the verb @param token (if it is a verb) matches @param person and @param * number, and matches no other person/number */ private boolean hasUnambiguouslyPersonAndNumber( final AnalyzedTokenReadings tokenReadings, final String person, final String number) { if (tokenReadings.getToken().length() == 0 || (Character.isUpperCase(tokenReadings.getToken().charAt(0)) && !(tokenReadings.getStartPos() == 0)) || !tokenReadings.hasPartialPosTag("VER")) return false; for (AnalyzedToken analyzedToken : tokenReadings) { final String postag = analyzedToken.getPOSTag(); if (postag.contains("_END")) // ignore SENT_END and PARA_END continue; if (!postag.contains(":" + person + ":" + number)) return false; } // for each reading return true; }
/** Returns true if the given token is one of the known adverbs. */
@Override
protected boolean isAdverb(final AnalyzedTokenReadings token) {
  // Return the set-membership result directly instead of the if/return-true/return-false idiom.
  return ADVERBS.contains(token.getToken());
}
/** * @return a list of forms of @param verb which match @param expectedVerbPOS (person:number) * @param toUppercase true when the suggestions should be capitalized */ private List<String> getVerbSuggestions( final AnalyzedTokenReadings verb, final String expectedVerbPOS, final boolean toUppercase) { // find the first verb reading AnalyzedToken verbToken = new AnalyzedToken("", "", ""); for (AnalyzedToken token : verb.getReadings()) { if (token.getPOSTag().startsWith("VER:")) { verbToken = token; break; } } try { String[] synthesized = german.getSynthesizer().synthesize(verbToken, "VER.*:" + expectedVerbPOS + ".*", true); // remove duplicates Set<String> suggestionSet = new HashSet<>(); suggestionSet.addAll(Arrays.asList(synthesized)); List<String> suggestions = new ArrayList<>(); suggestions.addAll(suggestionSet); if (toUppercase) { for (int i = 0; i < suggestions.size(); ++i) { suggestions.set(i, StringTools.uppercaseFirstChar(suggestions.get(i))); } } Collections.sort(suggestions); return suggestions; } catch (IOException e) { throw new RuntimeException(e); } }
/**
 * Returns true if the given token is a finite verb (carries a person tag) and is not a
 * participle (PA2), pronoun (PRO:) or number (ZAL).
 */
private boolean isFiniteVerb(final AnalyzedTokenReadings token) {
  final String word = token.getToken();
  if (word.length() == 0) {
    return false;
  }
  // A capitalized word that is not sentence-initial is most likely a noun, not a verb.
  if (Character.isUpperCase(word.charAt(0)) && token.getStartPos() != 0) {
    return false;
  }
  if (!token.hasPartialPosTag("VER")
      || token.hasPartialPosTag("PA2")
      || token.hasPartialPosTag("PRO:")
      || token.hasPartialPosTag("ZAL")) {
    return false;
  }
  // finite = tagged with some grammatical person
  return token.hasPartialPosTag(":1:")
      || token.hasPartialPosTag(":2:")
      || token.hasPartialPosTag(":3:");
}
/** Creates a rule match spanning subject and verb, regardless of which one comes first. */
private RuleMatch ruleMatchWrongVerbSubject(
    final AnalyzedTokenReadings subject, final AnalyzedTokenReadings verb) {
  final String msg =
      "Möglicherweise fehlende grammatische Übereinstimmung zwischen Subjekt ("
          + subject.getToken()
          + ") und Prädikat ("
          + verb.getToken()
          + ") bezüglich Person oder Numerus (Einzahl, Mehrzahl - Beispiel: "
          + "'ich sind' statt 'ich bin').";
  // The match runs from the earlier token's start to the later token's end.
  final AnalyzedTokenReadings first;
  final AnalyzedTokenReadings second;
  if (subject.getStartPos() < verb.getStartPos()) {
    first = subject;
    second = verb;
  } else {
    first = verb;
    second = subject;
  }
  return new RuleMatch(
      this, first.getStartPos(), second.getStartPos() + second.getToken().length(), msg);
}
/** Returns true if at least one reading of the token carries an acceptable POS tag. */
@Override
protected boolean isTagged(AnalyzedTokenReadings tokenReadings) {
  for (AnalyzedToken reading : tokenReadings.getReadings()) {
    if (isGoodPosTag(reading.getPOSTag())) {
      return true;
    }
  }
  return false;
}
/** * @return a list of pronouns which match the person and number of @param verb * @param toUppercase true when the suggestions should be capitalized */ private List<String> getPronounSuggestions( final AnalyzedTokenReadings verb, final boolean toUppercase) { List<String> result = new ArrayList<>(); if (verb.hasPartialPosTag(":1:SIN")) { result.add("ich"); } if (verb.hasPartialPosTag(":2:SIN")) { result.add("du"); } if (verb.hasPartialPosTag(":3:SIN")) { result.add("er"); result.add("sie"); result.add("es"); } if (verb.hasPartialPosTag(":1:PLU")) { result.add("wir"); } if (verb.hasPartialPosTag(":2:PLU")) { result.add("ihr"); } if (verb.hasPartialPosTag(":3:PLU") && !result.contains("sie")) { // do not add "sie" twice result.add("sie"); } if (toUppercase) { for (int i = 0; i < result.size(); ++i) { result.set(i, StringTools.uppercaseFirstChar(result.get(i))); } } return result; }
/**
 * Builds the multi-token candidate strings (the first 2..n tokens of the queue, space-joined)
 * that are later checked against the list of incorrect compounds.
 *
 * @param prevTokens the sliding window of preceding tokens
 * @param stringsToCheck out-parameter: receives the normalized candidate strings
 * @param origStringsToCheck out-parameter: receives the original-spelling candidates (trimmed)
 * @return a map from each normalized candidate to the token at which that candidate ends;
 *     on collision the first insertion wins
 */
private Map<String, AnalyzedTokenReadings> getStringToTokenMap(
    Queue<AnalyzedTokenReadings> prevTokens,
    List<String> stringsToCheck,
    List<String> origStringsToCheck) {
  StringBuilder sb = new StringBuilder();
  Map<String, AnalyzedTokenReadings> stringToToken = new HashMap<>();
  int j = 0;
  for (AnalyzedTokenReadings atr : prevTokens) {
    sb.append(' ');
    sb.append(atr.getToken());
    // single tokens are not candidates — only sequences of two or more tokens
    if (j >= 1) {
      String stringToCheck = normalize(sb.toString());
      stringsToCheck.add(stringToCheck);
      origStringsToCheck.add(sb.toString().trim());
      // putIfAbsent replaces the containsKey+put pair; values are never null here
      stringToToken.putIfAbsent(stringToCheck, atr);
    }
    j++;
  }
  return stringToToken;
}
/**
 * Finds token sequences that are listed as incorrect compounds and suggests hyphenated and/or
 * merged spellings. A sliding window of up to MAX_TERMS previous tokens is maintained; the
 * loop runs MAX_TERMS-1 iterations past the end so windows ending at the last token are
 * still checked.
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
  List<RuleMatch> ruleMatches = new ArrayList<>();
  AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  RuleMatch prevRuleMatch = null;
  Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<>(MAX_TERMS);
  for (int i = 0; i < tokens.length + MAX_TERMS - 1; i++) {
    AnalyzedTokenReadings token;
    // we need to extend the token list so we find matches at the end of the original list:
    if (i >= tokens.length) {
      token =
          new AnalyzedTokenReadings(
              new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
    } else {
      token = tokens[i];
    }
    if (i == 0) {
      addToQueue(token, prevTokens);
      continue;
    }
    if (token.isImmunized()) {
      // NOTE(review): immunized tokens are skipped entirely and not added to the queue
      continue;
    }
    // the match (if any) starts at the oldest token still in the window
    AnalyzedTokenReadings firstMatchToken = prevTokens.peek();
    List<String> stringsToCheck = new ArrayList<>();
    List<String> origStringsToCheck = new ArrayList<>(); // original upper/lowercase spelling
    Map<String, AnalyzedTokenReadings> stringToToken =
        getStringToTokenMap(prevTokens, stringsToCheck, origStringsToCheck);
    // iterate backwards over all potentially incorrect strings to make
    // sure we match longer strings first:
    for (int k = stringsToCheck.size() - 1; k >= 0; k--) {
      String stringToCheck = stringsToCheck.get(k);
      String origStringToCheck = origStringsToCheck.get(k);
      if (getCompoundRuleData().getIncorrectCompounds().contains(stringToCheck)) {
        AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
        String msg = null;
        List<String> replacement = new ArrayList<>();
        // suggestion 1: hyphenated spelling (unless explicitly excluded)
        if (!getCompoundRuleData().getNoDashSuggestion().contains(stringToCheck)) {
          replacement.add(origStringToCheck.replace(' ', '-'));
          msg = withHyphenMessage;
        }
        // suggestion 2: merged spelling (not for all-uppercase, not when dash-only)
        if (isNotAllUppercase(origStringToCheck)
            && !getCompoundRuleData().getOnlyDashSuggestion().contains(stringToCheck)) {
          replacement.add(mergeCompound(origStringToCheck));
          msg = withoutHyphenMessage;
        }
        String[] parts = stringToCheck.split(" ");
        if (parts.length > 0 && parts[0].length() == 1) {
          // single-letter first part (e.g. "E Mail"): only the hyphenated form makes sense
          replacement.clear();
          replacement.add(origStringToCheck.replace(' ', '-'));
          msg = withHyphenMessage;
        } else if (replacement.isEmpty() || replacement.size() == 2) { // isEmpty shouldn't happen
          msg = withOrWithoutHyphenMessage;
        }
        RuleMatch ruleMatch =
            new RuleMatch(this, firstMatchToken.getStartPos(), atr.getEndPos(), msg, shortDesc);
        ruleMatch.setSuggestedReplacements(replacement);
        // avoid duplicate matches:
        if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
          prevRuleMatch = ruleMatch;
          break;
        }
        prevRuleMatch = ruleMatch;
        ruleMatches.add(ruleMatch);
        break;
      }
    }
    addToQueue(token, prevTokens);
  }
  return toRuleMatchArray(ruleMatches);
}
/**
 * Finds token sequences that are listed as incorrect compounds and suggests hyphenated and/or
 * merged spellings. A sliding window of up to MAX_TERMS previous tokens is maintained; the
 * loop runs MAX_TERMS-1 iterations past the end so windows ending at the last token are
 * still checked.
 */
@Override
public RuleMatch[] match(final AnalyzedSentence sentence) {
  final List<RuleMatch> ruleMatches = new ArrayList<>();
  final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  RuleMatch prevRuleMatch = null;
  final Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<>(MAX_TERMS);
  for (int i = 0; i < tokens.length + MAX_TERMS - 1; i++) {
    final AnalyzedTokenReadings token;
    // we need to extend the token list so we find matches at the end of the original list:
    if (i >= tokens.length) {
      token =
          new AnalyzedTokenReadings(
              new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
    } else {
      token = tokens[i];
    }
    if (i == 0) {
      addToQueue(token, prevTokens);
      continue;
    }
    // build all multi-token candidate strings from the current window
    final StringBuilder sb = new StringBuilder();
    int j = 0;
    AnalyzedTokenReadings firstMatchToken = null;
    final List<String> stringsToCheck = new ArrayList<>();
    final List<String> origStringsToCheck = new ArrayList<>(); // original upper/lowercase spelling
    final Map<String, AnalyzedTokenReadings> stringToToken = new HashMap<>();
    for (AnalyzedTokenReadings atr : prevTokens) {
      if (j == 0) {
        // the match (if any) starts at the oldest token still in the window
        firstMatchToken = atr;
      }
      sb.append(' ');
      sb.append(atr.getToken());
      if (j >= 1) {
        // only sequences of two or more tokens are candidates
        final String stringToCheck = normalize(sb.toString());
        stringsToCheck.add(stringToCheck);
        origStringsToCheck.add(sb.toString().trim());
        if (!stringToToken.containsKey(stringToCheck)) {
          stringToToken.put(stringToCheck, atr);
        }
      }
      j++;
    }
    // iterate backwards over all potentially incorrect strings to make
    // sure we match longer strings first:
    for (int k = stringsToCheck.size() - 1; k >= 0; k--) {
      final String stringToCheck = stringsToCheck.get(k);
      final String origStringToCheck = origStringsToCheck.get(k);
      if (incorrectCompounds.contains(stringToCheck)) {
        final AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
        String msg = null;
        final List<String> replacement = new ArrayList<>();
        // suggestion 1: hyphenated spelling (unless explicitly excluded)
        if (!noDashSuggestion.contains(stringToCheck)) {
          replacement.add(origStringToCheck.replace(' ', '-'));
          msg = withHyphenMessage;
        }
        // suggestion 2: merged spelling (not for all-uppercase, not when dash-only)
        if (isNotAllUppercase(origStringToCheck) && !onlyDashSuggestion.contains(stringToCheck)) {
          replacement.add(mergeCompound(origStringToCheck));
          msg = withoutHyphenMessage;
        }
        final String[] parts = stringToCheck.split(" ");
        if (parts.length > 0 && parts[0].length() == 1) {
          // single-letter first part (e.g. "E Mail"): only the hyphenated form makes sense
          replacement.clear();
          replacement.add(origStringToCheck.replace(' ', '-'));
          msg = withHyphenMessage;
        } else if (replacement.isEmpty() || replacement.size() == 2) { // isEmpty shouldn't happen
          msg = withOrWithoutHyphenMessage;
        }
        final RuleMatch ruleMatch =
            new RuleMatch(
                this,
                firstMatchToken.getStartPos(),
                atr.getStartPos() + atr.getToken().length(),
                msg,
                shortDesc);
        // avoid duplicate matches:
        if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
          prevRuleMatch = ruleMatch;
          break;
        }
        prevRuleMatch = ruleMatch;
        ruleMatch.setSuggestedReplacements(replacement);
        ruleMatches.add(ruleMatch);
        break;
      }
    }
    addToQueue(token, prevTokens);
  }
  return toRuleMatchArray(ruleMatches);
}
/**
 * Creates a rule match for a subject/verb agreement error, spanning both tokens, with
 * suggestions that either correct the verb form (keeping the subject) or replace the subject
 * with a pronoun that agrees with the verb (keeping the verb).
 *
 * @param expectedVerbPOS the wanted "person:number" tag fragment for verb suggestions
 */
private RuleMatch ruleMatchWrongVerbSubject(
    final AnalyzedTokenReadings subject,
    final AnalyzedTokenReadings verb,
    final String expectedVerbPOS) {
  final String msg =
      "Möglicherweise fehlende grammatische Übereinstimmung zwischen Subjekt ("
          + subject.getToken()
          + ") und Prädikat ("
          + verb.getToken()
          + ") bezüglich Person oder Numerus (Einzahl, Mehrzahl - Beispiel: "
          + "'ich sind' statt 'ich bin').";
  List<String> suggestions = new ArrayList<>();
  List<String> verbSuggestions = new ArrayList<>();
  List<String> pronounSuggestions = new ArrayList<>();
  RuleMatch ruleMatch;
  if (subject.getStartPos() < verb.getStartPos()) {
    // subject precedes verb: suggestions keep "subject verb" order
    ruleMatch =
        new RuleMatch(
            this, subject.getStartPos(), verb.getStartPos() + verb.getToken().length(), msg);
    // the subject stays as-is, so the verb suggestion is never capitalized
    verbSuggestions.addAll(getVerbSuggestions(verb, expectedVerbPOS, false));
    for (String verbSuggestion : verbSuggestions) {
      suggestions.add(subject.getToken() + " " + verbSuggestion);
    }
    // a replacement pronoun takes the subject's position, so it inherits its capitalization
    pronounSuggestions.addAll(
        getPronounSuggestions(verb, Character.isUpperCase(subject.getToken().charAt(0))));
    for (String pronounSuggestion : pronounSuggestions) {
      suggestions.add(pronounSuggestion + " " + verb.getToken());
    }
    ruleMatch.setSuggestedReplacements(suggestions);
  } else {
    // verb precedes subject: suggestions keep "verb subject" order
    ruleMatch =
        new RuleMatch(
            this, verb.getStartPos(), subject.getStartPos() + subject.getToken().length(), msg);
    // the suggested verb takes the verb's position, so it inherits its capitalization
    verbSuggestions.addAll(
        getVerbSuggestions(
            verb, expectedVerbPOS, Character.isUpperCase(verb.getToken().charAt(0))));
    for (String verbSuggestion : verbSuggestions) {
      suggestions.add(verbSuggestion + " " + subject.getToken());
    }
    // a replacement pronoun follows the verb, so it is always lowercase
    pronounSuggestions.addAll(getPronounSuggestions(verb, false));
    for (String pronounSuggestion : pronounSuggestions) {
      suggestions.add(verb.getToken() + " " + pronounSuggestion);
    }
    ruleMatch.setSuggestedReplacements(suggestions);
  }
  return ruleMatch;
}
/** Returns true if the token is one of the recognized quotation-mark characters. */
private boolean isQuotationMark(final AnalyzedTokenReadings token) {
  final String word = token.getToken();
  return QUOTATION_MARKS.contains(word);
}