/** * @param args a map with values for {@code year}, {@code month}, {@code day} (day of month), * {@code weekDay} */ @Override public RuleMatch acceptRuleMatch( RuleMatch match, Map<String, String> args, AnalyzedTokenReadings[] patternTokens) { int dayOfWeekFromString = getDayOfWeek(getRequired("weekDay", args)); Calendar dateFromDate = getDate(args); int dayOfWeekFromDate; try { dayOfWeekFromDate = dateFromDate.get(Calendar.DAY_OF_WEEK); } catch (IllegalArgumentException ignore) { // happens with 'dates' like '32.8.2014' - those should be caught by a different rule return null; } if (dayOfWeekFromString != dayOfWeekFromDate) { String realDayName = getDayOfWeek(dateFromDate); String message = match.getMessage().replace("\\realDay", realDayName); RuleMatch newMatch = new RuleMatch( match.getRule(), match.getFromPos(), match.getToPos(), message, match.getShortMessage()); return newMatch; } else { return null; } }
/**
 * Searches Twitter for each command-line argument, prints the eligible tweets, and then runs
 * every collected tweet text through an American English {@code JLanguageTool} check, printing
 * each reported match together with its suggested replacements.
 *
 * @param args the search queries, one per argument
 * @throws TwitterException if a Twitter search fails
 * @throws IOException if the language check fails
 */
public static void main(String[] args) throws TwitterException, IOException {
  Twitter twitter = TwitterFactory.getSingleton();
  JLanguageTool langTool = new JLanguageTool(new AmericanEnglish());
  List<String> collectedTexts = new ArrayList<String>();

  for (String searchTerm : args) {
    Query query = new Query(searchTerm);
    int collected = 0;
    while (query != null) {
      QueryResult page = twitter.search(query);
      for (Status tweet : page.getTweets()) {
        if (!isEligible(tweet)) {
          continue;
        }
        System.out.println("@" + tweet.getUser().getScreenName() + " - " + tweet.getText());
        System.out.println(tweet.getLang());
        collectedTexts.add(tweet.getText());
        collected++;
      }
      // the next page is always requested; paging stops once at least 5 tweets were collected
      query = page.nextQuery();
      if (collected >= 5) {
        break;
      }
    }
  }

  for (String text : collectedTexts) {
    for (RuleMatch match : langTool.check(text)) {
      System.out.println(
          "Potential error at line "
              + match.getLine()
              + ", column "
              + match.getColumn()
              + ": "
              + match.getMessage());
      System.out.println("Suggested correction: " + match.getSuggestedReplacements());
    }
  }
}
/**
 * Asserts that the rule reports exactly one match for the sentence and, when expected
 * suggestions are given, that the match offers exactly those suggestions in that order.
 *
 * @param s the sentence to check
 * @param expectedSuggestions the expected replacements; suggestions are not checked when empty
 * @throws IOException if analyzing the sentence fails
 */
private void assertBad(String s, String... expectedSuggestions) throws IOException {
  RuleMatch[] found = rule.match(langTool.getAnalyzedSentence(s));
  assertEquals("Did not find one match in sentence '" + s + "'", 1, found.length);
  if (expectedSuggestions.length == 0) {
    return;
  }
  List<String> actual = found[0].getSuggestedReplacements();
  assertThat(actual, is(Arrays.asList(expectedSuggestions)));
}
/**
 * Builds the match for a subject/verb pair that does not agree in person or number.
 *
 * <p>The match spans from the start of whichever token comes first to the end of the other one.
 * Suggestions keep the original word order: first the variants with a replaced verb, then the
 * variants with a replaced pronoun.
 *
 * @param subject the subject token
 * @param verb the verb token
 * @param expectedVerbPOS passed on to {@code getVerbSuggestions} to select the verb forms
 * @return the match with its suggested replacements set
 */
private RuleMatch ruleMatchWrongVerbSubject(
    final AnalyzedTokenReadings subject,
    final AnalyzedTokenReadings verb,
    final String expectedVerbPOS) {
  final String subjectText = subject.getToken();
  final String verbText = verb.getToken();
  final String msg =
      "Möglicherweise fehlende grammatische Übereinstimmung zwischen Subjekt ("
          + subjectText
          + ") und Prädikat ("
          + verbText
          + ") bezüglich Person oder Numerus (Einzahl, Mehrzahl - Beispiel: "
          + "'ich sind' statt 'ich bin').";
  final List<String> suggestions = new ArrayList<>();
  final RuleMatch ruleMatch;

  if (subject.getStartPos() < verb.getStartPos()) {
    // subject comes first: "<subject> <verb>"
    ruleMatch =
        new RuleMatch(this, subject.getStartPos(), verb.getStartPos() + verbText.length(), msg);
    for (String verbSuggestion : getVerbSuggestions(verb, expectedVerbPOS, false)) {
      suggestions.add(subjectText + " " + verbSuggestion);
    }
    // a replacement pronoun takes over the capitalization of the subject it replaces
    final boolean subjectIsCapitalized = Character.isUpperCase(subjectText.charAt(0));
    for (String pronounSuggestion : getPronounSuggestions(verb, subjectIsCapitalized)) {
      suggestions.add(pronounSuggestion + " " + verbText);
    }
  } else {
    // verb comes first: "<verb> <subject>"
    ruleMatch =
        new RuleMatch(
            this, verb.getStartPos(), subject.getStartPos() + subjectText.length(), msg);
    // a replacement verb takes over the capitalization of the verb it replaces
    final boolean verbIsCapitalized = Character.isUpperCase(verbText.charAt(0));
    for (String verbSuggestion : getVerbSuggestions(verb, expectedVerbPOS, verbIsCapitalized)) {
      suggestions.add(verbSuggestion + " " + subjectText);
    }
    for (String pronounSuggestion : getPronounSuggestions(verb, false)) {
      suggestions.add(verbText + " " + pronounSuggestion);
    }
  }

  ruleMatch.setSuggestedReplacements(suggestions);
  return ruleMatch;
}
private void assertBad(String s, int n, String... expectedSuggestions) throws IOException { RuleMatch[] matches = rule.match(langTool.getAnalyzedSentence(s)); assertEquals("Did not find " + n + " match(es) in sentence '" + s + "'", n, matches.length); if (expectedSuggestions.length > 0) { RuleMatch match = matches[0]; // When two errors are reported by the rule (so TODO above), it might happen that the first // match does not have the suggestions, but the second one if (matches.length > 1 && match.getSuggestedReplacements().size() == 0) { match = matches[1]; } List<String> suggestions = match.getSuggestedReplacements(); assertThat(suggestions, is(Arrays.asList(expectedSuggestions))); } }
/**
 * Reports "wieder" used after a form of "spiegeln" where "wider" (as in "widerspiegeln") is
 * expected.
 *
 * <p>The sentence is scanned left to right while remembering whether a form of "spiegeln",
 * then "wieder", then "wider" has been seen. A match is created on the current token once the
 * verb and "wieder" were seen without "wider", unless "wider" is one of the next two tokens.
 *
 * @param sentence the sentence to check
 * @return the matches found, each suggesting "wider"
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
  final List<RuleMatch> ruleMatches = new ArrayList<>();
  final AnalyzedTokenReadings[] tokens = sentence.getTokens();
  final String[] verbForms = {"spiegelt", "spiegeln", "spiegelte", "spiegelten", "spiegelst"};
  boolean sawVerb = false;
  boolean sawWieder = false;
  boolean sawWider = false;

  for (int i = 0; i < tokens.length; i++) {
    final String token = tokens[i].getToken();
    if (token.trim().isEmpty()) {
      // whitespace-only and empty tokens do not influence the state
      continue;
    }

    boolean isVerbForm = false;
    for (String form : verbForms) {
      if (token.equalsIgnoreCase(form)) {
        isVerbForm = true;
        break;
      }
    }
    if (isVerbForm) {
      sawVerb = true;
    } else if (sawVerb && token.equalsIgnoreCase("wieder")) {
      sawWieder = true;
    } else if (sawVerb && token.equalsIgnoreCase("wider")) {
      sawWider = true;
    }

    if (!sawVerb || !sawWieder || sawWider) {
      continue;
    }
    // look ahead: a "wider" right after this token means the sentence is fine
    final boolean widerFollows =
        tokens.length > i + 2
            && (tokens[i + 1].getToken().equals("wider")
                || tokens[i + 2].getToken().equals("wider"));
    if (widerFollows) {
      continue;
    }

    final String shortMsg = "'wider' in 'widerspiegeln' wird mit 'i' geschrieben";
    final String msg =
        "'wider' in 'widerspiegeln' wird mit 'i' statt mit 'ie' "
            + "geschrieben, z.B. 'Das spiegelt die Situation gut wider.'";
    final int start = tokens[i].getStartPos();
    final RuleMatch ruleMatch = new RuleMatch(this, start, start + token.length(), msg, shortMsg);
    ruleMatch.setSuggestedReplacement("wider");
    ruleMatches.add(ruleMatch);
    // start over so that a later occurrence in the same sentence can be reported as well
    sawVerb = false;
    sawWieder = false;
    sawWider = false;
  }
  return toRuleMatchArray(ruleMatches);
}
@Override public RuleMatch[] match(AnalyzedSentence sentence) throws IOException { final List<RuleMatch> ruleMatches = new ArrayList<>(); if (needsInit) { init(); } if (hunspellDict == null) { // some languages might not have a dictionary, be silent about it return toRuleMatchArray(ruleMatches); } final String[] tokens = tokenizeText(getSentenceTextWithoutUrlsAndImmunizedTokens(sentence)); // starting with the first token to skip the zero-length START_SENT int len = sentence.getTokens()[1].getStartPos(); for (int i = 0; i < tokens.length; i++) { String word = tokens[i]; if (ignoreWord(Arrays.asList(tokens), i) || ignoreWord(word)) { len += word.length() + 1; continue; } if (isMisspelled(word)) { final RuleMatch ruleMatch = new RuleMatch( this, len, len + word.length(), messages.getString("spelling"), messages.getString("desc_spelling_short")); final List<String> suggestions = getSuggestions(word); suggestions.addAll(0, getAdditionalTopSuggestions(suggestions, word)); suggestions.addAll(getAdditionalSuggestions(suggestions, word)); if (!suggestions.isEmpty()) { filterSuggestions(suggestions); ruleMatch.setSuggestedReplacements(suggestions); } ruleMatches.add(ruleMatch); } len += word.length() + 1; } return toRuleMatchArray(ruleMatches); }
@Override public RuleMatch[] match(final AnalyzedSentence sentence) { final List<RuleMatch> ruleMatches = new ArrayList<>(); final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace(); String prevToken = null; for (int i = 0; i < tokens.length; i++) { final String token = tokens[i].getToken(); if (tokens[i].isWhitespace()) { // ignore continue; } if (prevToken != null && !prevToken.equals("-") && !prevToken.contains("--") && !prevToken.contains( "–-") // first char is some special kind of dash, found in Wikipedia && prevToken.endsWith("-")) { final char firstChar = token.charAt(0); if (Character.isUpperCase(firstChar)) { final String msg = "Möglicherweise fehlt ein 'und' oder ein Komma, oder es wurde nach dem Wort " + "ein überflüssiges Leerzeichen eingefügt. Eventuell haben Sie auch versehentlich einen Bindestrich statt eines Punktes eingefügt."; final RuleMatch ruleMatch = new RuleMatch( this, tokens[i - 1].getStartPos(), tokens[i - 1].getStartPos() + prevToken.length() + 1, msg); ruleMatch.setSuggestedReplacement(tokens[i - 1].getToken()); ruleMatches.add(ruleMatch); } } prevToken = token; } return toRuleMatchArray(ruleMatches); }
@Override public final RuleMatch[] match(final AnalyzedSentence text) { final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); if (tokens.length < 2) { return toRuleMatchArray(ruleMatches); } int matchTokenPos = 1; // 0 = SENT_START final String firstToken = tokens[matchTokenPos].getToken(); String secondToken = null; String thirdToken = null; // ignore quote characters: if (tokens.length >= 3 && ("'".equals(firstToken) || "\"".equals(firstToken) || "„".equals(firstToken))) { matchTokenPos = 2; secondToken = tokens[matchTokenPos].getToken(); } final String firstDutchToken = dutchSpecialCase(firstToken, secondToken, tokens); if (firstDutchToken != null) { thirdToken = firstDutchToken; matchTokenPos = 3; } String checkToken = firstToken; if (thirdToken != null) { checkToken = thirdToken; } else if (secondToken != null) { checkToken = secondToken; } String lastToken = tokens[tokens.length - 1].getToken(); if (lastToken.matches("[ \"'„»«“]") && tokens.length >= 2) { // ignore trailing whitespace or quote lastToken = tokens[tokens.length - 2].getToken(); } boolean preventError = false; // TODO: why do only *these* languages have that special case? final String langCode = language.getShortName(); final boolean languageHasSpecialCases = langCode.equals("ru") || langCode.equals("pl") || langCode.equals("uk") || langCode.equals("be") || langCode.equals(Locale.ENGLISH.getLanguage()) || langCode.equals(Locale.ITALIAN.getLanguage()) || langCode.equals(Locale.GERMAN.getLanguage()); if (languageHasSpecialCases) { // fix for lists; note - this will not always work for the last point in OOo, // as OOo might serve paragraphs in any order. 
if (";".equals(lastParagraphString) || ";".equals(lastToken) || ",".equals(lastParagraphString) || ",".equals(lastToken)) { preventError = true; } // fix for words in table (not sentences); note - this will not always work for the last point // in OOo, // as OOo might serve paragraphs in any order. if (!lastToken.matches("[.?!…]")) { preventError = true; } } lastParagraphString = lastToken; if (checkToken.length() > 0) { final char firstChar = checkToken.charAt(0); if (!preventError && Character.isLowerCase(firstChar)) { final RuleMatch ruleMatch = new RuleMatch( this, tokens[matchTokenPos].getStartPos(), tokens[matchTokenPos].getStartPos() + tokens[matchTokenPos].getToken().length(), messages.getString("incorrect_case")); ruleMatch.setSuggestedReplacement(StringTools.uppercaseFirstChar(checkToken)); ruleMatches.add(ruleMatch); } } return toRuleMatchArray(ruleMatches); }
/**
 * Reports a sentence whose first word starts with a lowercase letter.
 *
 * <p>A leading quote character is skipped, and {@code dutchSpecialCase} may redirect the check to
 * a later token. No error is reported when the previously checked sentence ended in "," or ";",
 * when neither the sentence-end pattern for the previous last token nor the one for the current
 * last token matches, when the sentence starts with an enumeration marker such as "a)" or "iv.",
 * or when the checked token is a URL. The last token of this sentence is remembered in
 * {@code lastParagraphString} for the next call.
 *
 * @param sentence the sentence to check
 * @return zero or one match, suggesting the checked token with an uppercase first character
 */
@Override
public final RuleMatch[] match(final AnalyzedSentence sentence) {
  final List<RuleMatch> ruleMatches = new ArrayList<>();
  final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  if (tokens.length < 2) {
    return toRuleMatchArray(ruleMatches);
  }

  int matchTokenPos = 1; // 0 = SENT_START
  final String firstToken = tokens[matchTokenPos].getToken();
  String secondToken = null;
  // ignore quote characters:
  final boolean startsWithQuote =
      "'".equals(firstToken) || "\"".equals(firstToken) || "„".equals(firstToken);
  if (tokens.length >= 3 && startsWithQuote) {
    matchTokenPos = 2;
    secondToken = tokens[matchTokenPos].getToken();
  }
  final String firstDutchToken = dutchSpecialCase(firstToken, secondToken, tokens);
  if (firstDutchToken != null) {
    matchTokenPos = 3;
  }
  // precedence: token from the Dutch special case, then token after a quote, then first token
  final String checkToken =
      firstDutchToken != null ? firstDutchToken : (secondToken != null ? secondToken : firstToken);

  String lastToken = tokens[tokens.length - 1].getToken();
  if (tokens.length >= 2 && WHITESPACE_OR_QUOTE.matcher(lastToken).matches()) {
    // ignore trailing whitespace or quote
    lastToken = tokens[tokens.length - 2].getToken();
  }

  final boolean afterListSeparator =
      lastParagraphString.equals(",") || lastParagraphString.equals(";");
  final boolean noSentenceEnd =
      !SENTENCE_END1.matcher(lastParagraphString).matches()
          && !SENTENCE_END2.matcher(lastToken).matches();
  // the previous value has been fully consulted above, so it can be replaced now
  lastParagraphString = lastToken;

  // allows enumeration with lowercase letters: a), iv., etc.
  boolean isEnumeration = false;
  if (matchTokenPos + 1 < tokens.length
      && NUMERALS_EN.matcher(tokens[matchTokenPos].getToken()).matches()) {
    final String following = tokens[matchTokenPos + 1].getToken();
    isEnumeration = following.equals(".") || following.equals(")");
  }
  final boolean startsWithUrl = isUrl(checkToken);

  final boolean preventError =
      afterListSeparator || noSentenceEnd || isEnumeration || startsWithUrl;

  if (checkToken.length() > 0
      && !preventError
      && Character.isLowerCase(checkToken.charAt(0))) {
    final AnalyzedTokenReadings matchToken = tokens[matchTokenPos];
    final RuleMatch ruleMatch =
        new RuleMatch(
            this,
            matchToken.getStartPos(),
            matchToken.getStartPos() + matchToken.getToken().length(),
            messages.getString("incorrect_case"));
    ruleMatch.setSuggestedReplacement(StringTools.uppercaseFirstChar(checkToken));
    ruleMatches.add(ruleMatch);
  }
  return toRuleMatchArray(ruleMatches);
}
/**
 * Reports sequences of consecutive tokens that are listed in {@code incorrectCompounds} and
 * suggests writing them with hyphens and/or as a single merged word.
 *
 * <p>A queue holding at most {@code MAX_TERMS} preceding tokens is kept. On every iteration the
 * queued tokens are joined with spaces into progressively longer candidate strings, all starting
 * at the head of the queue, and the longest candidate found in {@code incorrectCompounds}
 * produces the match.
 *
 * @param sentence the sentence to check
 * @return the matches found
 */
@Override
public RuleMatch[] match(final AnalyzedSentence sentence) {
  final List<RuleMatch> ruleMatches = new ArrayList<>();
  final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  RuleMatch prevRuleMatch = null;
  // window of the most recent tokens; NOTE(review): addToQueue presumably drops the oldest entry
  // when the queue is full - confirm against its implementation
  final Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<>(MAX_TERMS);
  for (int i = 0; i < tokens.length + MAX_TERMS - 1; i++) {
    final AnalyzedTokenReadings token;
    // we need to extend the token list so we find matches at the end of the original list:
    if (i >= tokens.length) {
      // padding token: empty text, positioned at the start of the current queue head
      token =
          new AnalyzedTokenReadings(
              new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
    } else {
      token = tokens[i];
    }
    if (i == 0) {
      // nothing to compare against yet, only seed the queue
      addToQueue(token, prevTokens);
      continue;
    }
    // The current token is queued only at the end of this iteration, so the candidates built
    // below consist of the tokens before position i.
    final StringBuilder sb = new StringBuilder();
    int j = 0;
    AnalyzedTokenReadings firstMatchToken = null;
    final List<String> stringsToCheck = new ArrayList<>();
    final List<String> origStringsToCheck = new ArrayList<>(); // original upper/lowercase spelling
    // maps each normalized candidate to the token it ends with (first occurrence wins)
    final Map<String, AnalyzedTokenReadings> stringToToken = new HashMap<>();
    for (AnalyzedTokenReadings atr : prevTokens) {
      if (j == 0) {
        // the head of the queue is where every candidate, and thus every match, starts
        firstMatchToken = atr;
      }
      sb.append(' ');
      sb.append(atr.getToken());
      if (j >= 1) {
        // from the second token on, each prefix of the queue is a candidate
        final String stringToCheck = normalize(sb.toString());
        stringsToCheck.add(stringToCheck);
        origStringsToCheck.add(sb.toString().trim());
        if (!stringToToken.containsKey(stringToCheck)) {
          stringToToken.put(stringToCheck, atr);
        }
      }
      j++;
    }
    // iterate backwards over all potentially incorrect strings to make
    // sure we match longer strings first:
    for (int k = stringsToCheck.size() - 1; k >= 0; k--) {
      final String stringToCheck = stringsToCheck.get(k);
      final String origStringToCheck = origStringsToCheck.get(k);
      if (incorrectCompounds.contains(stringToCheck)) {
        final AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
        String msg = null;
        final List<String> replacement = new ArrayList<>();
        // hyphenated spelling, unless excluded for this compound
        if (!noDashSuggestion.contains(stringToCheck)) {
          replacement.add(origStringToCheck.replace(' ', '-'));
          msg = withHyphenMessage;
        }
        // merged spelling, unless the text is all uppercase or only the hyphen form is allowed
        if (isNotAllUppercase(origStringToCheck) && !onlyDashSuggestion.contains(stringToCheck)) {
          replacement.add(mergeCompound(origStringToCheck));
          msg = withoutHyphenMessage;
        }
        final String[] parts = stringToCheck.split(" ");
        if (parts.length > 0 && parts[0].length() == 1) {
          // a single-character first part only ever gets the hyphenated suggestion
          replacement.clear();
          replacement.add(origStringToCheck.replace(' ', '-'));
          msg = withHyphenMessage;
        } else if (replacement.isEmpty() || replacement.size() == 2) {
          // isEmpty shouldn't happen
          msg = withOrWithoutHyphenMessage;
        }
        final RuleMatch ruleMatch =
            new RuleMatch(
                this,
                firstMatchToken.getStartPos(),
                atr.getStartPos() + atr.getToken().length(),
                msg,
                shortDesc);
        // avoid duplicate matches:
        // a match starting where the previous one started is remembered but not reported
        if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
          prevRuleMatch = ruleMatch;
          break;
        }
        prevRuleMatch = ruleMatch;
        ruleMatch.setSuggestedReplacements(replacement);
        ruleMatches.add(ruleMatch);
        // only the longest matching candidate is reported per iteration
        break;
      }
    }
    addToQueue(token, prevTokens);
  }
  return toRuleMatchArray(ruleMatches);
}
/**
 * Orders matches by their start position.
 *
 * @param other the match to compare with, must not be {@code null}
 * @return a negative value, zero, or a positive value if this match starts before, at, or after
 *     the start of {@code other}
 * @throws NullPointerException if {@code other} is {@code null}
 */
@Override
public int compareTo(final RuleMatch other) {
  Objects.requireNonNull(other);
  final int ownStart = getFromPos();
  final int otherStart = other.getFromPos();
  return Integer.compare(ownStart, otherStart);
}
@Override public RuleMatch[] match(AnalyzedSentence sentence) { List<RuleMatch> ruleMatches = new ArrayList<>(); AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace(); RuleMatch prevRuleMatch = null; Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<>(MAX_TERMS); for (int i = 0; i < tokens.length + MAX_TERMS - 1; i++) { AnalyzedTokenReadings token; // we need to extend the token list so we find matches at the end of the original list: if (i >= tokens.length) { token = new AnalyzedTokenReadings( new AnalyzedToken("", "", null), prevTokens.peek().getStartPos()); } else { token = tokens[i]; } if (i == 0) { addToQueue(token, prevTokens); continue; } if (token.isImmunized()) { continue; } AnalyzedTokenReadings firstMatchToken = prevTokens.peek(); List<String> stringsToCheck = new ArrayList<>(); List<String> origStringsToCheck = new ArrayList<>(); // original upper/lowercase spelling Map<String, AnalyzedTokenReadings> stringToToken = getStringToTokenMap(prevTokens, stringsToCheck, origStringsToCheck); // iterate backwards over all potentially incorrect strings to make // sure we match longer strings first: for (int k = stringsToCheck.size() - 1; k >= 0; k--) { String stringToCheck = stringsToCheck.get(k); String origStringToCheck = origStringsToCheck.get(k); if (getCompoundRuleData().getIncorrectCompounds().contains(stringToCheck)) { AnalyzedTokenReadings atr = stringToToken.get(stringToCheck); String msg = null; List<String> replacement = new ArrayList<>(); if (!getCompoundRuleData().getNoDashSuggestion().contains(stringToCheck)) { replacement.add(origStringToCheck.replace(' ', '-')); msg = withHyphenMessage; } if (isNotAllUppercase(origStringToCheck) && !getCompoundRuleData().getOnlyDashSuggestion().contains(stringToCheck)) { replacement.add(mergeCompound(origStringToCheck)); msg = withoutHyphenMessage; } String[] parts = stringToCheck.split(" "); if (parts.length > 0 && parts[0].length() == 1) { replacement.clear(); 
replacement.add(origStringToCheck.replace(' ', '-')); msg = withHyphenMessage; } else if (replacement.isEmpty() || replacement.size() == 2) { // isEmpty shouldn't happen msg = withOrWithoutHyphenMessage; } RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(), atr.getEndPos(), msg, shortDesc); ruleMatch.setSuggestedReplacements(replacement); // avoid duplicate matches: if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) { prevRuleMatch = ruleMatch; break; } prevRuleMatch = ruleMatch; ruleMatches.add(ruleMatch); break; } } addToQueue(token, prevTokens); } return toRuleMatchArray(ruleMatches); }