/**
 * Completes the exception that has just been read: makes sure {@code tokenElement} exists,
 * applies the token negation, attaches the collected string/POS exception and the optional
 * space-before setting to it, and finally calls {@code resetException()}.
 */
protected void finalizeExceptions() {
  inException = false;

  // An exception may be read before its token has been finalized, so create the element now.
  if (!exceptionSet) {
    final String tokenText = StringTools.trimWhitespace(elements.toString());
    tokenElement = new Element(tokenText, caseSensitive, regExpression, tokenInflected);
    exceptionSet = true;
  }
  tokenElement.setNegation(tokenNegated);

  final String exceptionText = exceptions.toString();
  final boolean hasStringException = !StringTools.isEmpty(exceptionText);
  if (hasStringException || exceptionPosToken != null) {
    tokenElement.setStringPosException(
        StringTools.trimWhitespace(exceptionText),
        exceptionStringRegExp,
        exceptionStringInflected,
        exceptionStringNegation,
        exceptionValidNext,
        exceptionValidPrev,
        exceptionPosToken,
        exceptionPosRegExp,
        exceptionPosNegation);
    exceptionPosToken = null;
  }

  if (exceptionSpaceBeforeSet) {
    tokenElement.setExceptionSpaceBefore(exceptionSpaceBefore);
  }
  resetException();
}
private boolean urlEndsAt(int i, List<String> l, String urlQuote) { String token = l.get(i); if (StringTools.isWhitespace(token)) { return true; } else if (token.equals(")") || token.equals("]")) { // this is guesswork return true; } else if (l.size() > i + 1) { String nToken = l.get(i + 1); if (StringTools.isWhitespace(nToken) && (token.equals(".") || token.equals(",") || token.equals(";") || token.equals(":") || token.equals("!") || token.equals("?") || token.equals(urlQuote))) { return true; } } else { Matcher matcher = URL_CHARS.matcher(token); if (!matcher.matches()) { return true; } } return false; }
@Override protected List<String> getAdditionalTopSuggestions(List<String> suggestions, String word) { String w = word.replaceFirst("\\.$", ""); if ("unzwar".equals(w)) { return Collections.singletonList("und zwar"); } else if ("desweiteren".equals(w)) { return Collections.singletonList("des Weiteren"); } else if ("wieviel".equals(w)) { return Collections.singletonList("wie viel"); } else if ("wieviele".equals(w)) { return Collections.singletonList("wie viele"); } else if ("wievielen".equals(w)) { return Collections.singletonList("wie vielen"); } else if ("vorteilen".equals(w)) { return Collections.singletonList("Vorteilen"); } else if ("Trons".equals(w)) { return Collections.singletonList("Trance"); } else if ("einzigste".equals(w)) { return Collections.singletonList("einzige"); } else if (word.endsWith("standart")) { return Collections.singletonList(word.replaceFirst("standart$", "standard")); } else if (word.endsWith("standarts")) { return Collections.singletonList(word.replaceFirst("standarts$", "standards")); } else if (word.equals("Rolladen")) { return Collections.singletonList("Rollladen"); } else if (word.equals("Maßname")) { return Collections.singletonList("Maßnahme"); } else if (word.equals("Maßnamen")) { return Collections.singletonList("Maßnahmen"); } else if (word.equals("nanten")) { return Collections.singletonList("nannten"); } else if (!StringTools.startsWithUppercase(word)) { String ucWord = StringTools.uppercaseFirstChar(word); if (!suggestions.contains(ucWord) && !hunspellDict.misspelled(ucWord)) { // Hunspell doesn't always automatically offer the most obvious suggestion for compounds: return Collections.singletonList(ucWord); } } String verbSuggestion = getPastTenseVerbSuggestion(word); if (verbSuggestion != null) { return Collections.singletonList(verbSuggestion); } String participleSuggestion = getParticipleSuggestion(word); if (participleSuggestion != null) { return Collections.singletonList(participleSuggestion); } return 
Collections.emptyList(); }
/** * Creates a RuleMatch object, taking the rule that triggered this match, position of the match * and an explanation message. This message is scanned for * <suggestion>...</suggestion> to get suggested fixes for the problem detected by * this rule. * * @param shortMessage used for example in OpenOffice/LibreOffice's context menu (may be null) * @param startWithUppercase whether the original text at the position of the match starts with an * uppercase character */ public RuleMatch( Rule rule, int fromPos, int toPos, String message, String shortMessage, boolean startWithUppercase, String suggestionsOutMsg) { this.rule = rule; if (toPos <= fromPos) { throw new RuntimeException( "fromPos (" + fromPos + ") must be less than toPos (" + toPos + ")"); } this.offsetPosition = new OffsetPosition(fromPos, toPos); this.message = message; this.shortMessage = shortMessage; // extract suggestion from <suggestion>...</suggestion> in message: final Matcher matcher = SUGGESTION_PATTERN.matcher(message + suggestionsOutMsg); int pos = 0; while (matcher.find(pos)) { pos = matcher.end(); String replacement = matcher.group(1); if (startWithUppercase) { replacement = StringTools.uppercaseFirstChar(replacement); } suggestedReplacements.add(replacement); } }
/** * @return a list of pronouns which match the person and number of @param verb * @param toUppercase true when the suggestions should be capitalized */ private List<String> getPronounSuggestions( final AnalyzedTokenReadings verb, final boolean toUppercase) { List<String> result = new ArrayList<>(); if (verb.hasPartialPosTag(":1:SIN")) { result.add("ich"); } if (verb.hasPartialPosTag(":2:SIN")) { result.add("du"); } if (verb.hasPartialPosTag(":3:SIN")) { result.add("er"); result.add("sie"); result.add("es"); } if (verb.hasPartialPosTag(":1:PLU")) { result.add("wir"); } if (verb.hasPartialPosTag(":2:PLU")) { result.add("ihr"); } if (verb.hasPartialPosTag(":3:PLU") && !result.contains("sie")) { // do not add "sie" twice result.add("sie"); } if (toUppercase) { for (int i = 0; i < result.size(); ++i) { result.set(i, StringTools.uppercaseFirstChar(result.get(i))); } } return result; }
/** * @return a list of forms of @param verb which match @param expectedVerbPOS (person:number) * @param toUppercase true when the suggestions should be capitalized */ private List<String> getVerbSuggestions( final AnalyzedTokenReadings verb, final String expectedVerbPOS, final boolean toUppercase) { // find the first verb reading AnalyzedToken verbToken = new AnalyzedToken("", "", ""); for (AnalyzedToken token : verb.getReadings()) { if (token.getPOSTag().startsWith("VER:")) { verbToken = token; break; } } try { String[] synthesized = german.getSynthesizer().synthesize(verbToken, "VER.*:" + expectedVerbPOS + ".*", true); // remove duplicates Set<String> suggestionSet = new HashSet<>(); suggestionSet.addAll(Arrays.asList(synthesized)); List<String> suggestions = new ArrayList<>(); suggestions.addAll(suggestionSet); if (toUppercase) { for (int i = 0; i < suggestions.size(); ++i) { suggestions.set(i, StringTools.uppercaseFirstChar(suggestions.get(i))); } } Collections.sort(suggestions); return suggestions; } catch (IOException e) { throw new RuntimeException(e); } }
/**
 * Returns spelling suggestions for the given word: the speller's replacements (retried with the
 * lower-cased word if there are none and the word is not already lower case), followed by the
 * speller's run-on word replacements. If the dictionary converts case and the word starts with
 * an uppercase character, every suggestion gets an uppercase first character.
 *
 * @param word the word to find suggestions for
 * @return the suggestions, possibly empty
 * @throws RuntimeException if the speller fails with a CharacterCodingException
 */
public List<String> getSuggestions(String word) {
  final List<String> result = new ArrayList<>();
  try {
    result.addAll(speller.findReplacements(word));
    if (result.isEmpty()) {
      final String lowercased = word.toLowerCase(conversionLocale);
      if (!lowercased.equals(word)) {
        result.addAll(speller.findReplacements(lowercased));
      }
    }
    result.addAll(speller.replaceRunOnWords(word));
  } catch (CharacterCodingException e) {
    throw new RuntimeException(e);
  }

  final boolean capitalize =
      dictionary.metadata.isConvertingCase() && StringTools.startsWithUppercase(word);
  if (capitalize) {
    final int count = result.size();
    for (int idx = 0; idx < count; idx++) {
      final String suggestion = result.get(idx);
      result.set(idx, StringTools.uppercaseFirstChar(suggestion));
    }
  }
  return result;
}
private boolean isNotAllUppercase(final String str) { final String[] parts = str.split(" "); for (String part : parts) { if (isHyphenIgnored() || !"-".equals(part)) { // do not treat '-' as an upper-case word if (StringTools.isAllUppercase(part)) { return false; } } } return true; }
@Override public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException { initializeIfRequired(); final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>(); int pos = 0; final IStemmer dictLookup = new DictionaryLookup(getDictionary()); for (String word : sentenceTokens) { final List<AnalyzedToken> l = new ArrayList<>(); final String lowerWord = word.toLowerCase(conversionLocale); final boolean isLowercase = word.equals(lowerWord); final boolean isMixedCase = StringTools.isMixedCase(word); List<AnalyzedToken> manualTaggerTokens = manualTagsAsAnalyzedTokenList(word, manualTagger.lookup(word)); List<AnalyzedToken> manualLowerTaggerTokens = manualTagsAsAnalyzedTokenList(word, manualTagger.lookup(lowerWord)); // normal case, manual tagger addTokens(manualTaggerTokens, l); // normal case, tagger dictionary if (manualTaggerTokens.isEmpty()) { addTokens(asAnalyzedTokenList(word, dictLookup.lookup(word)), l); } // tag non-lowercase words (alluppercase or startuppercase but not mixedcase) // with lowercase word tags if (!isLowercase && !isMixedCase) { // manual tagger addTokens(manualLowerTaggerTokens, l); // tagger dictionary if (manualLowerTaggerTokens.isEmpty()) { addTokens(asAnalyzedTokenList(word, dictLookup.lookup(lowerWord)), l); } } // additional tagging with prefixes if (l.isEmpty() && !isMixedCase) { addTokens(additionalTags(word), l); } if (l.isEmpty()) { l.add(new AnalyzedToken(word, null, null)); } tokenReadings.add(new AnalyzedTokenReadings(l, pos)); pos += word.length(); } return tokenReadings; }
private List<PatternToken> getTokensForSentenceStart(String[] parts) { List<PatternToken> ucPatternTokens = new ArrayList<>(); int j = 0; for (String part : parts) { if (j == 0) { // at sentence start, we also need to accept a phrase that starts with an uppercase char: String uppercased = StringTools.uppercaseFirstChar(part); ucPatternTokens.add( new PatternTokenBuilder().posRegex(JLanguageTool.SENTENCE_START_TAGNAME).build()); ucPatternTokens.add(new PatternTokenBuilder().csToken(uppercased).build()); } else { ucPatternTokens.add(new PatternTokenBuilder().csToken(part).build()); } j++; } return ucPatternTokens; }
/**
 * Validates the "no" attribute, which must be an integer larger than 0.
 *
 * @param attrs the attributes of the current element
 * @throws SAXException if the attribute is empty, is not an integer, or is smaller than 1; the
 *     message contains the line and column reported by the locator
 */
private void checkNumber(Attributes attrs) throws SAXException {
  final String reference = attrs.getValue("no");
  if (StringTools.isEmpty(reference)) {
    throw new SAXException(
        "References cannot be empty: "
            + "\n Line: "
            + pLocator.getLineNumber()
            + ", column: "
            + pLocator.getColumnNumber()
            + ".");
  }

  final int number;
  try {
    number = Integer.parseInt(reference);
  } catch (NumberFormatException e) {
    // report malformed numbers like every other validation error, including the location,
    // instead of letting an unchecked exception escape from the handler
    throw new SAXException(
        "References must be a number: "
            + reference
            + "\n Line: "
            + pLocator.getLineNumber()
            + ", column: "
            + pLocator.getColumnNumber()
            + ".",
        e);
  }

  if (number < 1) {
    throw new SAXException(
        "References must be larger than 0: "
            + reference
            + "\n Line: "
            + pLocator.getLineNumber()
            + ", column: "
            + pLocator.getColumnNumber()
            + ".");
  }
}
public void testTranslationsAreNotEmpty() throws IOException { for (Language lang : Language.REAL_LANGUAGES) { final File file1 = getTranslationFile(lang); final File file2 = getTranslationFileWithVariant(lang); if (!file1.exists() && !file2.exists()) { System.err.println("Note: no translation available for " + lang); continue; } final File file = file1.exists() ? file1 : file2; final List<String> lines = loadFile(file); for (String line : lines) { line = line.trim(); if (StringTools.isEmpty(line) || line.charAt(0) == '#') { continue; } final String[] parts = line.split("="); if (parts.length < 2) { System.err.println("***** Empty translation: '" + line + "' in file " + file); // fail("Empty translation: '" + line + "' in file " + file); } } } }
/**
 * Accept (case-sensitively, unless at the start of a sentence) the given phrases even though they
 * are not in the built-in dictionary. Use this to avoid false alarms on e.g. names and technical
 * terms. Unlike {@link #addIgnoreTokens(List)} this can deal with phrases. A way to call this is
 * like this: <code>rule.acceptPhrases(Arrays.asList("duodenal atresia"))</code> This way,
 * checking would not create an error for "duodenal atresia", but it would still create an error
 * for "duodenal" or "atresia" if they appear on their own.
 *
 * <p>Calling this method replaces the anti-patterns that were set before.
 *
 * @param phrases the phrases to accept, with their words separated by a single space
 * @since 3.3
 */
public void acceptPhrases(List<String> phrases) {
  final List<List<PatternToken>> collected = new ArrayList<>();
  for (String phrase : phrases) {
    final String[] parts = phrase.split(" ");

    // a phrase whose first word changes when its first character is uppercased also needs a
    // variant for the sentence start
    final boolean startsLowercase =
        parts.length > 0 && !StringTools.uppercaseFirstChar(parts[0]).equals(parts[0]);

    final List<PatternToken> exactTokens = new ArrayList<>();
    for (String part : parts) {
      exactTokens.add(new PatternTokenBuilder().csToken(part).build());
    }
    collected.add(exactTokens);

    if (startsLowercase) {
      collected.add(getTokensForSentenceStart(parts));
    }
  }
  this.antiPatterns = makeAntiPatterns(collected, language);
}
@Override public final RuleMatch[] match(final AnalyzedSentence text) { final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); if (tokens.length < 2) { return toRuleMatchArray(ruleMatches); } int matchTokenPos = 1; // 0 = SENT_START final String firstToken = tokens[matchTokenPos].getToken(); String secondToken = null; String thirdToken = null; // ignore quote characters: if (tokens.length >= 3 && ("'".equals(firstToken) || "\"".equals(firstToken) || "„".equals(firstToken))) { matchTokenPos = 2; secondToken = tokens[matchTokenPos].getToken(); } final String firstDutchToken = dutchSpecialCase(firstToken, secondToken, tokens); if (firstDutchToken != null) { thirdToken = firstDutchToken; matchTokenPos = 3; } String checkToken = firstToken; if (thirdToken != null) { checkToken = thirdToken; } else if (secondToken != null) { checkToken = secondToken; } String lastToken = tokens[tokens.length - 1].getToken(); if (lastToken.matches("[ \"'„»«“]") && tokens.length >= 2) { // ignore trailing whitespace or quote lastToken = tokens[tokens.length - 2].getToken(); } boolean preventError = false; // TODO: why do only *these* languages have that special case? final String langCode = language.getShortName(); final boolean languageHasSpecialCases = langCode.equals("ru") || langCode.equals("pl") || langCode.equals("uk") || langCode.equals("be") || langCode.equals(Locale.ENGLISH.getLanguage()) || langCode.equals(Locale.ITALIAN.getLanguage()) || langCode.equals(Locale.GERMAN.getLanguage()); if (languageHasSpecialCases) { // fix for lists; note - this will not always work for the last point in OOo, // as OOo might serve paragraphs in any order. 
if (";".equals(lastParagraphString) || ";".equals(lastToken) || ",".equals(lastParagraphString) || ",".equals(lastToken)) { preventError = true; } // fix for words in table (not sentences); note - this will not always work for the last point // in OOo, // as OOo might serve paragraphs in any order. if (!lastToken.matches("[.?!…]")) { preventError = true; } } lastParagraphString = lastToken; if (checkToken.length() > 0) { final char firstChar = checkToken.charAt(0); if (!preventError && Character.isLowerCase(firstChar)) { final RuleMatch ruleMatch = new RuleMatch( this, tokens[matchTokenPos].getStartPos(), tokens[matchTokenPos].getStartPos() + tokens[matchTokenPos].getToken().length(), messages.getString("incorrect_case")); ruleMatch.setSuggestedReplacement(StringTools.uppercaseFirstChar(checkToken)); ruleMatches.add(ruleMatch); } } return toRuleMatchArray(ruleMatches); }
/**
 * Reports a sentence whose first word (after an optional opening quote, or the token returned by
 * {@code dutchSpecialCase}) starts with a lowercase character, and suggests the word with an
 * uppercase first character.
 *
 * <p>No error is reported if the previous text ended with "," or ";", if neither the previous
 * text nor this sentence ends like a sentence, if the word looks like an enumeration marker
 * followed by "." or ")", or if the word is a URL. The last token is remembered in
 * {@code lastParagraphString} for the next call.
 *
 * @param sentence the sentence to check
 * @return the matches, an empty array if there is none
 */
@Override
public final RuleMatch[] match(final AnalyzedSentence sentence) {
  final List<RuleMatch> matches = new ArrayList<>();
  final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  if (tokens.length < 2) {
    return toRuleMatchArray(matches);
  }

  int checkPos = 1; // 0 = SENT_START
  final String firstToken = tokens[checkPos].getToken();

  // ignore quote characters:
  String tokenAfterQuote = null;
  final boolean startsWithQuote =
      "'".equals(firstToken) || "\"".equals(firstToken) || "„".equals(firstToken);
  if (tokens.length >= 3 && startsWithQuote) {
    checkPos = 2;
    tokenAfterQuote = tokens[checkPos].getToken();
  }

  final String dutchToken = dutchSpecialCase(firstToken, tokenAfterQuote, tokens);
  if (dutchToken != null) {
    checkPos = 3;
  }

  // precedence: Dutch special case, then the token after a quote, then the first token
  final String checkToken;
  if (dutchToken != null) {
    checkToken = dutchToken;
  } else if (tokenAfterQuote != null) {
    checkToken = tokenAfterQuote;
  } else {
    checkToken = firstToken;
  }

  final int lastIndex = tokens.length - 1;
  String lastToken = tokens[lastIndex].getToken();
  if (tokens.length >= 2 && WHITESPACE_OR_QUOTE.matcher(lastToken).matches()) {
    // ignore trailing whitespace or quote
    lastToken = tokens[lastIndex - 1].getToken();
  }

  final boolean afterListSeparator =
      lastParagraphString.equals(",") || lastParagraphString.equals(";");
  final boolean noSentenceEnd =
      !SENTENCE_END1.matcher(lastParagraphString).matches()
          && !SENTENCE_END2.matcher(lastToken).matches();
  lastParagraphString = lastToken;

  // allows enumeration with lowercase letters: a), iv., etc.
  boolean enumeration = false;
  if (checkPos + 1 < tokens.length
      && NUMERALS_EN.matcher(tokens[checkPos].getToken()).matches()) {
    final String following = tokens[checkPos + 1].getToken();
    enumeration = following.equals(".") || following.equals(")");
  }

  final boolean url = isUrl(checkToken);
  final boolean preventError = afterListSeparator || noSentenceEnd || enumeration || url;

  if (checkToken.length() > 0
      && !preventError
      && Character.isLowerCase(checkToken.charAt(0))) {
    final AnalyzedTokenReadings flagged = tokens[checkPos];
    final int startPos = flagged.getStartPos();
    final RuleMatch ruleMatch =
        new RuleMatch(
            this,
            startPos,
            startPos + flagged.getToken().length(),
            messages.getString("incorrect_case"));
    ruleMatch.setSuggestedReplacement(StringTools.uppercaseFirstChar(checkToken));
    matches.add(ruleMatch);
  }
  return toRuleMatchArray(matches);
}
/**
 * Completes the token that has just been read: creates or updates {@code tokenElement} from the
 * collected text, applies all settings gathered while reading it (skip, occurrences, POS, chunk,
 * reference, unification, marker, whitespace), registers it in {@code elementList} or in the
 * and/or group of the last element, and finally calls {@code resetToken()}.
 */
protected void finalizeTokens() {
  // A new element is needed unless one was already created for an exception of this token;
  // in that case only its string is updated.
  if (!exceptionSet || tokenElement == null) {
    tokenElement =
        new Element(
            StringTools.trimWhitespace(elements.toString()),
            caseSensitive,
            regExpression,
            tokenInflected);
    tokenElement.setNegation(tokenNegated);
  } else {
    tokenElement.setStringElement(StringTools.trimWhitespace(elements.toString()));
  }
  // Settings that differ from their defaults are transferred to the element and then set back
  // to the default value.
  if (skipPos != 0) {
    tokenElement.setSkipNext(skipPos);
    skipPos = 0;
  }
  if (minOccurrence != 1) {
    tokenElement.setMinOccurrence(minOccurrence);
    minOccurrence = 1;
  }
  if (maxOccurrence != 1) {
    tokenElement.setMaxOccurrence(maxOccurrence);
    maxOccurrence = 1;
  }
  if (posToken != null) {
    tokenElement.setPosElement(posToken, posRegExp, posNegation);
    posToken = null;
  }
  if (chunkTag != null) {
    tokenElement.setChunkElement(chunkTag);
    chunkTag = null;
  }
  // NOTE(review): unlike the settings above, tokenReference is not cleared here - presumably
  // resetToken() takes care of that; verify.
  if (tokenReference != null) {
    tokenElement.setMatch(tokenReference);
  }
  // The first element of an and/or group is added to the list itself; every further element of
  // the group is attached to the last element of the list.
  if (inAndGroup && andGroupCounter > 0) {
    elementList.get(elementList.size() - 1).setAndGroupElement(tokenElement);
  } else if (inOrGroup && orGroupCounter > 0) {
    elementList.get(elementList.size() - 1).setOrGroupElement(tokenElement);
  } else {
    elementList.add(tokenElement);
  }
  if (inAndGroup) {
    andGroupCounter++;
  }
  if (inOrGroup) {
    orGroupCounter++;
  }
  if (inUnification) {
    tokenElement.setUnification(equivalenceFeatures);
  }
  tokenElement.setInsideMarker(inMarker);
  // Inside a unification definition the element is registered as an equivalence with the
  // language's unifier configuration, and the element list is emptied.
  if (inUnificationDef) {
    language.getUnifierConfiguration().setEquivalence(uFeature, uType, tokenElement);
    elementList.clear();
  }
  if (tokenSpaceBeforeSet) {
    tokenElement.setWhitespaceBefore(tokenSpaceBefore);
  }
  resetToken();
}