Beispiel #1
0
 /**
  * Finishes the exception that is currently being parsed and attaches it to the token element.
  *
  * <p>If no exception has been set for the current token yet, the token element is created here
  * from the collected token text. The string/POS exception and the "space before" constraint are
  * then applied where present, and {@code resetException()} is called.
  */
 protected void finalizeExceptions() {
   inException = false;
   if (!exceptionSet) {
     // first exception of this token: the element has to be created before it can be configured
     final String tokenText = StringTools.trimWhitespace(elements.toString());
     tokenElement = new Element(tokenText, caseSensitive, regExpression, tokenInflected);
     exceptionSet = true;
   }
   tokenElement.setNegation(tokenNegated);

   final boolean hasStringException = !StringTools.isEmpty(exceptions.toString());
   if (hasStringException || exceptionPosToken != null) {
     final String exceptionText = StringTools.trimWhitespace(exceptions.toString());
     tokenElement.setStringPosException(
         exceptionText,
         exceptionStringRegExp,
         exceptionStringInflected,
         exceptionStringNegation,
         exceptionValidNext,
         exceptionValidPrev,
         exceptionPosToken,
         exceptionPosRegExp,
         exceptionPosNegation);
     // the POS exception has been consumed
     exceptionPosToken = null;
   }

   if (exceptionSpaceBeforeSet) {
     tokenElement.setExceptionSpaceBefore(exceptionSpaceBefore);
   }
   resetException();
 }
 /**
  * Decides whether the token at index {@code i} marks the end of a URL.
  *
  * <p>Whitespace and closing brackets always end the URL. If another token follows, the URL ends
  * only when the current token is one of {@code . , ; : ! ?} or the given quote and the following
  * token is whitespace. Only for the very last token of the list is {@code URL_CHARS} consulted: a
  * token that does not match it ends the URL.
  *
  * @param i index of the token to inspect
  * @param l the token list
  * @param urlQuote quote string that also terminates the URL when followed by whitespace
  * @return true if the URL ends at the given token
  */
 private boolean urlEndsAt(int i, List<String> l, String urlQuote) {
   final String current = l.get(i);
   if (StringTools.isWhitespace(current)) {
     return true;
   }
   if (current.equals(")") || current.equals("]")) { // this is guesswork
     return true;
   }
   if (l.size() > i + 1) {
     final String following = l.get(i + 1);
     // a single character out of ".,;:!?"
     final boolean isPunctuation =
         current.length() == 1 && ".,;:!?".indexOf(current.charAt(0)) >= 0;
     return StringTools.isWhitespace(following) && (isPunctuation || current.equals(urlQuote));
   }
   // last token of the list
   return !URL_CHARS.matcher(current).matches();
 }
 /**
  * Returns a suggestion that should be ranked first for some well-known misspellings.
  *
  * <p>The checks run in this order: fixed corrections looked up with one trailing dot removed,
  * "standart"/"standarts" endings, fixed corrections for the unmodified word, the capitalized word
  * (if it is accepted by the Hunspell dictionary and not yet suggested), a past tense verb
  * suggestion and finally a participle suggestion.
  *
  * @param suggestions the suggestions found so far
  * @param word the misspelled word
  * @return a single-element list with the top suggestion, or an empty list if there is none
  */
 @Override
 protected List<String> getAdditionalTopSuggestions(List<String> suggestions, String word) {
   final String withoutDot = word.replaceFirst("\\.$", "");
   switch (withoutDot) {
     case "unzwar":
       return Collections.singletonList("und zwar");
     case "desweiteren":
       return Collections.singletonList("des Weiteren");
     case "wieviel":
       return Collections.singletonList("wie viel");
     case "wieviele":
       return Collections.singletonList("wie viele");
     case "wievielen":
       return Collections.singletonList("wie vielen");
     case "vorteilen":
       return Collections.singletonList("Vorteilen");
     case "Trons":
       return Collections.singletonList("Trance");
     case "einzigste":
       return Collections.singletonList("einzige");
     default:
       break;
   }

   if (word.endsWith("standart")) {
     return Collections.singletonList(word.replaceFirst("standart$", "standard"));
   }
   if (word.endsWith("standarts")) {
     return Collections.singletonList(word.replaceFirst("standarts$", "standards"));
   }

   switch (word) {
     case "Rolladen":
       return Collections.singletonList("Rollladen");
     case "Maßname":
       return Collections.singletonList("Maßnahme");
     case "Maßnamen":
       return Collections.singletonList("Maßnahmen");
     case "nanten":
       return Collections.singletonList("nannten");
     default:
       break;
   }

   if (!StringTools.startsWithUppercase(word)) {
     final String capitalized = StringTools.uppercaseFirstChar(word);
     if (!suggestions.contains(capitalized) && !hunspellDict.misspelled(capitalized)) {
       // Hunspell doesn't always automatically offer the most obvious suggestion for compounds:
       return Collections.singletonList(capitalized);
     }
   }

   final String pastTense = getPastTenseVerbSuggestion(word);
   if (pastTense != null) {
     return Collections.singletonList(pastTense);
   }
   final String participle = getParticipleSuggestion(word);
   if (participle != null) {
     return Collections.singletonList(participle);
   }
   return Collections.emptyList();
 }
Beispiel #4
0
 /**
  * Creates a RuleMatch object, taking the rule that triggered this match, position of the match
  * and an explanation message. The message, followed by {@code suggestionsOutMsg}, is scanned for
  * &lt;suggestion&gt;...&lt;/suggestion&gt; to get suggested fixes for the problem detected by
  * this rule.
  *
  * @param rule the rule that triggered this match
  * @param fromPos start position of the match
  * @param toPos end position of the match, must be greater than {@code fromPos}
  * @param message explanation of the problem
  * @param shortMessage used for example in OpenOffice/LibreOffice's context menu (may be null)
  * @param startWithUppercase whether the original text at the position of the match starts with an
  *     uppercase character; if so, every extracted suggestion gets an uppercase first character
  * @param suggestionsOutMsg text appended to {@code message} when searching for suggestions; it
  *     is not stored as part of the message
  * @throws RuntimeException if {@code toPos} is not greater than {@code fromPos}
  */
 public RuleMatch(
     Rule rule,
     int fromPos,
     int toPos,
     String message,
     String shortMessage,
     boolean startWithUppercase,
     String suggestionsOutMsg) {
   if (toPos <= fromPos) {
     throw new RuntimeException(
         "fromPos (" + fromPos + ") must be less than toPos (" + toPos + ")");
   }
   this.rule = rule;
   this.offsetPosition = new OffsetPosition(fromPos, toPos);
   this.message = message;
   this.shortMessage = shortMessage;

   // extract suggestion from <suggestion>...</suggestion> in message:
   final Matcher suggestionMatcher = SUGGESTION_PATTERN.matcher(message + suggestionsOutMsg);
   for (int searchFrom = 0; suggestionMatcher.find(searchFrom); searchFrom = suggestionMatcher.end()) {
     final String found = suggestionMatcher.group(1);
     suggestedReplacements.add(
         startWithUppercase ? StringTools.uppercaseFirstChar(found) : found);
   }
 }
 /**
  * Collects the personal pronouns that agree with the given verb in person and number.
  *
  * @param verb the verb whose POS tags are inspected
  * @param toUppercase true when the suggestions should be capitalized
  * @return the matching pronouns in the order ich, du, er, sie, es, wir, ihr; "sie" is listed only
  *     once even if the verb is both third person singular and plural
  */
 private List<String> getPronounSuggestions(
     final AnalyzedTokenReadings verb, final boolean toUppercase) {
   final List<String> pronouns = new ArrayList<>();
   if (verb.hasPartialPosTag(":1:SIN")) {
     pronouns.add("ich");
   }
   if (verb.hasPartialPosTag(":2:SIN")) {
     pronouns.add("du");
   }
   final boolean thirdSingular = verb.hasPartialPosTag(":3:SIN");
   if (thirdSingular) {
     pronouns.add("er");
     pronouns.add("sie");
     pronouns.add("es");
   }
   if (verb.hasPartialPosTag(":1:PLU")) {
     pronouns.add("wir");
   }
   if (verb.hasPartialPosTag(":2:PLU")) {
     pronouns.add("ihr");
   }
   // "sie" is already in the list if the verb is third person singular - do not add it twice
   if (verb.hasPartialPosTag(":3:PLU") && !thirdSingular) {
     pronouns.add("sie");
   }

   if (!toUppercase) {
     return pronouns;
   }
   final List<String> capitalized = new ArrayList<>(pronouns.size());
   for (String pronoun : pronouns) {
     capitalized.add(StringTools.uppercaseFirstChar(pronoun));
   }
   return capitalized;
 }
  /**
   * Synthesizes the forms of the given verb that have the expected person and number.
   *
   * <p>The first reading whose POS tag starts with {@code "VER:"} is used for synthesis; readings
   * without a POS tag are skipped. If there is no such reading, an empty token is passed to the
   * synthesizer.
   *
   * @param verb the readings of the verb
   * @param expectedVerbPOS the expected person:number, inserted into the POS tag regular
   *     expression {@code "VER.*:" + expectedVerbPOS + ".*"}
   * @param toUppercase true when the suggestions should be capitalized
   * @return the distinct synthesized forms, sorted in natural string order
   * @throws RuntimeException wrapping the {@link IOException} if synthesis fails
   */
  private List<String> getVerbSuggestions(
      final AnalyzedTokenReadings verb, final String expectedVerbPOS, final boolean toUppercase) {
    // find the first verb reading
    AnalyzedToken verbToken = new AnalyzedToken("", "", "");
    for (AnalyzedToken token : verb.getReadings()) {
      // untagged readings have a null POS tag and must not cause a NullPointerException
      final String posTag = token.getPOSTag();
      if (posTag != null && posTag.startsWith("VER:")) {
        verbToken = token;
        break;
      }
    }

    try {
      String[] synthesized =
          german.getSynthesizer().synthesize(verbToken, "VER.*:" + expectedVerbPOS + ".*", true);
      // remove duplicates; capitalize first so that forms which only differ in the case of their
      // first character do not show up twice
      Set<String> suggestionSet = new HashSet<>();
      for (String form : synthesized) {
        suggestionSet.add(toUppercase ? StringTools.uppercaseFirstChar(form) : form);
      }
      List<String> suggestions = new ArrayList<>(suggestionSet);
      Collections.sort(suggestions);
      return suggestions;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
 /**
  * Looks up spelling suggestions for the given word.
  *
  * <p>If the speller finds no replacement and the word is not all lowercase, the lowercased word
  * is tried as well. Run-on word replacements are always appended. When the dictionary converts
  * case and the word starts with an uppercase character, all suggestions get an uppercase first
  * character.
  *
  * @param word the word to find suggestions for
  * @return the suggestions, possibly empty
  * @throws RuntimeException wrapping a {@link CharacterCodingException} thrown by the speller
  */
 public List<String> getSuggestions(String word) {
   final List<String> found = new ArrayList<>();
   try {
     found.addAll(speller.findReplacements(word));
     if (found.isEmpty()) {
       final String lowercased = word.toLowerCase(conversionLocale);
       if (!lowercased.equals(word)) {
         found.addAll(speller.findReplacements(lowercased));
       }
     }
     found.addAll(speller.replaceRunOnWords(word));
   } catch (CharacterCodingException e) {
     throw new RuntimeException(e);
   }

   final boolean keepCapitalization =
       dictionary.metadata.isConvertingCase() && StringTools.startsWithUppercase(word);
   if (keepCapitalization) {
     for (int idx = 0; idx < found.size(); idx++) {
       found.set(idx, StringTools.uppercaseFirstChar(found.get(idx)));
     }
   }
   return found;
 }
 /**
  * Checks that none of the space-separated parts of the given string is written in all uppercase.
  *
  * @param str the string to check, split at single spaces
  * @return false as soon as one considered part is all uppercase, true otherwise; a part that is
  *     just "-" is only considered when {@code isHyphenIgnored()} returns true
  */
 private boolean isNotAllUppercase(final String str) {
   for (String word : str.split(" ")) {
     // do not treat '-' as an upper-case word
     final boolean isConsidered = isHyphenIgnored() || !"-".equals(word);
     if (isConsidered && StringTools.isAllUppercase(word)) {
       return false;
     }
   }
   return true;
 }
  /**
   * Tags every token of a sentence, using the manual tagger and the tagger dictionary.
   *
   * <p>For each word the manual tagger is asked first and the dictionary only if the manual tagger
   * has no entry. Words that contain uppercase characters but are not mixed case are additionally
   * looked up in their lowercase form. If still nothing was found and the word is not mixed case,
   * {@code additionalTags} is consulted. Words without any reading get a single reading with
   * neither POS tag nor lemma.
   *
   * @param sentenceTokens the tokens of the sentence
   * @return one {@link AnalyzedTokenReadings} per token, with the start position computed as the
   *     sum of the lengths of all preceding tokens
   * @throws IOException declared by the signature; presumably raised by initialization or
   *     lookup
   */
  @Override
  public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException {
    initializeIfRequired();

    final List<AnalyzedTokenReadings> result = new ArrayList<>();
    final IStemmer dictionaryLookup = new DictionaryLookup(getDictionary());
    int startPos = 0;

    for (String word : sentenceTokens) {
      final String lowerWord = word.toLowerCase(conversionLocale);
      final boolean hasUppercase = !word.equals(lowerWord);
      final boolean isMixedCase = StringTools.isMixedCase(word);
      final List<AnalyzedToken> manualTokens =
          manualTagsAsAnalyzedTokenList(word, manualTagger.lookup(word));
      final List<AnalyzedToken> manualLowerTokens =
          manualTagsAsAnalyzedTokenList(word, manualTagger.lookup(lowerWord));
      final List<AnalyzedToken> readings = new ArrayList<>();

      // the word as written: manual tagger first, dictionary as fallback
      addTokens(manualTokens, readings);
      if (manualTokens.isEmpty()) {
        addTokens(asAnalyzedTokenList(word, dictionaryLookup.lookup(word)), readings);
      }

      // all-uppercase or start-uppercase words (but not mixed case) also get the tags of their
      // lowercase form
      if (hasUppercase && !isMixedCase) {
        addTokens(manualLowerTokens, readings);
        if (manualLowerTokens.isEmpty()) {
          addTokens(asAnalyzedTokenList(word, dictionaryLookup.lookup(lowerWord)), readings);
        }
      }

      // additional tagging with prefixes
      if (readings.isEmpty() && !isMixedCase) {
        addTokens(additionalTags(word), readings);
      }

      // unknown word: keep it with an untagged reading
      if (readings.isEmpty()) {
        readings.add(new AnalyzedToken(word, null, null));
      }

      result.add(new AnalyzedTokenReadings(readings, startPos));
      startPos += word.length();
    }

    return result;
  }
 /**
  * Builds the pattern for a phrase that appears at the start of a sentence.
  *
  * @param parts the words of the phrase
  * @return a sentence start token, followed by the first word with an uppercase first character,
  *     followed by the remaining words unchanged; all words are matched case-sensitively. Empty
  *     if {@code parts} is empty.
  */
 private List<PatternToken> getTokensForSentenceStart(String[] parts) {
   final List<PatternToken> sentenceStartTokens = new ArrayList<>();
   for (int idx = 0; idx < parts.length; idx++) {
     if (idx > 0) {
       sentenceStartTokens.add(new PatternTokenBuilder().csToken(parts[idx]).build());
       continue;
     }
     // at sentence start, we also need to accept a phrase that starts with an uppercase char:
     final String capitalized = StringTools.uppercaseFirstChar(parts[idx]);
     sentenceStartTokens.add(
         new PatternTokenBuilder().posRegex(JLanguageTool.SENTENCE_START_TAGNAME).build());
     sentenceStartTokens.add(new PatternTokenBuilder().csToken(capitalized).build());
   }
   return sentenceStartTokens;
 }
Beispiel #11
0
 /**
  * Validates the {@code no} attribute of a reference: it must be an integer of at least 1.
  *
  * @param attrs the attributes of the element being parsed
  * @throws SAXException if the attribute is empty according to {@code StringTools.isEmpty}, is
  *     not an integer, or is smaller than 1; the message contains the line and column reported by
  *     the locator
  */
 private void checkNumber(Attributes attrs) throws SAXException {
   final String reference = attrs.getValue("no");
   if (StringTools.isEmpty(reference)) {
     throw new SAXException(
         "References cannot be empty: "
             + "\n Line: "
             + pLocator.getLineNumber()
             + ", column: "
             + pLocator.getColumnNumber()
             + ".");
   }

   final int number;
   try {
     number = Integer.parseInt(reference);
   } catch (NumberFormatException e) {
     // report malformed numbers like every other validation problem, i.e. as a SAXException
     // that carries the position, instead of letting an unchecked exception escape the parser
     throw new SAXException(
         "References must be integers: "
             + reference
             + "\n Line: "
             + pLocator.getLineNumber()
             + ", column: "
             + pLocator.getColumnNumber()
             + ".",
         e);
   }

   if (number < 1) {
     throw new SAXException(
         "References must be larger than 0: "
             + reference
             + "\n Line: "
             + pLocator.getLineNumber()
             + ", column: "
             + pLocator.getColumnNumber()
             + ".");
   }
 }
Beispiel #12
0
 /**
  * Scans the translation file of every real language for entries without a value.
  *
  * <p>The file without variant is preferred over the one with variant. Empty lines and lines
  * starting with '#' are skipped. Problems are only printed to {@code System.err}; this test does
  * not fail because of them.
  */
 public void testTranslationsAreNotEmpty() throws IOException {
   for (Language lang : Language.REAL_LANGUAGES) {
     final File plainFile = getTranslationFile(lang);
     final File variantFile = getTranslationFileWithVariant(lang);
     final File file;
     if (plainFile.exists()) {
       file = plainFile;
     } else if (variantFile.exists()) {
       file = variantFile;
     } else {
       System.err.println("Note: no translation available for " + lang);
       continue;
     }

     for (String rawLine : loadFile(file)) {
       final String line = rawLine.trim();
       final boolean isBlankOrComment = StringTools.isEmpty(line) || line.charAt(0) == '#';
       if (isBlankOrComment) {
         continue;
       }
       // "key=" and "key" both yield fewer than two parts
       if (line.split("=").length < 2) {
         // reported only, deliberately not a test failure
         System.err.println("***** Empty translation: '" + line + "' in file " + file);
       }
     }
   }
 }
 /**
  * Accept (case-sensitively, unless at the start of a sentence) the given phrases even though they
  * are not in the built-in dictionary. Use this to avoid false alarms on e.g. names and technical
  * terms. Unlike {@link #addIgnoreTokens(List)} this can deal with phrases. A way to call this is
  * like this: <code>rule.acceptPhrases(Arrays.asList("duodenal atresia"))</code> This way,
  * checking would not create an error for "duodenal atresia", but it would still create an error
  * for "duodenal" or "atresia" if they appear on their own.
  *
  * <p>Each call replaces the antipatterns of this rule with those built from {@code phrases}.
  *
  * @param phrases the phrases to accept, with words separated by single spaces
  * @since 3.3
  */
 public void acceptPhrases(List<String> phrases) {
   final List<List<PatternToken>> phrasePatterns = new ArrayList<>();
   for (String phrase : phrases) {
     final String[] words = phrase.split(" ");

     // the phrase exactly as given
     final List<PatternToken> exactTokens = new ArrayList<>();
     for (String word : words) {
       exactTokens.add(new PatternTokenBuilder().csToken(word).build());
     }
     phrasePatterns.add(exactTokens);

     // a phrase starting with a lowercase character is also accepted capitalized at sentence start
     final boolean startsLowercase =
         words.length > 0 && !StringTools.uppercaseFirstChar(words[0]).equals(words[0]);
     if (startsLowercase) {
       phrasePatterns.add(getTokensForSentenceStart(words));
     }
   }
   this.antiPatterns = makeAntiPatterns(phrasePatterns, language);
 }
  /**
   * Reports a sentence whose first word starts with a lowercase character.
   *
   * <p>A leading quote character is skipped, and {@code dutchSpecialCase} may move the checked
   * position to the third token. For a fixed set of languages no error is reported when the
   * previous or the current sentence ends with ';' or ',', or when the current sentence does not
   * end with one of {@code . ? ! …} (lists and table cells). The last token of this sentence is
   * remembered in {@code lastParagraphString} for the next call.
   *
   * @param text the sentence to check
   * @return at most one match, with the capitalized word as suggested replacement
   */
  @Override
  public final RuleMatch[] match(final AnalyzedSentence text) {
    final List<RuleMatch> matches = new ArrayList<RuleMatch>();
    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
    if (tokens.length < 2) {
      return toRuleMatchArray(matches);
    }

    int matchTokenPos = 1; // 0 = SENT_START
    final String firstToken = tokens[matchTokenPos].getToken();

    // ignore quote characters:
    String tokenAfterQuote = null;
    final boolean startsWithQuote =
        "'".equals(firstToken) || "\"".equals(firstToken) || "„".equals(firstToken);
    if (tokens.length >= 3 && startsWithQuote) {
      matchTokenPos = 2;
      tokenAfterQuote = tokens[matchTokenPos].getToken();
    }

    final String dutchToken = dutchSpecialCase(firstToken, tokenAfterQuote, tokens);
    if (dutchToken != null) {
      matchTokenPos = 3;
    }

    // precedence: Dutch special case, then the token after a quote, then the first token
    final String checkToken;
    if (dutchToken != null) {
      checkToken = dutchToken;
    } else if (tokenAfterQuote != null) {
      checkToken = tokenAfterQuote;
    } else {
      checkToken = firstToken;
    }

    // ignore trailing whitespace or quote; a second to last token always exists because of the
    // length check above
    final int lastIndex = tokens.length - 1;
    String lastToken = tokens[lastIndex].getToken();
    if (lastToken.matches("[ \"'„»«“]")) {
      lastToken = tokens[lastIndex - 1].getToken();
    }

    // TODO: why do only *these* languages have that special case?
    final String langCode = language.getShortName();
    final boolean languageHasSpecialCases =
        langCode.equals("ru")
            || langCode.equals("pl")
            || langCode.equals("uk")
            || langCode.equals("be")
            || langCode.equals(Locale.ENGLISH.getLanguage())
            || langCode.equals(Locale.ITALIAN.getLanguage())
            || langCode.equals(Locale.GERMAN.getLanguage());

    boolean preventError = false;
    if (languageHasSpecialCases) {
      // Neither fix will always work for the last point in OOo, as OOo might serve paragraphs in
      // any order.
      // fix for lists:
      final boolean isListItem =
          ";".equals(lastParagraphString)
              || ";".equals(lastToken)
              || ",".equals(lastParagraphString)
              || ",".equals(lastToken);
      // fix for words in table (not sentences):
      final boolean hasNoSentenceEnd = !lastToken.matches("[.?!…]");
      preventError = isListItem || hasNoSentenceEnd;
    }

    lastParagraphString = lastToken;

    if (checkToken.length() > 0
        && !preventError
        && Character.isLowerCase(checkToken.charAt(0))) {
      final AnalyzedTokenReadings matchToken = tokens[matchTokenPos];
      final int fromPos = matchToken.getStartPos();
      final RuleMatch ruleMatch =
          new RuleMatch(
              this,
              fromPos,
              fromPos + matchToken.getToken().length(),
              messages.getString("incorrect_case"));
      ruleMatch.setSuggestedReplacement(StringTools.uppercaseFirstChar(checkToken));
      matches.add(ruleMatch);
    }
    return toRuleMatchArray(matches);
  }
  /**
   * Reports a sentence whose first word starts with a lowercase character.
   *
   * <p>A leading quote character is skipped, and {@code dutchSpecialCase} may move the checked
   * position to the third token. No error is reported when the previous sentence ended with ','
   * or ';', when neither the previous sentence matches {@code SENTENCE_END1} nor the last token
   * of this one matches {@code SENTENCE_END2}, when the sentence starts with an enumeration item
   * such as "a)" or "iv.", or when the checked token is a URL. The last token of this sentence is
   * remembered in {@code lastParagraphString} for the next call.
   *
   * @param sentence the sentence to check
   * @return at most one match, with the capitalized word as suggested replacement
   */
  @Override
  public final RuleMatch[] match(final AnalyzedSentence sentence) {
    final List<RuleMatch> matches = new ArrayList<>();
    final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    if (tokens.length < 2) {
      return toRuleMatchArray(matches);
    }

    int matchTokenPos = 1; // 0 = SENT_START
    final String firstToken = tokens[matchTokenPos].getToken();

    // ignore quote characters:
    String tokenAfterQuote = null;
    final boolean startsWithQuote =
        "'".equals(firstToken) || "\"".equals(firstToken) || "„".equals(firstToken);
    if (tokens.length >= 3 && startsWithQuote) {
      matchTokenPos = 2;
      tokenAfterQuote = tokens[matchTokenPos].getToken();
    }

    final String dutchToken = dutchSpecialCase(firstToken, tokenAfterQuote, tokens);
    if (dutchToken != null) {
      matchTokenPos = 3;
    }

    // precedence: Dutch special case, then the token after a quote, then the first token
    final String checkToken;
    if (dutchToken != null) {
      checkToken = dutchToken;
    } else if (tokenAfterQuote != null) {
      checkToken = tokenAfterQuote;
    } else {
      checkToken = firstToken;
    }

    // ignore trailing whitespace or quote; a second to last token always exists because of the
    // length check above
    final int lastIndex = tokens.length - 1;
    String lastToken = tokens[lastIndex].getToken();
    if (WHITESPACE_OR_QUOTE.matcher(lastToken).matches()) {
      lastToken = tokens[lastIndex - 1].getToken();
    }

    final boolean followsListSeparator =
        lastParagraphString.equals(",") || lastParagraphString.equals(";");
    final boolean hasNoSentenceEnd =
        !SENTENCE_END1.matcher(lastParagraphString).matches()
            && !SENTENCE_END2.matcher(lastToken).matches();

    lastParagraphString = lastToken;

    // allows enumeration with lowercase letters: a), iv., etc.
    boolean isEnumeration = false;
    if (matchTokenPos + 1 < tokens.length
        && NUMERALS_EN.matcher(tokens[matchTokenPos].getToken()).matches()) {
      final String following = tokens[matchTokenPos + 1].getToken();
      isEnumeration = following.equals(".") || following.equals(")");
    }

    final boolean startsWithUrl = isUrl(checkToken);

    final boolean preventError =
        followsListSeparator || hasNoSentenceEnd || isEnumeration || startsWithUrl;

    if (checkToken.length() > 0
        && !preventError
        && Character.isLowerCase(checkToken.charAt(0))) {
      final AnalyzedTokenReadings matchToken = tokens[matchTokenPos];
      final int fromPos = matchToken.getStartPos();
      final RuleMatch ruleMatch =
          new RuleMatch(
              this,
              fromPos,
              fromPos + matchToken.getToken().length(),
              messages.getString("incorrect_case"));
      ruleMatch.setSuggestedReplacement(StringTools.uppercaseFirstChar(checkToken));
      matches.add(ruleMatch);
    }
    return toRuleMatchArray(matches);
  }
Beispiel #16
0
  /**
   * Completes the token that is currently being parsed: configures {@code tokenElement} from the
   * collected parser state, stores it in {@code elementList} (or in the enclosing and/or group)
   * and finally calls {@code resetToken()}.
   *
   * <p>The statements depend on parser state that is read and reset along the way, so their order
   * matters.
   */
  protected void finalizeTokens() {
    // Create a fresh element unless an exception has already been set on an existing one; in that
    // case the existing element is kept and only its token text is updated.
    if (!exceptionSet || tokenElement == null) {
      tokenElement =
          new Element(
              StringTools.trimWhitespace(elements.toString()),
              caseSensitive,
              regExpression,
              tokenInflected);
      tokenElement.setNegation(tokenNegated);
    } else {
      tokenElement.setStringElement(StringTools.trimWhitespace(elements.toString()));
    }

    // Apply the optional settings; each one is reset right after it has been used
    // (skipPos to 0, the occurrence counts to 1, posToken and chunkTag to null).
    if (skipPos != 0) {
      tokenElement.setSkipNext(skipPos);
      skipPos = 0;
    }
    if (minOccurrence != 1) {
      tokenElement.setMinOccurrence(minOccurrence);
      minOccurrence = 1;
    }
    if (maxOccurrence != 1) {
      tokenElement.setMaxOccurrence(maxOccurrence);
      maxOccurrence = 1;
    }
    if (posToken != null) {
      tokenElement.setPosElement(posToken, posRegExp, posNegation);
      posToken = null;
    }
    if (chunkTag != null) {
      tokenElement.setChunkElement(chunkTag);
      chunkTag = null;
    }

    // NOTE(review): tokenReference is not cleared here - presumably resetToken() does that;
    // verify.
    if (tokenReference != null) {
      tokenElement.setMatch(tokenReference);
    }

    // The first element of an and/or group is added to the list like any other element; every
    // further element of the group is attached to the last element of the list instead.
    if (inAndGroup && andGroupCounter > 0) {
      elementList.get(elementList.size() - 1).setAndGroupElement(tokenElement);
    } else if (inOrGroup && orGroupCounter > 0) {
      elementList.get(elementList.size() - 1).setOrGroupElement(tokenElement);
    } else {
      elementList.add(tokenElement);
    }
    if (inAndGroup) {
      andGroupCounter++;
    }
    if (inOrGroup) {
      orGroupCounter++;
    }

    if (inUnification) {
      tokenElement.setUnification(equivalenceFeatures);
    }

    tokenElement.setInsideMarker(inMarker);

    // Inside a unification definition the element is registered as an equivalence with the
    // language's unifier configuration; the element list is emptied again, which also removes the
    // element that was added above.
    if (inUnificationDef) {
      language.getUnifierConfiguration().setEquivalence(uFeature, uType, tokenElement);
      elementList.clear();
    }
    if (tokenSpaceBeforeSet) {
      tokenElement.setWhitespaceBefore(tokenSpaceBefore);
    }
    resetToken();
  }