private static String getSentence(final AnalyzedSentence sent) {
   final StringBuilder sb = new StringBuilder();
   sb.append("<S>");
   for (final AnalyzedTokenReadings atr : sent.getTokensWithoutWhitespace()) {
     sb.append(getPOS(atr));
     sb.append(' ');
   }
   sb.append("</S>");
   return sb.toString();
 }
  public void testCompareLists() throws IOException {
    AnalyzedSentence sentence1 = langTool.getAnalyzedSentence("Hier ein Test");
    assertTrue(
        rule.compareLists(
            sentence1.getTokensWithoutWhitespace(), 0, 2, new String[] {"", "Hier", "ein"}));
    assertTrue(
        rule.compareLists(
            sentence1.getTokensWithoutWhitespace(), 1, 2, new String[] {"Hier", "ein"}));
    assertTrue(
        rule.compareLists(
            sentence1.getTokensWithoutWhitespace(),
            0,
            3,
            new String[] {"", "Hier", "ein", "Test"}));
    assertFalse(
        rule.compareLists(
            sentence1.getTokensWithoutWhitespace(),
            0,
            4,
            new String[] {"", "Hier", "ein", "Test"}));

    AnalyzedSentence sentence2 = langTool.getAnalyzedSentence("das Heilige Römische Reich");
    assertTrue(
        rule.compareLists(
            sentence2.getTokensWithoutWhitespace(),
            0,
            4,
            new String[] {"", "das", "Heilige", "Römische", "Reich"}));
    assertFalse(
        rule.compareLists(
            sentence2.getTokensWithoutWhitespace(),
            8,
            11,
            new String[] {"", "das", "Heilige", "Römische", "Reich"}));
  }
  @Override
  public RuleMatch[] match(final AnalyzedSentence text) {
    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();

    if (tokens.length > 3) {
      final AnalyzedTokenReadings analyzedToken = tokens[1];
      final String token = analyzedToken.getToken();
      // avoid "..." etc. to be matched:
      boolean isWord = true;
      if (token.length() == 1) {
        final char c = token.charAt(0);
        if (!Character.isLetter(c)) {
          isWord = false;
        }
      }

      if (isWord
          && lastToken.equals(token)
          && !isException(token)
          && !isException(tokens[2].getToken())
          && !isException(tokens[3].getToken())) {
        final String shortMsg;
        if (isAdverb(analyzedToken)) {
          shortMsg = messages.getString("desc_repetition_beginning_adv");
        } else if (beforeLastToken.equals(token)) {
          shortMsg = messages.getString("desc_repetition_beginning_word");
        } else {
          shortMsg = "";
        }

        if (!shortMsg.equals("")) {
          final String msg =
              shortMsg + " " + messages.getString("desc_repetition_beginning_thesaurus");
          final int startPos = analyzedToken.getStartPos();
          final int endPos = startPos + token.length();
          final RuleMatch ruleMatch = new RuleMatch(this, startPos, endPos, msg, shortMsg);
          ruleMatches.add(ruleMatch);
        }
      }
      beforeLastToken = lastToken;
      lastToken = token;
    }

    // TODO should we ignore repetitions involving multiple paragraphs?
    // if (tokens[tokens.length - 1].isParaEnd()) beforeLastToken = "";

    return toRuleMatchArray(ruleMatches);
  }
Example #4
0
 @Override
 public RuleMatch[] match(final AnalyzedSentence sentence) {
   final List<RuleMatch> ruleMatches = new ArrayList<>();
   final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
   String prevToken = null;
   for (int i = 0; i < tokens.length; i++) {
     final String token = tokens[i].getToken();
     if (tokens[i].isWhitespace()) {
       // ignore
       continue;
     }
     if (prevToken != null
         && !prevToken.equals("-")
         && !prevToken.contains("--")
         && !prevToken.contains(
             "–-") // first char is some special kind of dash, found in Wikipedia
         && prevToken.endsWith("-")) {
       final char firstChar = token.charAt(0);
       if (Character.isUpperCase(firstChar)) {
         final String msg =
             "Möglicherweise fehlt ein 'und' oder ein Komma, oder es wurde nach dem Wort "
                 + "ein überflüssiges Leerzeichen eingefügt. Eventuell haben Sie auch versehentlich einen Bindestrich statt eines Punktes eingefügt.";
         final RuleMatch ruleMatch =
             new RuleMatch(
                 this,
                 tokens[i - 1].getStartPos(),
                 tokens[i - 1].getStartPos() + prevToken.length() + 1,
                 msg);
         ruleMatch.setSuggestedReplacement(tokens[i - 1].getToken());
         ruleMatches.add(ruleMatch);
       }
     }
     prevToken = token;
   }
   return toRuleMatchArray(ruleMatches);
 }
  @Override
  public final RuleMatch[] match(final AnalyzedSentence text) {
    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
    if (tokens.length < 2) {
      return toRuleMatchArray(ruleMatches);
    }
    int matchTokenPos = 1; // 0 = SENT_START
    final String firstToken = tokens[matchTokenPos].getToken();
    String secondToken = null;
    String thirdToken = null;
    // ignore quote characters:
    if (tokens.length >= 3
        && ("'".equals(firstToken) || "\"".equals(firstToken) || "„".equals(firstToken))) {
      matchTokenPos = 2;
      secondToken = tokens[matchTokenPos].getToken();
    }
    final String firstDutchToken = dutchSpecialCase(firstToken, secondToken, tokens);
    if (firstDutchToken != null) {
      thirdToken = firstDutchToken;
      matchTokenPos = 3;
    }

    String checkToken = firstToken;
    if (thirdToken != null) {
      checkToken = thirdToken;
    } else if (secondToken != null) {
      checkToken = secondToken;
    }

    String lastToken = tokens[tokens.length - 1].getToken();
    if (lastToken.matches("[ \"'„»«“]") && tokens.length >= 2) {
      // ignore trailing whitespace or quote
      lastToken = tokens[tokens.length - 2].getToken();
    }

    boolean preventError = false;
    // TODO: why do only *these* languages have that special case?
    final String langCode = language.getShortName();
    final boolean languageHasSpecialCases =
        langCode.equals("ru")
            || langCode.equals("pl")
            || langCode.equals("uk")
            || langCode.equals("be")
            || langCode.equals(Locale.ENGLISH.getLanguage())
            || langCode.equals(Locale.ITALIAN.getLanguage())
            || langCode.equals(Locale.GERMAN.getLanguage());
    if (languageHasSpecialCases) {
      // fix for lists; note - this will not always work for the last point in OOo,
      // as OOo might serve paragraphs in any order.
      if (";".equals(lastParagraphString)
          || ";".equals(lastToken)
          || ",".equals(lastParagraphString)
          || ",".equals(lastToken)) {
        preventError = true;
      }
      // fix for words in table (not sentences); note - this will not always work for the last point
      // in OOo,
      // as OOo might serve paragraphs in any order.
      if (!lastToken.matches("[.?!…]")) {
        preventError = true;
      }
    }

    lastParagraphString = lastToken;

    if (checkToken.length() > 0) {
      final char firstChar = checkToken.charAt(0);
      if (!preventError && Character.isLowerCase(firstChar)) {
        final RuleMatch ruleMatch =
            new RuleMatch(
                this,
                tokens[matchTokenPos].getStartPos(),
                tokens[matchTokenPos].getStartPos() + tokens[matchTokenPos].getToken().length(),
                messages.getString("incorrect_case"));
        ruleMatch.setSuggestedReplacement(StringTools.uppercaseFirstChar(checkToken));
        ruleMatches.add(ruleMatch);
      }
    }
    return toRuleMatchArray(ruleMatches);
  }
  @Override
  public final RuleMatch[] match(final AnalyzedSentence sentence) {
    final List<RuleMatch> ruleMatches = new ArrayList<>();
    final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    if (tokens.length < 2) {
      return toRuleMatchArray(ruleMatches);
    }
    int matchTokenPos = 1; // 0 = SENT_START
    final String firstToken = tokens[matchTokenPos].getToken();
    String secondToken = null;
    String thirdToken = null;
    // ignore quote characters:
    if (tokens.length >= 3
        && ("'".equals(firstToken) || "\"".equals(firstToken) || "„".equals(firstToken))) {
      matchTokenPos = 2;
      secondToken = tokens[matchTokenPos].getToken();
    }
    final String firstDutchToken = dutchSpecialCase(firstToken, secondToken, tokens);
    if (firstDutchToken != null) {
      thirdToken = firstDutchToken;
      matchTokenPos = 3;
    }

    String checkToken = firstToken;
    if (thirdToken != null) {
      checkToken = thirdToken;
    } else if (secondToken != null) {
      checkToken = secondToken;
    }

    String lastToken = tokens[tokens.length - 1].getToken();
    if (tokens.length >= 2 && WHITESPACE_OR_QUOTE.matcher(lastToken).matches()) {
      // ignore trailing whitespace or quote
      lastToken = tokens[tokens.length - 2].getToken();
    }

    boolean preventError = false;
    if (lastParagraphString.equals(",") || lastParagraphString.equals(";")) {
      preventError = true;
    }
    if (!SENTENCE_END1.matcher(lastParagraphString).matches()
        && !SENTENCE_END2.matcher(lastToken).matches()) {
      preventError = true;
    }

    lastParagraphString = lastToken;

    // allows enumeration with lowercase letters: a), iv., etc.
    if (matchTokenPos + 1 < tokens.length
        && NUMERALS_EN.matcher(tokens[matchTokenPos].getToken()).matches()
        && (tokens[matchTokenPos + 1].getToken().equals(".")
            || tokens[matchTokenPos + 1].getToken().equals(")"))) {
      preventError = true;
    }

    if (isUrl(checkToken)) {
      preventError = true;
    }

    if (checkToken.length() > 0) {
      final char firstChar = checkToken.charAt(0);
      if (!preventError && Character.isLowerCase(firstChar)) {
        final RuleMatch ruleMatch =
            new RuleMatch(
                this,
                tokens[matchTokenPos].getStartPos(),
                tokens[matchTokenPos].getStartPos() + tokens[matchTokenPos].getToken().length(),
                messages.getString("incorrect_case"));
        ruleMatch.setSuggestedReplacement(StringTools.uppercaseFirstChar(checkToken));
        ruleMatches.add(ruleMatch);
      }
    }
    return toRuleMatchArray(ruleMatches);
  }
  @Override
  public RuleMatch[] match(final AnalyzedSentence sentence) {
    final List<RuleMatch> ruleMatches = new ArrayList<>();
    final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();

    RuleMatch prevRuleMatch = null;
    final Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<>(MAX_TERMS);
    for (int i = 0; i < tokens.length + MAX_TERMS - 1; i++) {
      final AnalyzedTokenReadings token;
      // we need to extend the token list so we find matches at the end of the original list:
      if (i >= tokens.length) {
        token =
            new AnalyzedTokenReadings(
                new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
      } else {
        token = tokens[i];
      }
      if (i == 0) {
        addToQueue(token, prevTokens);
        continue;
      }

      final StringBuilder sb = new StringBuilder();
      int j = 0;
      AnalyzedTokenReadings firstMatchToken = null;
      final List<String> stringsToCheck = new ArrayList<>();
      final List<String> origStringsToCheck =
          new ArrayList<>(); // original upper/lowercase spelling
      final Map<String, AnalyzedTokenReadings> stringToToken = new HashMap<>();
      for (AnalyzedTokenReadings atr : prevTokens) {
        if (j == 0) {
          firstMatchToken = atr;
        }
        sb.append(' ');
        sb.append(atr.getToken());
        if (j >= 1) {
          final String stringToCheck = normalize(sb.toString());
          stringsToCheck.add(stringToCheck);
          origStringsToCheck.add(sb.toString().trim());
          if (!stringToToken.containsKey(stringToCheck)) {
            stringToToken.put(stringToCheck, atr);
          }
        }
        j++;
      }
      // iterate backwards over all potentially incorrect strings to make
      // sure we match longer strings first:
      for (int k = stringsToCheck.size() - 1; k >= 0; k--) {
        final String stringToCheck = stringsToCheck.get(k);
        final String origStringToCheck = origStringsToCheck.get(k);
        if (incorrectCompounds.contains(stringToCheck)) {
          final AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
          String msg = null;
          final List<String> replacement = new ArrayList<>();
          if (!noDashSuggestion.contains(stringToCheck)) {
            replacement.add(origStringToCheck.replace(' ', '-'));
            msg = withHyphenMessage;
          }
          if (isNotAllUppercase(origStringToCheck) && !onlyDashSuggestion.contains(stringToCheck)) {
            replacement.add(mergeCompound(origStringToCheck));
            msg = withoutHyphenMessage;
          }
          final String[] parts = stringToCheck.split(" ");
          if (parts.length > 0 && parts[0].length() == 1) {
            replacement.clear();
            replacement.add(origStringToCheck.replace(' ', '-'));
            msg = withHyphenMessage;
          } else if (replacement.isEmpty() || replacement.size() == 2) { // isEmpty shouldn't happen
            msg = withOrWithoutHyphenMessage;
          }
          final RuleMatch ruleMatch =
              new RuleMatch(
                  this,
                  firstMatchToken.getStartPos(),
                  atr.getStartPos() + atr.getToken().length(),
                  msg,
                  shortDesc);
          // avoid duplicate matches:
          if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
            prevRuleMatch = ruleMatch;
            break;
          }
          prevRuleMatch = ruleMatch;
          ruleMatch.setSuggestedReplacements(replacement);
          ruleMatches.add(ruleMatch);
          break;
        }
      }
      addToQueue(token, prevTokens);
    }
    return toRuleMatchArray(ruleMatches);
  }
  @Override
  public RuleMatch[] match(final AnalyzedSentence sentence) {

    final List<RuleMatch> ruleMatches = new ArrayList<>();
    final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();

    if (tokens.length < 4) { // ignore one-word sentences (3 tokens: SENT_START, one word, SENT_END)
      return toRuleMatchArray(ruleMatches);
    }

    // position of the pronouns:
    int posIch = -1;
    int posDu = -1;
    int posEr = -1;
    int posWir = -1;
    // positions of verbs which do match in person and number, and do not match any other person nor
    // number:
    int posVer1Sin = -1;
    int posVer2Sin = -1;
    int posVer1Plu = -1;
    /*int posVer2Plu = -1;*/
    // positions of verbs which do match in person and number:
    int posPossibleVer1Sin = -1;
    int posPossibleVer2Sin = -1;
    int posPossibleVer3Sin = -1;
    int posPossibleVer1Plu = -1;
    /*int posPossibleVer2Plu = -1;*/

    for (int i = 1; i < tokens.length; ++i) { // ignore SENT_START

      String strToken = tokens[i].getToken().toLowerCase();
      strToken = strToken.replace("‚", "");

      if (strToken.equals("ich")) {
        posIch = i;
      } else if (strToken.equals("du")) {
        posDu = i;
      } else if (strToken.equals("er")) {
        posEr = i;
      } else if (strToken.equals("wir")) {
        posWir = i;
      }

      if (tokens[i].hasPartialPosTag("VER")
          && (Character.isLowerCase(tokens[i].getToken().charAt(0)) || i == 1)) {
        if (hasUnambiguouslyPersonAndNumber(tokens[i], "1", "SIN")
            && !(strToken.equals("bin")
                && (BIN_IGNORE.contains(tokens[i - 1].getToken())
                    || (tokens.length != i + 1 && tokens[i + 1].getToken().startsWith("Laden"))))) {
          posVer1Sin = i;
        } else if (hasUnambiguouslyPersonAndNumber(tokens[i], "2", "SIN")) {
          posVer2Sin = i;
        } else if (hasUnambiguouslyPersonAndNumber(tokens[i], "1", "PLU")) {
          posVer1Plu = i;
          //      } else if (hasUnambiguouslyPersonAndNumber(tokens[i], "2", "PLU")) {
          //        posVer2Plu = i;
        }

        if (tokens[i].hasPartialPosTag(":1:SIN")) {
          posPossibleVer1Sin = i;
        }
        if (tokens[i].hasPartialPosTag(":2:SIN")) {
          posPossibleVer2Sin = i;
        }
        if (tokens[i].hasPartialPosTag(":3:SIN")) {
          posPossibleVer3Sin = i;
        }
        if (tokens[i].hasPartialPosTag(":1:PLU")) {
          posPossibleVer1Plu = i;
        }
        //      if (tokens[i].hasPartialPosTag(":2:PLU"))
        //        posPossibleVer2Plu = i;

      }
    } // for each token

    // "ich", "du", and "wir" must be subject (no other interpretation possible)
    // "ich", "du", "er", and "wir" must have a matching verb

    if (posVer1Sin != -1
        && posIch == -1
        && !isQuotationMark(tokens[posVer1Sin - 1])) { // 1st pers sg verb but no "ich"
      ruleMatches.add(ruleMatchWrongVerb(tokens[posVer1Sin]));
    } else if (posIch > 0
        && !isNear(posPossibleVer1Sin, posIch) // check whether verb next to "ich" is 1st pers sg
        && (tokens[posIch].getToken().equals("ich")
            || tokens[posIch].getStartPos() == 0) // ignore "lyrisches Ich" etc.
        && !isQuotationMark(tokens[posIch - 1])) {
      final int plus1 = ((posIch + 1) == tokens.length) ? 0 : +1; // prevent posIch+1 segfault
      if (!verbDoesMatchPersonAndNumber(tokens[posIch - 1], tokens[posIch + plus1], "1", "SIN")) {
        ruleMatches.add(ruleMatchWrongVerbSubject(tokens[posIch], finiteVerb, "1:SIN"));
      }
    }

    if (posVer2Sin != -1 && posDu == -1 && !isQuotationMark(tokens[posVer2Sin - 1])) {
      ruleMatches.add(ruleMatchWrongVerb(tokens[posVer2Sin]));
    } else if (posDu > 0
        && !isNear(posPossibleVer2Sin, posDu)
        && !isQuotationMark(tokens[posDu - 1])) {
      final int plus1 = ((posDu + 1) == tokens.length) ? 0 : +1;
      if (!verbDoesMatchPersonAndNumber(tokens[posDu - 1], tokens[posDu + plus1], "2", "SIN")
          && !tokens[posDu + plus1].hasPartialPosTag("VER:1:SIN:KJ2")
          && // "Wenn ich du wäre"
          !tokens[posDu - 1].hasPartialPosTag("VER:1:SIN:KJ2")) {
        ruleMatches.add(ruleMatchWrongVerbSubject(tokens[posDu], finiteVerb, "2:SIN"));
      }
    }

    if (posEr > 0 && !isNear(posPossibleVer3Sin, posEr) && !isQuotationMark(tokens[posEr - 1])) {
      final int plus1 = ((posEr + 1) == tokens.length) ? 0 : +1;
      if (!verbDoesMatchPersonAndNumber(tokens[posEr - 1], tokens[posEr + plus1], "3", "SIN")) {
        ruleMatches.add(ruleMatchWrongVerbSubject(tokens[posEr], finiteVerb, "3:SIN"));
      }
    }

    if (posVer1Plu != -1 && posWir == -1 && !isQuotationMark(tokens[posVer1Plu - 1])) {
      ruleMatches.add(ruleMatchWrongVerb(tokens[posVer1Plu]));
    } else if (posWir > 0
        && !isNear(posPossibleVer1Plu, posWir)
        && !isQuotationMark(tokens[posWir - 1])) {
      final int plus1 = ((posWir + 1) == tokens.length) ? 0 : +1;
      if (!verbDoesMatchPersonAndNumber(tokens[posWir - 1], tokens[posWir + plus1], "1", "PLU")) {
        ruleMatches.add(ruleMatchWrongVerbSubject(tokens[posWir], finiteVerb, "1:PLU"));
      }
    }

    return toRuleMatchArray(ruleMatches);
  }
  @Override
  public RuleMatch[] match(AnalyzedSentence sentence) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();

    RuleMatch prevRuleMatch = null;
    Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<>(MAX_TERMS);
    for (int i = 0; i < tokens.length + MAX_TERMS - 1; i++) {
      AnalyzedTokenReadings token;
      // we need to extend the token list so we find matches at the end of the original list:
      if (i >= tokens.length) {
        token =
            new AnalyzedTokenReadings(
                new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
      } else {
        token = tokens[i];
      }
      if (i == 0) {
        addToQueue(token, prevTokens);
        continue;
      }
      if (token.isImmunized()) {
        continue;
      }

      AnalyzedTokenReadings firstMatchToken = prevTokens.peek();
      List<String> stringsToCheck = new ArrayList<>();
      List<String> origStringsToCheck = new ArrayList<>(); // original upper/lowercase spelling
      Map<String, AnalyzedTokenReadings> stringToToken =
          getStringToTokenMap(prevTokens, stringsToCheck, origStringsToCheck);
      // iterate backwards over all potentially incorrect strings to make
      // sure we match longer strings first:
      for (int k = stringsToCheck.size() - 1; k >= 0; k--) {
        String stringToCheck = stringsToCheck.get(k);
        String origStringToCheck = origStringsToCheck.get(k);
        if (getCompoundRuleData().getIncorrectCompounds().contains(stringToCheck)) {
          AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
          String msg = null;
          List<String> replacement = new ArrayList<>();
          if (!getCompoundRuleData().getNoDashSuggestion().contains(stringToCheck)) {
            replacement.add(origStringToCheck.replace(' ', '-'));
            msg = withHyphenMessage;
          }
          if (isNotAllUppercase(origStringToCheck)
              && !getCompoundRuleData().getOnlyDashSuggestion().contains(stringToCheck)) {
            replacement.add(mergeCompound(origStringToCheck));
            msg = withoutHyphenMessage;
          }
          String[] parts = stringToCheck.split(" ");
          if (parts.length > 0 && parts[0].length() == 1) {
            replacement.clear();
            replacement.add(origStringToCheck.replace(' ', '-'));
            msg = withHyphenMessage;
          } else if (replacement.isEmpty() || replacement.size() == 2) { // isEmpty shouldn't happen
            msg = withOrWithoutHyphenMessage;
          }
          RuleMatch ruleMatch =
              new RuleMatch(this, firstMatchToken.getStartPos(), atr.getEndPos(), msg, shortDesc);
          ruleMatch.setSuggestedReplacements(replacement);
          // avoid duplicate matches:
          if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
            prevRuleMatch = ruleMatch;
            break;
          }
          prevRuleMatch = ruleMatch;
          ruleMatches.add(ruleMatch);
          break;
        }
      }
      addToQueue(token, prevTokens);
    }
    return toRuleMatchArray(ruleMatches);
  }