Example #1
  public List<Mistake> check(Sentence sentence) {

    List<Mistake> mistakes = new LinkedList<Mistake>();
    int offset = sentence.getSpan().getStart();

    List<Token> tokens = sentence.getTokens();

    // Lowercased lexeme of the previous token; start with the first token of the sentence.
    String token = tokens.get(0).getLexeme().toLowerCase();

    for (int i = 1; i < tokens.size(); i++) {
      String next = tokens.get(i).getLexeme().toLowerCase();

      // Two identical consecutive tokens that are not a known exception: report a mistake.
      if (token.equals(next) && !isException(tokens, i)) {

        // Character span of the repeated pair, relative to the whole text.
        int start = tokens.get(i - 1).getSpan().getStart() + offset;
        int end = tokens.get(i).getSpan().getEnd() + offset;

        mistakes.add(
            createMistake(
                ID,
                createSuggestion(tokens.get(i - 1).getLexeme()),
                start,
                end,
                sentence.getSentence()));
      }

      token = next;
    }

    return mistakes;
  }
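
The loop above compares each lowercased token with its predecessor and reports consecutive duplicates that are not whitelisted exceptions. A minimal standalone sketch of the same idea, using plain strings instead of CoGrOO's Token and Mistake types (the class and method names here are made up for illustration):

import java.util.ArrayList;
import java.util.List;

public class RepeatedTokenSketch {

  /** Returns the indexes i where tokens.get(i) repeats tokens.get(i - 1), ignoring case. */
  static List<Integer> findRepeats(List<String> tokens) {
    List<Integer> repeats = new ArrayList<>();
    if (tokens.isEmpty()) {
      return repeats;
    }
    String previous = tokens.get(0).toLowerCase();
    for (int i = 1; i < tokens.size(); i++) {
      String current = tokens.get(i).toLowerCase();
      if (previous.equals(current)) {
        repeats.add(i); // report the second occurrence of the pair
      }
      previous = current;
    }
    return repeats;
  }

  public static void main(String[] args) {
    // Prints [2]: the second "que" repeats the first one.
    System.out.println(findRepeats(List.of("Acho", "que", "que", "a", "casa", "caiu")));
  }
}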
Example #2
  /**
   * Applies all active rules described in Rules.xml given a sentence properly tokenized, tagged,
   * chunked and shallow parsed.
   *
   * @param sentence a tokenized, tagged, chunked and shallow parsed sentence.
   * @return a list containing all the mistakes found in the sentence. Each mistake can be located
   *     between the character indexes given in the span field of the mistake.
   */
  public List<Mistake> check(Sentence sentence) {
    long start = 0;
    if (LOGGER.isDebugEnabled()) {
      start = System.nanoTime();
    }

    // Insert two empty tokens at the sentence start and end
    List<Token> tokens = new ArrayList<Token>();
    Token empty1 = new TokenCogroo("", new Span(0, 0));
    empty1.setMorphologicalTag(new MorphologicalTag());
    tokens.add(empty1);
    tokens.addAll(sentence.getTokens());
    Token empty2 = new TokenCogroo("", new Span(0, 0));
    empty2.setMorphologicalTag(new MorphologicalTag());
    tokens.add(empty2);
    sentence.setTokens(tokens);

    // mistakes will hold mistakes found in the sentence.
    List<Mistake> mistakes = new ArrayList<Mistake>();

    // rulesTree will hold the tree used to look for mistakes.
    RulesTree rulesTree;

    if (RulesProperties.APPLY_LOCAL) {
      // Looks for errors that can occur anywhere in the sentence (general rules).
      rulesTree = this.rulesTreesProvider.getTrees().getGeneral();
      // For each token in the sentence.
      for (int i = 0; i < sentence.getTokens().size(); i++) {
        // For each token, restart matching from the root of the rules tree.
        List<State> nextStates = rulesTree.getRoot().getNextStates();
        // i is the index of the token where rule matching starts.
        mistakes = this.getMistakes(mistakes, nextStates, sentence, i, i, sentence);
      }
    }

    // Remove the auxiliary sentinel tokens.
    sentence.setTokens(sentence.getTokens().subList(1, sentence.getTokens().size() - 1));

    if (RulesProperties.APPLY_PHRASE_LOCAL) {
      // Looks for errors inside a chunk (phrase-local rules).
      rulesTree = this.rulesTreesProvider.getTrees().getPhraseLocal();
      // For each chunk in the sentence.
      List<Chunk> chunks = sentence.getChunks();
      for (int i = 0; i < chunks.size(); i++) {
        for (int j = 0; j < chunks.get(i).getTokens().size(); j++) {
          // For each token, restart matching from the root of the rules tree.
          List<State> nextStates = rulesTree.getRoot().getNextStates();
          // j is the index of the token where rule matching starts.
          mistakes = this.getMistakes(mistakes, nextStates, chunks.get(i), j, j, sentence);
        }
      }
    }

    if (RulesProperties.APPLY_SUBJECT_VERB) {
      // Looks for errors between a subject and a main verb (subject-verb rules).
      rulesTree = this.rulesTreesProvider.getTrees().getSubjectVerb();
      // For each chunk in the sentence.
      List<SyntacticChunk> syntacticChunks = sentence.getSyntacticChunks();
      for (int i = 0; i < syntacticChunks.size(); i++) {
        List<State> nextStates = rulesTree.getRoot().getNextStates();
        mistakes = this.getMistakes(mistakes, nextStates, syntacticChunks, i, i, sentence);
      }
    }

    if (LOGGER.isDebugEnabled()) {
      LOGGER.debug("Rules applied in " + (System.nanoTime() - start) / 1000 + "us");
    }
    filterIgnoredRules(mistakes);

    return mistakes;
  }
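
Before the general pass, the method wraps the token list in two empty sentinel tokens and strips them again afterwards, presumably so that rules referring to the sentence boundaries have something to anchor to. A minimal sketch of that wrap/unwrap pattern with plain strings (SENTINEL and the helper names are assumptions made for this illustration):

import java.util.ArrayList;
import java.util.List;

public class SentinelPaddingSketch {

  // Hypothetical marker standing in for the empty TokenCogroo instances above.
  static final String SENTINEL = "";

  /** Wraps the tokens with one sentinel on each side, as the general-rule pass does. */
  static List<String> pad(List<String> tokens) {
    List<String> padded = new ArrayList<>();
    padded.add(SENTINEL);
    padded.addAll(tokens);
    padded.add(SENTINEL);
    return padded;
  }

  /** Removes the two sentinels, mirroring sentence.setTokens(...subList(1, size - 1)). */
  static List<String> unpad(List<String> padded) {
    return padded.subList(1, padded.size() - 1);
  }

  public static void main(String[] args) {
    List<String> padded = pad(List.of("Ele", "foi", "embora"));
    System.out.println(padded);        // [, Ele, foi, embora, ]
    System.out.println(unpad(padded)); // [Ele, foi, embora]
  }
}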
Example #3
 /**
  * A recursive method that iterates over the syntactic chunks of the sentence starting from a
  * base chunk. Used to match subject-verb rules.
  *
  * @param mistakes a list of mistakes found in the process of checking the sentence
  * @param currentStates the applier will check if these states match the current token
  * @param syntacticChunks the list of syntactic chunks of the sentence
  * @param baseChunkIndex the index of the chunk in which the process of searching for mistakes
  *     began
  * @param currentChunkIndex the index of the current chunk
  * @param sentence the complete sentence, used to locate the mistake by character offsets within
  *     the sentence
  * @return the mistakes in the parameter <code>mistakes</code> plus the mistakes found in this
  *     invocation, if any
  */
 private List<Mistake> getMistakes(
     List<Mistake> mistakes,
     List<State> currentStates,
     List<SyntacticChunk> syntacticChunks,
     int baseChunkIndex,
     int currentChunkIndex,
     Sentence sentence) {
   for (State state : currentStates) {
     boolean chunkAndElementMatched =
         this.match(
             syntacticChunks.get(currentChunkIndex), state.getElement(), baseChunkIndex, sentence);
     if (chunkAndElementMatched) {
       if (state instanceof AcceptState) {
         // Got a mistake!
         Rule rule = ((AcceptState) state).getRule();
         // The mistake spans the chunks indicated by lower and upper.
         // Compute the affected token indexes, then convert them to character offsets.
         int lower =
             sentence
                 .getSyntacticChunks()
                 .get(baseChunkIndex + rule.getBoundaries().getLower())
                 .getFirstToken();
         int upper =
             sentence.getSyntacticChunks().get(currentChunkIndex).getFirstToken()
                 + rule.getBoundaries().getUpper();
         int lowerCountedByChars = sentence.getTokens().get(lower).getSpan().getStart();
         // Gets the upper index by chars.
         SyntacticChunk chunkUpper = sentence.getSyntacticChunks().get(currentChunkIndex);
         int upperCountedByChars =
             chunkUpper.getTokens().get(chunkUpper.getTokens().size() - 1).getSpan().getEnd();
         // Suggestions.
         String[] suggestions =
             SuggestionBuilder.getSuggestions(
                 sentence,
                 true,
                 baseChunkIndex,
                 lower,
                 upper,
                 rule.getSuggestion(),
                 dictionary,
                 Method.SUBJECT_VERB);
         Mistake mistake =
             new MistakeImpl(
                 ID_PREFIX + rule.getId(),
                 getPriority(rule),
                 rule.getMessage(),
                 rule.getShortMessage(),
                 suggestions,
                 lowerCountedByChars + sentence.getOffset(),
                 upperCountedByChars + sentence.getOffset(),
                 rule.getExample(),
                 sentence.getSentence());
         mistakes.add(mistake);
       } else if (currentChunkIndex + 1 < syntacticChunks.size()) {
         // Keep looking: recurse.
         this.getMistakes(
             mistakes,
             state.getNextStates(),
             syntacticChunks,
             baseChunkIndex,
             currentChunkIndex + 1,
             sentence);
       }
     }
   }
   return mistakes;
 }
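
The recursion above walks a tree of states: every state whose element matches the current chunk either accepts (an AcceptState carries the violated rule) or hands its child states the next chunk. A condensed sketch of that traversal over a list of strings, with a toy State type invented here for illustration:

import java.util.ArrayList;
import java.util.List;

public class RuleTreeSketch {

  /** A toy state: matches one item and either accepts (ruleId != null) or leads to more states. */
  static class State {
    final String expected;            // item this state matches
    final String ruleId;              // non-null only for accepting states
    final List<State> next = new ArrayList<>();

    State(String expected, String ruleId) {
      this.expected = expected;
      this.ruleId = ruleId;
    }
  }

  /** Collects the ids of rules whose state chain matches the items starting at position i. */
  static void collect(List<String> items, int i, List<State> states, List<String> hits) {
    for (State state : states) {
      if (i < items.size() && items.get(i).equals(state.expected)) {
        if (state.ruleId != null) {
          hits.add(state.ruleId);                  // reached an accepting state: report the rule
        } else {
          collect(items, i + 1, state.next, hits); // keep looking at the next item
        }
      }
    }
  }

  public static void main(String[] args) {
    // A single rule "xml:42": the item "a" immediately followed by "b".
    State accept = new State("b", "xml:42");
    State start = new State("a", null);
    start.next.add(accept);

    List<String> items = List.of("a", "b", "c");
    List<String> hits = new ArrayList<>();
    for (int i = 0; i < items.size(); i++) {
      collect(items, i, List.of(start), hits); // restart from the root at every position
    }
    System.out.println(hits); // [xml:42]
  }
}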
Example #4
 /**
  * Determines if a token is matched by a rule element.
  *
  * @param token the token to be matched by the element
  * @param element the element to be matched against the token
  * @param baseTokenIndex the index of the token where the rule matching process began, used for
  *     tag references and for the out-of-bounds check
  * @param sentence the complete sentence, used to resolve tag references and sentence boundaries
  * @return <code>true</code> if there's a match, <code>false</code> otherwise
  */
 private boolean match(Token token, Element element, int baseTokenIndex, Sentence sentence) {
   boolean match;
   boolean negated;
   // Determines whether the element must match (plain) or must not match (negated).
   // The negated flag is optional, so it can be null, true or false.
   // If null, it is treated as false.
   if (element.isNegated() == null) {
     match = false;
     negated = false;
   } else {
     match = element.isNegated().booleanValue();
     negated = element.isNegated().booleanValue();
   }
   for (Mask mask : element.getMask()) {
     // If the token must match the mask.
     if (!negated) {
       // If not negated, match starts as false and just one match is needed to make it true.
       if (mask.getLexemeMask() != null
           && mask.getLexemeMask().equalsIgnoreCase(token.getLexeme())) {
         match = true;
       } else if (mask.getPrimitiveMask() != null && matchLemma(token, mask.getPrimitiveMask())) {
         match = true;
       } else if (mask.getTagMask() != null && token.getMorphologicalTag() != null) {
         match = match | token.getMorphologicalTag().matchExact(mask.getTagMask(), false);
       } else if (mask.getTagReference() != null && token.getMorphologicalTag() != null) {
         match =
             match
                 | token
                     .getMorphologicalTag()
                     .match(
                         RuleUtils.createTagMaskFromReference(
                             mask.getTagReference(), sentence, baseTokenIndex),
                         false);
       } else if (mask.getOutOfBounds() != null
           && (baseTokenIndex == 0 || baseTokenIndex == sentence.getTokens().size() - 1)) {
         match = false;
       }
     } else { // The token must NOT match the mask.
       // If negated, match starts as true and just one match is needed to make it false.
       if (mask.getLexemeMask() != null
           && mask.getLexemeMask().equalsIgnoreCase(token.getLexeme())) {
         match = false;
       } else if (mask.getPrimitiveMask() != null && matchLemma(token, mask.getPrimitiveMask())) {
         match = false;
       } else if (mask.getTagMask() != null
           && token != null
           && token.getMorphologicalTag() != null) {
         match = match & !token.getMorphologicalTag().matchExact(mask.getTagMask(), false);
       } else if (mask.getTagReference() != null
           && token != null
           && token.getMorphologicalTag() != null) {
         match =
             match
                 & !token
                     .getMorphologicalTag()
                     .match(
                         RuleUtils.createTagMaskFromReference(
                             mask.getTagReference(), sentence, baseTokenIndex),
                         false);
       } else if (mask.getOutOfBounds() != null
           && (baseTokenIndex == 0 || baseTokenIndex == sentence.getTokens().size() - 1)) {
         match = false;
       }
     }
   }
   return match;
 }
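
The essential trick above is the initialization: for a plain element, match starts at false and any mask that fits the token flips it to true; for a negated element, match starts at true and any mask that fits flips it to false. A condensed sketch of that polarity logic, with a hypothetical lexeme-only Mask to keep it short:

import java.util.List;

public class NegatedMaskSketch {

  // Hypothetical mask that only checks the token's surface form, case-insensitively.
  record Mask(String lexeme) {
    boolean fits(String token) {
      return lexeme.equalsIgnoreCase(token);
    }
  }

  /** Mirrors the polarity handling of match(): a negated element inverts the start value and the flip. */
  static boolean match(String token, List<Mask> masks, boolean negated) {
    boolean match = negated;   // false for plain elements, true for negated ones
    for (Mask mask : masks) {
      if (mask.fits(token)) {
        match = !negated;      // a single fitting mask decides the outcome
      }
    }
    return match;
  }

  public static void main(String[] args) {
    List<Mask> masks = List.of(new Mask("de"), new Mask("em"));
    System.out.println(match("de", masks, false)); // true: a plain element matched
    System.out.println(match("de", masks, true));  // false: a negated element matched
    System.out.println(match("por", masks, true)); // true: nothing matched, so the negation holds
  }
}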
Example #5
  /**
   * A recursive method that iterates over the tokens of a base token group (sentence or chunk).
   * Used to match general and phrase-local rules.
   *
   * @param mistakes a list of mistakes found in the process of checking the sentence
   * @param currentStates the applier will check if these states match the current token
   * @param tokenGroup can be a sentence or a chunk (classes that implement the interface
   *     TokenGroup)
   * @param baseTokenIndex the index of the token in which the process of searching for mistakes
   *     began
   * @param currentTokenIndex the index of the current token within the token group
   * @param sentence the complete sentence, used to locate the mistake by character offsets within
   *     the sentence
   * @return the mistakes in the parameter <code>mistakes</code> plus the mistakes found in this
   *     invocation, if any
   */
  private List<Mistake> getMistakes(
      List<Mistake> mistakes,
      List<State> currentStates,
      TokenGroup tokenGroup,
      int baseTokenIndex,
      int currentTokenIndex,
      Sentence sentence) {
    Method method = Method.GENERAL;

    int offset = 0;
    if (tokenGroup instanceof Chunk) {
      offset = ((Chunk) tokenGroup).getFirstToken();
      method = Method.PHRASE_LOCAL;
    }

    for (State state : currentStates) {
      boolean tokenAndElementMatched =
          this.match(
              tokenGroup.getTokens().get(currentTokenIndex),
              state.getElement(),
              baseTokenIndex + offset,
              sentence);
      if (tokenAndElementMatched) {
        if (state instanceof AcceptState) {
          // Got a mistake!
          Rule rule = ((AcceptState) state).getRule();
          // The mistake is located between the tokens indicated by lower and upper.
          int lower = baseTokenIndex + rule.getBoundaries().getLower();
          int upper = currentTokenIndex + rule.getBoundaries().getUpper();
          lower += offset;
          upper += offset;
          // Convert the token indexes to character offsets within the sentence.
          int lowerCountedByChars = sentence.getTokens().get(lower).getSpan().getStart();
          int upperCountedByChars = sentence.getTokens().get(upper).getSpan().getEnd();
          // Suggestions.
          String[] suggestions = new String[0];
          try {
            suggestions =
                SuggestionBuilder.getSuggestions(
                    sentence,
                    false,
                    baseTokenIndex,
                    lower,
                    upper,
                    rule.getSuggestion(),
                    dictionary,
                    method);
          } catch (NullPointerException e) {
            LOGGER.error(
                "Failed to apply rule " + rule.getId() + " in: " + sentence.getSentence(), e);
          }

          Mistake mistake =
              new MistakeImpl(
                  ID_PREFIX + rule.getId(),
                  getPriority(rule),
                  rule.getMessage(),
                  rule.getShortMessage(),
                  suggestions,
                  lowerCountedByChars + sentence.getOffset(),
                  upperCountedByChars + sentence.getOffset(),
                  rule.getExample(),
                  sentence.getSentence());
          mistakes.add(mistake);
        } else if (currentTokenIndex + 1 < tokenGroup.getTokens().size()) {
          // Keep looking: recurse.
          this.getMistakes(
              mistakes,
              state.getNextStates(),
              tokenGroup,
              baseTokenIndex,
              currentTokenIndex + 1,
              sentence);
        }
      }
    }
    return mistakes;
  }
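
Because this method serves both the whole sentence and a single chunk, the rule's token indexes are shifted by the chunk's first-token offset before being turned into character positions. A small sketch of that index-to-character-span conversion, with simplified Span and Token records standing in for the CoGrOO classes:

import java.util.List;

public class SpanOffsetSketch {

  // Simplified stand-ins for CoGrOO's Span and Token.
  record Span(int start, int end) {}

  record Token(String lexeme, Span span) {}

  /**
   * Converts chunk-local token indexes to a character span in the sentence: shift the indexes by
   * the chunk's first token, then read the start of the first and the end of the last token span.
   */
  static Span charSpan(List<Token> sentenceTokens, int chunkFirstToken, int lower, int upper) {
    int lowerToken = lower + chunkFirstToken;
    int upperToken = upper + chunkFirstToken;
    return new Span(
        sentenceTokens.get(lowerToken).span().start(),
        sentenceTokens.get(upperToken).span().end());
  }

  public static void main(String[] args) {
    // "Ele comprou livros" tokenized with character spans inside the sentence.
    List<Token> tokens =
        List.of(
            new Token("Ele", new Span(0, 3)),
            new Token("comprou", new Span(4, 11)),
            new Token("livros", new Span(12, 18)));

    // A chunk starting at token 1 whose rule flagged its local tokens 0..1 ("comprou livros").
    System.out.println(charSpan(tokens, 1, 0, 1)); // Span[start=4, end=18]
  }
}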