Пример #1
0
 /**
  * Checks whether any lemma of the token equals the given primitive mask, ignoring case.
  *
  * @param token the token whose lemmas (as returned by {@code getPrimitive()}) are inspected
  * @param primitiveMask the lemma text to look for
  * @return <code>true</code> if at least one lemma equals the mask ignoring case, <code>false
  *     </code> if none does or the token has no lemma array
  */
 private boolean matchLemma(Token token, String primitiveMask) {
   final String[] candidates = token.getPrimitive();
   // A token without a lemma array can never match.
   if (candidates == null) {
     return false;
   }
   for (int i = 0; i < candidates.length; i++) {
     // The first hit settles the answer; remaining lemmas are not examined.
     if (candidates[i].equalsIgnoreCase(primitiveMask)) {
       return true;
     }
   }
   return false;
 }
Пример #2
0
  /**
   * Applies all active rules described in Rules.xml given a sentence properly tokenized, tagged,
   * chunked and shallow parsed.
   *
   * <p>Up to three passes are run, each enabled by its own {@code RulesProperties} flag: general
   * rules over every token of the sentence, phrase-local rules over the tokens of every chunk, and
   * subject-verb rules over the syntactic chunks. For the general pass the sentence's token list is
   * temporarily replaced by a copy padded with one empty token at each end; the padding is removed
   * again before the other passes run, and also when the general pass throws, so the caller's
   * sentence is never left holding the auxiliary tokens.
   *
   * @param sentence a tokenized, tagged, chunked and shallow parsed sentence. Its token list is
   *     replaced during the call (see above).
   * @return a list containing all the mistakes found in the sentence. Each mistake can be localized
   *     between the character indexes given in the span field of the mistake.
   */
  public List<Mistake> check(Sentence sentence) {
    long start = 0;
    if (LOGGER.isDebugEnabled()) {
      start = System.nanoTime();
    }

    // Insert two empty tokens at the sentence start and end
    List<Token> tokens = new ArrayList<Token>();
    Token empty1 = new TokenCogroo("", new Span(0, 0));
    empty1.setMorphologicalTag(new MorphologicalTag());
    tokens.add(empty1);
    tokens.addAll(sentence.getTokens());
    Token empty2 = new TokenCogroo("", new Span(0, 0));
    empty2.setMorphologicalTag(new MorphologicalTag());
    tokens.add(empty2);
    sentence.setTokens(tokens);

    // mistakes will hold mistakes found in the sentence.
    List<Mistake> mistakes = new ArrayList<Mistake>();

    // rules will hold the tree being used to seek for mistakes.
    RulesTree rulesTree;

    try {
      if (RulesProperties.APPLY_LOCAL) {
        // Seeks for errors that can occur anywhere in the sentence (general).
        rulesTree = this.rulesTreesProvider.getTrees().getGeneral();
        // For each token in the sentence.
        for (int i = 0; i < sentence.getTokens().size(); i++) {
          // For each token, gets back to the initial state (hence 0).
          List<State> nextStates = rulesTree.getRoot().getNextStates();
          // i is the index of the token that began the rule applying process.
          mistakes = this.getMistakes(mistakes, nextStates, sentence, i, i, sentence);
        }
      }
    } finally {
      // Remove the aux tokens even if a rule failed, so the caller's sentence is not left padded.
      sentence.setTokens(sentence.getTokens().subList(1, sentence.getTokens().size() - 1));
    }

    if (RulesProperties.APPLY_PHRASE_LOCAL) {
      // Seeks for errors inside a chunk (phrase local).
      rulesTree = this.rulesTreesProvider.getTrees().getPhraseLocal();
      // For each chunk in the sentence.
      List<Chunk> chunks = sentence.getChunks();
      for (int i = 0; i < chunks.size(); i++) {
        for (int j = 0; j < chunks.get(i).getTokens().size(); j++) {
          // For each token, gets back to the initial state (hence 0).
          List<State> nextStates = rulesTree.getRoot().getNextStates();
          // j is the index of the token that began the rule applying process.
          mistakes = this.getMistakes(mistakes, nextStates, chunks.get(i), j, j, sentence);
        }
      }
    }

    if (RulesProperties.APPLY_SUBJECT_VERB) {
      // Seeks for errors between a subject and a main verb.
      rulesTree = this.rulesTreesProvider.getTrees().getSubjectVerb();
      // For each syntactic chunk in the sentence.
      List<SyntacticChunk> syntacticChunks = sentence.getSyntacticChunks();
      for (int i = 0; i < syntacticChunks.size(); i++) {
        List<State> nextStates = rulesTree.getRoot().getNextStates();
        mistakes = this.getMistakes(mistakes, nextStates, syntacticChunks, i, i, sentence);
      }
    }

    if (LOGGER.isDebugEnabled()) {
      // nanoTime difference divided by 1000 yields microseconds.
      LOGGER.debug("Rules applied in " + (System.nanoTime() - start) / 1000 + "us");
    }
    filterIgnoredRules(mistakes);

    return mistakes;
  }
Пример #3
0
 /**
  * Determines if a token is matched by a rule element.
  *
  * <p>The element's masks are evaluated in order. For a non-negated element the result starts as
  * <code>false</code> and a lexeme, lemma or tag hit turns it <code>true</code>; for a negated
  * element the result starts as <code>true</code> and any such hit turns it <code>false</code>. In
  * both modes an out-of-bounds mask evaluated at the first or last token index sets the result to
  * <code>false</code>.
  *
  * @param token the token to be matched by the element
  * @param element the element to be matched against the token
  * @param baseTokenIndex token index used to resolve tag references and compared against the first
  *     and last token positions of the sentence for out-of-bounds masks
  * @param sentence the sentence used to resolve tag references and to find the last token index
  * @return <code>true</code> if there's a match, <code>false</code> otherwise
  */
 private boolean match(Token token, Element element, int baseTokenIndex, Sentence sentence) {
   // "negated" is optional on the element; an absent value counts as false.
   final Boolean negatedAttribute = element.isNegated();
   final boolean negated = negatedAttribute != null && negatedAttribute.booleanValue();

   // Positive elements start unmatched, negated elements start matched.
   boolean result = negated;

   for (Mask mask : element.getMask()) {
     // A lexeme or lemma hit is decisive in either mode: it forces the result to true for a
     // positive element and to false for a negated one.
     if (mask.getLexemeMask() != null
         && mask.getLexemeMask().equalsIgnoreCase(token.getLexeme())) {
       result = !negated;
       continue;
     }
     if (mask.getPrimitiveMask() != null && matchLemma(token, mask.getPrimitiveMask())) {
       result = !negated;
       continue;
     }

     if (negated) {
       // The token must NOT match the mask: a single tag hit clears the result.
       if (mask.getTagMask() != null
           && token != null
           && token.getMorphologicalTag() != null) {
         result &= !token.getMorphologicalTag().matchExact(mask.getTagMask(), false);
       } else if (mask.getTagReference() != null
           && token != null
           && token.getMorphologicalTag() != null) {
         result &=
             !token
                 .getMorphologicalTag()
                 .match(
                     RuleUtils.createTagMaskFromReference(
                         mask.getTagReference(), sentence, baseTokenIndex),
                     false);
       } else if (mask.getOutOfBounds() != null
           && (baseTokenIndex == 0 || baseTokenIndex == sentence.getTokens().size() - 1)) {
         result = false;
       }
     } else {
       // The token must match the mask: a single tag hit is enough to set the result.
       if (mask.getTagMask() != null && token.getMorphologicalTag() != null) {
         result |= token.getMorphologicalTag().matchExact(mask.getTagMask(), false);
       } else if (mask.getTagReference() != null && token.getMorphologicalTag() != null) {
         result |=
             token
                 .getMorphologicalTag()
                 .match(
                     RuleUtils.createTagMaskFromReference(
                         mask.getTagReference(), sentence, baseTokenIndex),
                     false);
       } else if (mask.getOutOfBounds() != null
           && (baseTokenIndex == 0 || baseTokenIndex == sentence.getTokens().size() - 1)) {
         result = false;
       }
     }
   }
   return result;
 }