/**
 * Tells whether any primitive (lemma) of a token equals the given mask, ignoring case.
 *
 * @param token the token whose primitives are inspected
 * @param primitiveMask the lemma expected by the rule mask
 * @return <code>true</code> if at least one primitive of the token equals the mask ignoring
 *     case; <code>false</code> if none does or if the token has no primitives array
 */
private boolean matchLemma(Token token, String primitiveMask) {
  String[] candidates = token.getPrimitive();
  if (candidates == null) {
    return false;
  }
  for (String candidate : candidates) {
    if (candidate.equalsIgnoreCase(primitiveMask)) {
      // One hit is enough; no need to look at the remaining primitives.
      return true;
    }
  }
  return false;
}
/** * Applies all active rules described in Rules.xml given a sentence properly tokenized, tagged, * chunked and shallow parsed. * * @param sentence a tokenized, tagged, chunked and shallow parsed sentence. * @param dictionary a word and tag dictionary * @return a list containing all the mistakes found in the sentence. Each mistake can be localized * between the character indexes given in the span field of the mistake. */ public List<Mistake> check(Sentence sentence) { long start = 0; if (LOGGER.isDebugEnabled()) { start = System.nanoTime(); } // Insert two empty tokens at the sentence start and end List<Token> tokens = new ArrayList<Token>(); Token empty1 = new TokenCogroo("", new Span(0, 0)); empty1.setMorphologicalTag(new MorphologicalTag()); tokens.add(empty1); tokens.addAll(sentence.getTokens()); Token empty2 = new TokenCogroo("", new Span(0, 0)); empty2.setMorphologicalTag(new MorphologicalTag()); tokens.add(empty2); sentence.setTokens(tokens); // mistakes will hold mistakes found in the sentence. List<Mistake> mistakes = new ArrayList<Mistake>(); // rules will hold the tree being used to seek for mistakes. RulesTree rulesTree; if (RulesProperties.APPLY_LOCAL) { // Seeks for errors that can occur anywhere in the sentence (general). rulesTree = this.rulesTreesProvider.getTrees().getGeneral(); // For each token in the sentence. for (int i = 0; i < sentence.getTokens().size(); i++) { // For each token, gets back to the initial state (hence 0). List<State> nextStates = rulesTree.getRoot().getNextStates(); // i is the index of the token that began the rule applying process. mistakes = this.getMistakes(mistakes, nextStates, sentence, i, i, sentence); } } // remove aux tokens sentence.setTokens(sentence.getTokens().subList(1, sentence.getTokens().size() - 1)); if (RulesProperties.APPLY_PHRASE_LOCAL) { // Seeks for errors inside a chunk (phrase local). rulesTree = this.rulesTreesProvider.getTrees().getPhraseLocal(); // For each chunk in the sentence. 
List<Chunk> chunks = sentence.getChunks(); for (int i = 0; i < chunks.size(); i++) { for (int j = 0; j < chunks.get(i).getTokens().size(); j++) { // For each token, gets back to the initial state (hence 0). List<State> nextStates = rulesTree.getRoot().getNextStates(); // j is the index of the token that began the rule applying process. mistakes = this.getMistakes(mistakes, nextStates, chunks.get(i), j, j, sentence); } } } if (RulesProperties.APPLY_SUBJECT_VERB) { // Seeks for errors between a subject and a main verb. rulesTree = this.rulesTreesProvider.getTrees().getSubjectVerb(); // For each chunk in the sentence. List<SyntacticChunk> syntacticChunks = sentence.getSyntacticChunks(); for (int i = 0; i < syntacticChunks.size(); i++) { List<State> nextStates = rulesTree.getRoot().getNextStates(); mistakes = this.getMistakes(mistakes, nextStates, syntacticChunks, i, i, sentence); } } if (LOGGER.isDebugEnabled()) { LOGGER.debug("Rules applied in " + (System.nanoTime() - start) / 1000 + "us"); } filterIgnoredRules(mistakes); return mistakes; }
/** * Determines if a token is matched by a rule element. * * @param token the token to be matched by the element * @param element the element to be matched against the token * @return <code>true</code> if there's a match, <code>false</code> otherwise */ private boolean match(Token token, Element element, int baseTokenIndex, Sentence sentence) { boolean match; boolean negated; // Sees if the mask must or not match. // Negated is optional, so it can be null, true or false. // If null, consider as false. if (element.isNegated() == null) { match = false; negated = false; } else { match = element.isNegated().booleanValue(); negated = element.isNegated().booleanValue(); } for (Mask mask : element.getMask()) { // If the token must match the mask. if (!negated) { // If not negated, match starts as false and just one match is needed to make it true. if (mask.getLexemeMask() != null && mask.getLexemeMask().equalsIgnoreCase(token.getLexeme())) { match = true; } else if (mask.getPrimitiveMask() != null && matchLemma(token, mask.getPrimitiveMask())) { match = true; } else if (mask.getTagMask() != null && token.getMorphologicalTag() != null) { match = match | token.getMorphologicalTag().matchExact(mask.getTagMask(), false); } else if (mask.getTagReference() != null && token.getMorphologicalTag() != null) { match = match | token .getMorphologicalTag() .match( RuleUtils.createTagMaskFromReference( mask.getTagReference(), sentence, baseTokenIndex), false); } else if (mask.getOutOfBounds() != null && (baseTokenIndex == 0 || baseTokenIndex == sentence.getTokens().size() - 1)) { match = false; } } else { // The token must NOT match the mask. // If negated, match starts as true and just one match is needed to make it false. 
if (mask.getLexemeMask() != null && mask.getLexemeMask().equalsIgnoreCase(token.getLexeme())) { match = false; } else if (mask.getPrimitiveMask() != null && matchLemma(token, mask.getPrimitiveMask())) { match = false; } else if (mask.getTagMask() != null && token != null && token.getMorphologicalTag() != null) { match = match & !token.getMorphologicalTag().matchExact(mask.getTagMask(), false); } else if (mask.getTagReference() != null && token != null && token.getMorphologicalTag() != null) { match = match & !token .getMorphologicalTag() .match( RuleUtils.createTagMaskFromReference( mask.getTagReference(), sentence, baseTokenIndex), false); } else if (mask.getOutOfBounds() != null && (baseTokenIndex == 0 || baseTokenIndex == sentence.getTokens().size() - 1)) { match = false; } } } return match; }