public void analyze(Document document) {
  List<Sentence> sentences = document.getSentences();
  for (Sentence sentence : sentences) {
    List<Token> tokens = sentence.getTokens();
    for (int i = 0; i < tokens.size(); i++) {
      String tag = tokens.get(i).getPOSTag();
      String word = tokens.get(i).getLexeme();
      // Look up the lemmas for the word/POS-tag pair; if nothing is found,
      // retry with the lowercased form (e.g. sentence-initial capitalization).
      String[] lemmas = dict.getLemmas(word, tag);
      if (lemmas == null || lemmas.length == 0) {
        lemmas = dict.getLemmas(word.toLowerCase(), tag);
      }
      tokens.get(i).setLemmas(lemmas);
    }
  }
}
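The essential pattern here is the lookup with a lowercase fallback. The sketch below isolates that pattern; the LemmaDictionary interface and LemmaLookup class are hypothetical names introduced only for illustration, not part of the code above.

import java.util.Locale;

// Hypothetical dictionary interface, used only to illustrate the fallback lookup.
interface LemmaDictionary {
  String[] getLemmas(String word, String posTag);
}

final class LemmaLookup {
  private final LemmaDictionary dict;

  LemmaLookup(LemmaDictionary dict) {
    this.dict = dict;
  }

  // Try the surface form first; if the dictionary has no entry, retry with the
  // lowercased form so a capitalized sentence-initial word still finds its lemmas.
  String[] lookup(String word, String posTag) {
    String[] lemmas = dict.getLemmas(word, posTag);
    if (lemmas == null || lemmas.length == 0) {
      lemmas = dict.getLemmas(word.toLowerCase(Locale.ROOT), posTag);
    }
    return lemmas;
  }
}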
public void analyze(Document document) {
  List<Sentence> sentences = document.getSentences();
  for (Sentence sentence : sentences) {
    Span[] contractionsSpan;
    // Guard the shared finder, which is not safe for concurrent use.
    synchronized (this.contractionFinder) {
      contractionsSpan = contractionFinder.find(TextUtils.tokensToString(sentence.getTokens()));
    }
    List<Token> newTokens = sentence.getTokens();
    // Walk the detected contractions from right to left so that the start
    // indices still to be processed remain valid while tokens are replaced.
    for (int i = contractionsSpan.length - 1; i >= 0; i--) {
      int start = contractionsSpan[i].getStart();
      String lexeme = sentence.getTokens().get(start).getLexeme();
      String[] contractions = ContractionUtility.expand(lexeme);
      Token original = newTokens.remove(start);
      if (contractions != null) {
        for (int j = contractions.length - 1; j >= 0; j--) {
          Token token = new TokenImpl(original.getStart(), original.getEnd(), contractions[j]);
          newTokens.add(start, token);
          // Mark the position of this part inside the expanded contraction:
          // B = first part, E = last part, I = intermediate part.
          String caze = null;
          if (j == 0) {
            caze = "B";
          } else if (j == contractions.length - 1) {
            caze = "E";
          } else {
            caze = "I";
          }
          token.addContext(Analyzers.CONTRACTION_FINDER, caze);
        }
      } else {
        LOGGER.debug("Missing contraction: " + lexeme);
      }
    }
    sentence.setTokens(newTokens);
  }
}
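The reverse iteration is the key design choice: replacing one token with several shifts every later index, so processing the spans from the last to the first keeps the earlier start offsets valid. A minimal, self-contained sketch of the same trick on a plain token list, with illustrative Portuguese contractions ("ao" = "a" + "o", "no" = "em" + "o") and hypothetical index data:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class ReverseExpansionSketch {
  public static void main(String[] args) {
    // Illustrative token list containing the contractions "ao" and "no".
    List<String> tokens = new ArrayList<>(Arrays.asList("Fui", "ao", "mercado", "no", "sábado"));
    int[] starts = { 1, 3 };                             // positions of the contractions
    String[][] parts = { { "a", "o" }, { "em", "o" } };  // their expansions

    // Process from the last span to the first so that inserting extra tokens
    // never shifts the start indices that are still to be processed.
    for (int i = starts.length - 1; i >= 0; i--) {
      int start = starts[i];
      tokens.remove(start);
      for (int j = parts[i].length - 1; j >= 0; j--) {
        tokens.add(start, parts[i][j]);
      }
    }
    System.out.println(tokens); // [Fui, a, o, mercado, em, o, sábado]
  }
}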
public List<Mistake> check(Sentence sentence) {
  List<Mistake> mistakes = new ArrayList<Mistake>();
  if (sentence.getTokens().size() < 2) {
    return mistakes;
  }
  for (int i = 0; i < sentence.getTokens().size(); i++) {
    Token originalToken = sentence.getTokens().get(i);
    String wanted = originalToken.getLexeme();
    String wantedLowerCase = wanted.toLowerCase();
    if (map.containsKey(wantedLowerCase)) {
      // Build an alternative sentence by splicing in the candidate word,
      // preserving the casing of the token being replaced.
      String candidate = RuleUtils.useCasedString(wanted, map.get(wantedLowerCase));
      String sentenceText = sentence.getText();
      String alternativeText = sentenceText.substring(0, originalToken.getStart())
          + candidate + sentenceText.substring(originalToken.getEnd());
      if (LOGGER.isDebugEnabled()) {
        LOGGER.debug("\n****** Alternative sentence **********:\n" + alternativeText);
      }
      Document alternative = new DocumentImpl(alternativeText);
      this.analyzer.analyze(alternative);
      Sentence alternativeSentence = alternative.getSentences().get(0);
      // Report a mistake only if the alternative is clearly more probable.
      if (alternativeSentence.getTokensProb() - sentence.getTokensProb() > 0.1) {
        if (LOGGER.isDebugEnabled()) {
          LOGGER.debug("Original prob:    " + sentence.getTokensProb());
          LOGGER.debug("Alternative prob: " + alternativeSentence.getTokensProb());
          LOGGER.debug("\n****** Possible correction **********:\n"
              + sentenceText + " -> " + alternativeText);
        }
        Token alternativeToken = alternativeSentence.getTokens().get(i);
        String[] suggestions = { candidate };
        String[] longMsgArgs = { wanted, translate(alternativeToken.getPOSTag()), candidate };
        String[] shortMsgArgs = { wanted, candidate };
        mistakes.add(createMistake(ID, longMsgArgs, shortMsgArgs, suggestions,
            originalToken.getStart(), originalToken.getEnd(), sentence.getText()));
      }
    }
  }
  return mistakes;
}
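The rule therefore reduces to two simple operations: splicing the candidate into the raw text over the token's character span, and accepting the suggestion only when the re-analyzed alternative beats the original probability by a margin (0.1 above). The sketch below shows just those two steps; the class, the sample sentence, and the probability values are illustrative assumptions, not taken from the code above.

public final class AlternativeSentenceSketch {

  // Acceptance margin, mirroring the 0.1 threshold used above.
  private static final double MARGIN = 0.1;

  // Replace the characters [start, end) of the sentence text with the candidate,
  // producing the alternative sentence that is then re-analyzed.
  static String splice(String text, int start, int end, String candidate) {
    return text.substring(0, start) + candidate + text.substring(end);
  }

  // Accept the suggestion only if the alternative is more probable by the margin.
  static boolean accept(double originalProb, double alternativeProb) {
    return alternativeProb - originalProb > MARGIN;
  }

  public static void main(String[] args) {
    // Illustrative mal/mau confusion: "mau" is the intended adjective here.
    String text = "Ele é um mal aluno";
    int start = text.indexOf("mal");
    int end = start + "mal".length();
    System.out.println(splice(text, start, end, "mau")); // Ele é um mau aluno
    System.out.println(accept(0.42, 0.61));              // true: above the margin
  }
}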