/**
 * Merges the tokens covered by each span into a single grouped token whose lexeme is the
 * underscore-joined lexemes of the covered tokens. The list is modified in place and returned.
 *
 * @param text the original text (kept for signature compatibility; not read here)
 * @param toks the token list to modify in place
 * @param spans token-index spans (end exclusive) identifying groups to merge
 * @param additionalContext optional context attached to each merged token; may be null
 * @return the same {@code toks} list with grouped tokens spliced in
 */
public static List<Token> groupTokens( String text, List<Token> toks, List<? extends Span> spans, String additionalContext) {
  // Walk the spans back-to-front so that earlier token indices stay valid
  // while tokens are removed and re-inserted.
  for (int idx = spans.size() - 1; idx >= 0; idx--) {
    Span group = spans.get(idx);
    if (group.length() <= 0) {
      continue;
    }
    int first = group.getStart();
    int last = group.getEnd() - 1; // span end is exclusive
    // Character offsets of the merged token in the original text.
    int charStart = toks.get(first).getSpan().getStart();
    int charEnd = toks.get(last).getSpan().getEnd();
    // Join the covered lexemes with '_' separators.
    StringBuilder joined = new StringBuilder();
    for (int k = first; k < last; k++) {
      joined.append(toks.get(k).getLexeme()).append("_");
    }
    joined.append(toks.get(last).getLexeme());
    // Remove the covered tokens, highest index first, to keep indices stable.
    for (int k = last; k >= first; k--) {
      toks.remove(k);
    }
    Token merged = new TokenCogroo(joined.toString(), new Span(charStart, charEnd));
    if (group instanceof TypedSpan) {
      merged.setOriginalPOSTag(((TypedSpan) group).getType());
    }
    if (additionalContext != null) {
      merged.setAdditionalContext(additionalContext);
    }
    toks.add(first, merged);
  }
  return toks;
}
public static List<Token> groupTokensChar( String text, List<Token> toks, List<Span> charSpans, String additionalContext) { if (charSpans == null || charSpans.size() == 0) { return toks; } int lastVisitedTok = 0; List<Span> spans = new ArrayList<Span>(charSpans.size()); for (Span ch : charSpans) { // System.out.println("looking for: " + ch.getCoveredText(text)); Token aToken = toks.get(lastVisitedTok); while (aToken.getSpan().getStart() < ch.getStart()) { lastVisitedTok++; aToken = toks.get(lastVisitedTok); } int start = lastVisitedTok; while (aToken.getSpan().getEnd() < ch.getEnd()) { lastVisitedTok++; aToken = toks.get(lastVisitedTok); } int end = lastVisitedTok + 1; Span tokSpan = new Span(start, end); spans.add(tokSpan); } return groupTokens(text, toks, spans, additionalContext); }
/**
 * Detects sentences in the analysed text and registers them as sections.
 *
 * @param at the analysed text to segment
 * @param language language used to look up a sentence detector
 * @return the added sentence sections, or {@code null} when no sentence
 *     detector is available for the language (callers distinguish "no
 *     detector" from "no sentences found")
 */
private List<Section> detectSentences(AnalysedText at, String language) {
  SentenceDetector sentenceDetector = getSentenceDetector(language);
  if (sentenceDetector == null) {
    // Null (not empty) signals that detection was not attempted.
    return null;
  }
  List<Section> sentences = new ArrayList<Section>();
  for (opennlp.tools.util.Span sentSpan : sentenceDetector.sentPosDetect(at.getSpan())) {
    Sentence sentence = at.addSentence(sentSpan.getStart(), sentSpan.getEnd());
    log.trace(" > add {}", sentence);
    sentences.add(sentence);
  }
  return sentences;
}
/** * Extracts location names from unstructured text using the named entity recognizer (NER) feature * provided by the Apache OpenNLP Name Finder. * * @param plainText Contents of text document * @return List of location names and positions */ public List<LocationOccurrence> extractLocationNames(String plainText) { if (plainText == null) { throw new IllegalArgumentException( "plaintext input to extractLocationNames should not be null"); } List<LocationOccurrence> nerResults = new ArrayList<LocationOccurrence>(); // The values used in these Spans are string character offsets Span sentenceSpans[] = sentenceDetector.sentPosDetect(plainText); // Each sentence gets processed on its own for (Span sentenceSpan : sentenceSpans) { // find the start and end position of this sentence in the document String sentence = plainText.substring(sentenceSpan.getStart(), sentenceSpan.getEnd()); // tokenize the text into the required OpenNLP format String[] tokens = tokenizer.tokenize(sentence); // the values used in these Spans are string character offsets of each token from the sentence // beginning Span[] tokenPositionsWithinSentence = tokenizer.tokenizePos(sentence); // find the location names in the tokenized text // the values used in these Spans are NOT string character offsets, they are indices into the // 'tokens' array Span names[] = nameFinder.find(tokens); // for each name that got found, create our corresponding occurrence for (Span name : names) { // find offsets relative to the start of the sentence int beginningOfFirstWord = tokenPositionsWithinSentence[name.getStart()].getStart(); int endOfLastWord = tokenPositionsWithinSentence[name.getEnd() - 1] .getEnd(); // -1 because the high end of a Span is noninclusiv // to get offsets relative to the document as a whole, just add the offset for the sentence // itself int startOffsetInDoc = sentenceSpan.getStart() + beginningOfFirstWord; int endOffsetInDoc = sentenceSpan.getStart() + endOfLastWord; // look back into the original 
input string to figure out what the text is that I got a hit // on String nameInDocument = plainText.substring(startOffsetInDoc, endOffsetInDoc); // add to List of results to return nerResults.add(new LocationOccurrence(nameInDocument, startOffsetInDoc)); } } // this is necessary to maintain consistent results across // multiple runs on the same data, which is what we want nameFinder.clearAdaptiveData(); return nerResults; }
/**
 * Tokenizes the document text and adds one Token annotation per token to the
 * configured annotation set, carrying the covered string as a feature.
 *
 * @throws ExecutionException if the tokenizer fails on the document content
 */
public void execute() throws ExecutionException {
  AnnotationSet outputAS = document.getAnnotations(annotationSetName);
  String text = document.getContent().toString();
  Span[] tokens = tokenizer.getTokens(text);
  try {
    for (Span token : tokens) {
      int begin = token.getStart();
      int end = token.getEnd();
      // Mirror ANNIE's token annotations: store the covered string as a feature.
      FeatureMap features = Factory.newFeatureMap();
      features.put(ANNIEConstants.TOKEN_STRING_FEATURE_NAME, text.substring(begin, end));
      outputAS.add((long) begin, (long) end, ANNIEConstants.TOKEN_ANNOTATION_TYPE, features);
    }
  } catch (Exception e) {
    throw new ExecutionException("error running tokenizer", e);
  }
}
/** * Adds named entity information to parses. * * @param tag named entity type * @param names spans of tokens that are named entities * @param tokens parses for the tokens */ private static void addNames(String tag, List names, Parse[] tokens) { for (int i = 0; i < names.size(); i++) { Span nameTokenSpan = (Span) names.get(i); Parse startToken = tokens[nameTokenSpan.getStart()]; Parse endToken = tokens[nameTokenSpan.getEnd()]; Parse commonP = startToken.getCommonParent(endToken); if (commonP != null) { Span nameSpan = new Span(startToken.getSpan().getStart(), endToken.getSpan().getEnd()); if (nameSpan.equals(commonP.getSpan())) { // common parent matches exactly the named entity commonP.insert(new Parse(commonP.getText(), nameSpan, tag, 1.0)); } else { // common parent includes the named entity Parse[] kids = commonP.getChildren(); boolean crossingKids = false; for (int j = 0; j < kids.length; j++) if (nameSpan.crosses(kids[j].getSpan())) crossingKids = true; if (!crossingKids) { // named entity does not cross children commonP.insert(new Parse(commonP.getText(), nameSpan, tag, 1.0)); } else { // NE crosses children if (commonP.getType().equals("NP")) { Parse[] grandKids = kids[0].getChildren(); Parse last = grandKids[grandKids.length - 1]; if (grandKids.length > 1 && nameSpan.contains(last.getSpan())) commonP.insert(new Parse(commonP.getText(), commonP.getSpan(), tag, 1.0)); } } } } } }
/**
 * Splits {@code s} into its token strings by delegating to {@link #tokenizePos}
 * and materializing each span as a substring.
 *
 * @param s the text to tokenize
 * @return the token strings covered by the detected spans
 */
public String[] tokenize(String s) {
  Span[] tokenSpans = tokenizePos(s);
  return Span.spansToStrings(tokenSpans, s);
}