/** * Extracts location names from unstructured text using the named entity recognizer (NER) feature * provided by the Apache OpenNLP Name Finder. * * @param plainText Contents of text document * @return List of location names and positions */ public List<LocationOccurrence> extractLocationNames(String plainText) { if (plainText == null) { throw new IllegalArgumentException( "plaintext input to extractLocationNames should not be null"); } List<LocationOccurrence> nerResults = new ArrayList<LocationOccurrence>(); // The values used in these Spans are string character offsets Span sentenceSpans[] = sentenceDetector.sentPosDetect(plainText); // Each sentence gets processed on its own for (Span sentenceSpan : sentenceSpans) { // find the start and end position of this sentence in the document String sentence = plainText.substring(sentenceSpan.getStart(), sentenceSpan.getEnd()); // tokenize the text into the required OpenNLP format String[] tokens = tokenizer.tokenize(sentence); // the values used in these Spans are string character offsets of each token from the sentence // beginning Span[] tokenPositionsWithinSentence = tokenizer.tokenizePos(sentence); // find the location names in the tokenized text // the values used in these Spans are NOT string character offsets, they are indices into the // 'tokens' array Span names[] = nameFinder.find(tokens); // for each name that got found, create our corresponding occurrence for (Span name : names) { // find offsets relative to the start of the sentence int beginningOfFirstWord = tokenPositionsWithinSentence[name.getStart()].getStart(); int endOfLastWord = tokenPositionsWithinSentence[name.getEnd() - 1] .getEnd(); // -1 because the high end of a Span is noninclusiv // to get offsets relative to the document as a whole, just add the offset for the sentence // itself int startOffsetInDoc = sentenceSpan.getStart() + beginningOfFirstWord; int endOffsetInDoc = sentenceSpan.getStart() + endOfLastWord; // look back into the original 
input string to figure out what the text is that I got a hit // on String nameInDocument = plainText.substring(startOffsetInDoc, endOffsetInDoc); // add to List of results to return nerResults.add(new LocationOccurrence(nameInDocument, startOffsetInDoc)); } } // this is necessary to maintain consistent results across // multiple runs on the same data, which is what we want nameFinder.clearAdaptiveData(); return nerResults; }
/**
 * Expands the contractions found in every sentence of the given document.
 *
 * <p>For each sentence the contraction finder is run over the token strings. Every token it
 * flags is looked up in {@link ContractionUtility#expand}; when an expansion exists, the token is
 * replaced in place by one new token per expanded part. Each new token reuses the start and end
 * of the token it replaces and is tagged under {@code Analyzers.CONTRACTION_FINDER} with
 * {@code "B"} for the first part, {@code "E"} for the last part and {@code "I"} for any part in
 * between (a single-part expansion is tagged {@code "B"}).
 *
 * <p>When no expansion is known for a flagged token, the token is left untouched and a debug
 * message is logged.
 *
 * @param document the document whose sentences are updated in place
 */
public void analyze(Document document) {
  for (Sentence sentence : document.getSentences()) {
    Span[] contractionsSpan;
    // Calls into the finder are serialized on the finder instance itself.
    synchronized (this.contractionFinder) {
      contractionsSpan = contractionFinder.find(TextUtils.tokensToString(sentence.getTokens()));
    }

    List<Token> newTokens = sentence.getTokens();

    // Walk the spans from last to first, as the original loop does; presumably so that
    // insertions do not shift the indices of spans still to be handled
    // (NOTE(review): this relies on the finder returning spans in ascending order).
    for (int i = contractionsSpan.length - 1; i >= 0; i--) {
      int start = contractionsSpan[i].getStart();

      // Read the lexeme from the very token that will be replaced.
      Token original = newTokens.get(start);
      String lexeme = original.getLexeme();
      String[] contractions = ContractionUtility.expand(lexeme);

      if (contractions == null) {
        // No known expansion: keep the original token instead of dropping it from the sentence.
        LOGGER.debug("Missing contraction: " + lexeme);
        continue;
      }

      newTokens.remove(start);

      // Insert the parts back to front at the same index so they end up in natural order.
      int last = contractions.length - 1;
      for (int j = last; j >= 0; j--) {
        Token token = new TokenImpl(original.getStart(), original.getEnd(), contractions[j]);
        newTokens.add(start, token);

        String caze;
        if (j == 0) {
          caze = "B";
        } else if (j == last) {
          caze = "E";
        } else {
          caze = "I";
        }
        token.addContext(Analyzers.CONTRACTION_FINDER, caze);
      }
    }

    sentence.setTokens(newTokens);
  }
}