// TODO: roll check into tokens regex pattern? // That allows for better matching because unmatched sequences will be eliminated at match time private boolean checkPosTags(List<CoreLabel> tokens, int start, int end) { if (validPosPattern != null) { // Need to check POS tag too... switch (posMatchType) { case MATCH_ONE_TOKEN_PHRASE_ONLY: if (tokens.size() > 1) return true; // fall through case MATCH_AT_LEAST_ONE_TOKEN: for (int i = start; i < end; i++) { CoreLabel token = tokens.get(i); String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class); if (pos != null && validPosPattern.matcher(pos).matches()) { return true; } } return false; case MATCH_ALL_TOKENS: // Checked else where return true; default: // Don't know this match type.... return true; } } return true; }
private void annotateMatched(List<CoreLabel> tokens) { List<SequenceMatchResult<CoreMap>> matched = multiPatternMatcher.findNonOverlapping(tokens); for (SequenceMatchResult<CoreMap> m : matched) { Entry entry = patternToEntry.get(m.pattern()); // Check if we will overwrite the existing annotation with this annotation int g = entry.annotateGroup; int start = m.start(g); int end = m.end(g); boolean overwriteOriginalNer = checkPosTags(tokens, start, end); if (overwriteOriginalNer) { overwriteOriginalNer = checkOrigNerTags(entry, tokens, start, end); } if (overwriteOriginalNer) { for (int i = start; i < end; i++) { tokens.get(i).set(CoreAnnotations.NamedEntityTagAnnotation.class, entry.type); } } else { if (verbose) { System.err.println( "Not annotating '" + m.group(g) + "': " + StringUtils.joinFields( m.groupNodes(g), CoreAnnotations.NamedEntityTagAnnotation.class) + " with " + entry.type + ", sentence is '" + StringUtils.joinWords(tokens, " ") + '\''); } } } }
/**
 * Returns the {@code TextAnnotation} value of the element at {@code index}.
 *
 * @param list  elements to read from
 * @param index position of the element in {@code list}
 * @return the element's text annotation, as returned by its {@code get} call
 */
@SuppressWarnings("unused")
private static String getText(List<? extends CoreMap> list, int index) {
  final CoreMap element = list.get(index);
  return element.get(CoreAnnotations.TextAnnotation.class);
}
/**
 * Returns the {@code TimeExpression.Annotation} value of the element at {@code index}.
 *
 * @param list  elements to read from
 * @param index position of the element in {@code list}
 * @return the element's time expression, as returned by its {@code get} call
 */
private static TimeExpression getTimeExpression(List<? extends CoreMap> list, int index) {
  final CoreMap element = list.get(index);
  return element.get(TimeExpression.Annotation.class);
}
private boolean checkOrigNerTags(Entry entry, List<CoreLabel> tokens, int start, int end) { int prevNerEndIndex = start - 1; int nextNerStartIndex = end; // Check if we found a pattern that overlaps with existing ner labels // tag1 tag1 x x tag2 tag2 // tag tag tag tag // Don't overwrite the old ner label if we overlap like this String startNer = tokens.get(start).ner(); String endNer = tokens.get(end - 1).ner(); if (startNer != null && !myLabels.contains(startNer)) { while (prevNerEndIndex >= 0) { // go backwards to find different entity type String ner = tokens.get(prevNerEndIndex).ner(); if (ner == null || !ner.equals(startNer)) { break; } prevNerEndIndex--; } } if (endNer != null && !myLabels.contains(endNer)) { while (nextNerStartIndex < tokens.size()) { // go backwards to find different entity type String ner = tokens.get(nextNerStartIndex).ner(); if (ner == null || !ner.equals(endNer)) { break; } nextNerStartIndex++; } } boolean overwriteOriginalNer = false; //noinspection StatementWithEmptyBody if (prevNerEndIndex != (start - 1) || nextNerStartIndex != end) { // Cutting across already recognized NEs don't disturb } else if (startNer == null) { // No old ner, okay to replace overwriteOriginalNer = true; } else { // Check if we have one consistent NER tag // if not, overwrite // if consistent, overwrite only if in our set of ner tags that we overwrite for (int i = start + 1; i < end; i++) { if (!startNer.equals(tokens.get(i).ner())) { overwriteOriginalNer = true; break; } } if (!overwriteOriginalNer) { // check if old ner type was one that was specified as explicitly overwritable by this entry if (entry.overwritableTypes.contains(startNer)) { overwriteOriginalNer = true; } else { // if this ner type doesn't belong to the labels for which we don't overwrite the default // labels (noDefaultOverwriteLabels) // we check mylabels to see if we can overwrite this entry if ( /*entry.overwritableTypes.isEmpty() || */ !noDefaultOverwriteLabels.contains( entry.type)) { 
overwriteOriginalNer = myLabels.contains(startNer); } } } } return overwriteOriginalNer; }