// TODO: roll check into tokens regex pattern?
// That allows for better matching because unmatched sequences will be eliminated at match time
private boolean checkPosTags(List<CoreLabel> tokens, int start, int end) {
  if (validPosPattern != null) {
    // Need to check the POS tag too...
    switch (posMatchType) {
      case MATCH_ONE_TOKEN_PHRASE_ONLY:
        if (tokens.size() > 1) return true;
        // fall through
      case MATCH_AT_LEAST_ONE_TOKEN:
        for (int i = start; i < end; i++) {
          CoreLabel token = tokens.get(i);
          String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
          if (pos != null && validPosPattern.matcher(pos).matches()) {
            return true;
          }
        }
        return false;
      case MATCH_ALL_TOKENS:
        // Checked elsewhere
        return true;
      default:
        // Don't know this match type....
        return true;
    }
  }
  return true;
}
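// Illustrative example (pattern value assumed, not a project default): with
// posMatchType = MATCH_AT_LEAST_ONE_TOKEN and validPosPattern = Pattern.compile("NN.*"),
// a match over "New York" (tagged NNP NNP) passes this check, while a match over
// "of the" (tagged IN DT) is rejected.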
private List<CoreMap> toCoreMaps(
    CoreMap annotation, List<TimeExpression> timeExpressions, SUTime.TimeIndex timeIndex) {
  if (timeExpressions == null) return null;
  List<CoreMap> coreMaps = new ArrayList<CoreMap>(timeExpressions.size());
  for (TimeExpression te : timeExpressions) {
    CoreMap cm = te.getAnnotation();
    SUTime.Temporal temporal = te.getTemporal();
    if (temporal != null) {
      String origText = annotation.get(CoreAnnotations.TextAnnotation.class);
      String text = cm.get(CoreAnnotations.TextAnnotation.class);
      if (origText != null) {
        // Make sure the text is from the original text (and not from concatenated tokens)
        ChunkAnnotationUtils.annotateChunkText(cm, annotation);
        text = cm.get(CoreAnnotations.TextAnnotation.class);
      }
      Map<String, String> timexAttributes;
      try {
        timexAttributes = temporal.getTimexAttributes(timeIndex);
        if (options.includeRange) {
          SUTime.Temporal rangeTemporal = temporal.getRange();
          if (rangeTemporal != null) {
            timexAttributes.put("range", rangeTemporal.toString());
          }
        }
      } catch (Exception e) {
        logger.log(Level.WARNING,
            "Failed to get attributes from " + text + ", timeIndex " + timeIndex, e);
        continue;
      }
      Timex timex;
      try {
        timex = Timex.fromMap(text, timexAttributes);
      } catch (Exception e) {
        logger.log(Level.WARNING,
            "Failed to process " + text + " with attributes " + timexAttributes, e);
        continue;
      }
      if (timex != null) {
        // Only annotate and keep the chunk if we got a valid Timex
        cm.set(TimexAnnotation.class, timex);
        coreMaps.add(cm);
      } else {
        logger.warning("No timex expression for: " + text);
      }
    }
  }
  return coreMaps;
}
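// Illustrative attribute map (values assumed): for the phrase "July 14" with an anchor date
// of 2013-07-14, getTimexAttributes would yield TIMEX3-style entries along the lines of
// {tid=t1, type=DATE, value=2013-07-14}, which Timex.fromMap turns into a Timex object.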
public SUTime.Temporal apply(MatchResult in) {
  if (in instanceof SequenceMatchResult) {
    SequenceMatchResult<CoreMap> mr = (SequenceMatchResult<CoreMap>) (in);
    if (group >= 0) {
      List<? extends CoreMap> matched = mr.groupNodes(group);
      if (matched != null) {
        // A negative nodeIndex counts from the end of the matched nodes
        int i = (nodeIndex >= 0) ? nodeIndex : (matched.size() + nodeIndex);
        TimeExpression te = getTimeExpression(matched, i);
        if (te != null) {
          return te.getTemporal();
        }
      }
    }
  }
  return null;
}
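// Illustrative: with nodeIndex = -1, i resolves to matched.size() - 1, so the temporal
// value is taken from the last node of the matched group.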
private List<TimeExpression> filterInvalidTimeExpressions(List<TimeExpression> timeExprs) {
  int nfiltered = 0;
  List<TimeExpression> filtered =
      new ArrayList<TimeExpression>(timeExprs.size()); // Approximate size
  for (TimeExpression timeExpr : timeExprs) {
    if (timexPatterns.checkTimeExpression(timeExpr)) {
      filtered.add(timeExpr);
    } else {
      nfiltered++;
    }
  }
  if (nfiltered > 0) {
    logger.finest("Filtered " + nfiltered);
  }
  return filtered;
}
private void annotateMatched(List<CoreLabel> tokens) {
  List<SequenceMatchResult<CoreMap>> matched = multiPatternMatcher.findNonOverlapping(tokens);
  for (SequenceMatchResult<CoreMap> m : matched) {
    Entry entry = patternToEntry.get(m.pattern());
    // Check if we will overwrite the existing annotation with this annotation
    int g = entry.annotateGroup;
    int start = m.start(g);
    int end = m.end(g);
    boolean overwriteOriginalNer = checkPosTags(tokens, start, end);
    if (overwriteOriginalNer) {
      overwriteOriginalNer = checkOrigNerTags(entry, tokens, start, end);
    }
    if (overwriteOriginalNer) {
      for (int i = start; i < end; i++) {
        tokens.get(i).set(CoreAnnotations.NamedEntityTagAnnotation.class, entry.type);
      }
    } else if (verbose) {
      System.err.println("Not annotating '" + m.group(g) + "': "
          + StringUtils.joinFields(m.groupNodes(g), CoreAnnotations.NamedEntityTagAnnotation.class)
          + " with " + entry.type + ", sentence is '" + StringUtils.joinWords(tokens, " ") + '\'');
    }
  }
}
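// Illustrative (hypothetical entry): if a pattern for "Stanford University" matches two
// tokens currently tagged O and the entry type is ORGANIZATION, both tokens are retagged
// ORGANIZATION, provided the POS and original-NER checks above allow the overwrite.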
/**
 * Creates a combined list of Entries using the provided mapping files.
 *
 * @param mappings List of mapping files
 * @return list of Entries
 */
private static List<Entry> readEntries(
    String annotatorName,
    Set<String> noDefaultOverwriteLabels,
    boolean ignoreCase,
    boolean verbose,
    String... mappings) {
  // Unlike RegexNERClassifier, we don't bother sorting the entries.
  // We leave it to TokensRegex NER to sort out the priorities and matches
  // (typically after all the matches have been made, since for some TokensRegex expressions
  // we don't know how many tokens are matched until after the matching is done).
  List<Entry> entries = new ArrayList<>();
  TrieMap<String, Entry> seenRegexes = new TrieMap<>();
  Arrays.sort(mappings);
  for (String mapping : mappings) {
    BufferedReader rd = null;
    try {
      rd = IOUtils.readerFromString(mapping);
      readEntries(annotatorName, entries, seenRegexes, mapping, rd,
          noDefaultOverwriteLabels, ignoreCase, verbose);
    } catch (IOException e) {
      throw new RuntimeIOException("Couldn't read TokensRegexNER from " + mapping, e);
    } finally {
      IOUtils.closeIgnoringExceptions(rd);
    }
  }
  if (mappings.length != 1) {
    logger.log("TokensRegexNERAnnotator " + annotatorName + ": Read " + entries.size()
        + " unique entries from " + mappings.length + " files");
  }
  return entries;
}
public List<TimeExpression> extractTimeExpressions(CoreMap annotation, String docDateStr) {
  List<CoreMap> mergedNumbers = NumberNormalizer.findAndMergeNumbers(annotation);
  annotation.set(CoreAnnotations.NumerizedTokensAnnotation.class, mergedNumbers);

  // TODO: docDate may not have a century....
  SUTime.Time docDate = timexPatterns.parseDateTime(docDateStr);

  List<? extends MatchedExpression> matchedExpressions =
      expressionExtractor.extractExpressions(annotation);
  List<TimeExpression> timeExpressions =
      new ArrayList<TimeExpression>(matchedExpressions.size());
  for (MatchedExpression expr : matchedExpressions) {
    if (expr instanceof TimeExpression) {
      timeExpressions.add((TimeExpression) expr);
    } else {
      timeExpressions.add(new TimeExpression(expr));
    }
  }

  // Add back nested time expressions for ranges....
  // For now, only one level of nesting...
  if (options.includeNested) {
    List<TimeExpression> nestedTimeExpressions = new ArrayList<TimeExpression>();
    for (TimeExpression te : timeExpressions) {
      if (te.isIncludeNested()) {
        List<? extends CoreMap> children =
            te.getAnnotation().get(TimeExpression.ChildrenAnnotation.class);
        if (children != null) {
          for (CoreMap child : children) {
            TimeExpression childTe = child.get(TimeExpression.Annotation.class);
            if (childTe != null) {
              nestedTimeExpressions.add(childTe);
            }
          }
        }
      }
    }
    timeExpressions.addAll(nestedTimeExpressions);
  }
  Collections.sort(timeExpressions, MatchedExpression.EXPR_TOKEN_OFFSETS_NESTED_FIRST_COMPARATOR);
  timeExpressions = filterInvalidTimeExpressions(timeExpressions);
  // Some resolving is done even if docDate is null...
  if ( /*docDate != null && */ timeExpressions != null) {
    resolveTimeExpressions(annotation, timeExpressions, docDate);
  }
  // Annotate timex
  return timeExpressions;
}
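// Illustrative usage (variable names assumed): for a sentence-level CoreMap containing
// "We met last Friday" and a document date of "2013-07-14",
//   List<TimeExpression> exprs = extractTimeExpressions(sentence, "2013-07-14");
// returns the expression for "last Friday", resolved relative to the document date.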
private MultiPatternMatcher<CoreMap> createPatternMatcher(
    Map<SequencePattern<CoreMap>, Entry> patternToEntry) {
  // Convert to tokensregex pattern
  int patternFlags = ignoreCase ? Pattern.CASE_INSENSITIVE : 0;
  int stringMatchFlags = ignoreCase ? NodePattern.CASE_INSENSITIVE : 0;
  Env env = TokenSequencePattern.getNewEnv();
  env.setDefaultStringPatternFlags(patternFlags);
  env.setDefaultStringMatchFlags(stringMatchFlags);
  NodePattern<String> posTagPattern =
      (validPosPattern != null && PosMatchType.MATCH_ALL_TOKENS.equals(posMatchType))
          ? new CoreMapNodePattern.StringAnnotationRegexPattern(validPosPattern)
          : null;
  List<TokenSequencePattern> patterns = new ArrayList<>(entries.size());
  for (Entry entry : entries) {
    TokenSequencePattern pattern;
    if (entry.tokensRegex != null) {
      // TODO: posTagPatterns...
      pattern = TokenSequencePattern.compile(env, entry.tokensRegex);
    } else {
      List<SequencePattern.PatternExpr> nodePatterns = new ArrayList<>();
      for (String p : entry.regex) {
        CoreMapNodePattern c = CoreMapNodePattern.valueOf(p, patternFlags);
        if (posTagPattern != null) {
          c.add(CoreAnnotations.PartOfSpeechAnnotation.class, posTagPattern);
        }
        nodePatterns.add(new SequencePattern.NodePatternExpr(c));
      }
      pattern = TokenSequencePattern.compile(new SequencePattern.SequencePatternExpr(nodePatterns));
    }
    if (entry.annotateGroup < 0 || entry.annotateGroup > pattern.getTotalGroups()) {
      throw new RuntimeException("Invalid match group for entry " + entry);
    }
    pattern.setPriority(entry.priority);
    patterns.add(pattern);
    patternToEntry.put(pattern, entry);
  }
  return TokenSequencePattern.getMultiPatternMatcher(patterns);
}
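// Illustrative (hypothetical entry): entry.regex = {"New", "York"} compiles into a sequence
// of two CoreMapNodePatterns, one per token; when posMatchType is MATCH_ALL_TOKENS and
// validPosPattern is set, each node additionally carries the POS constraint, so every
// matched token must also have a POS tag accepted by validPosPattern.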
@SuppressWarnings("unused") private static String getText(List<? extends CoreMap> list, int index) { return list.get(index).get(CoreAnnotations.TextAnnotation.class); }
private static TimeExpression getTimeExpression(List<? extends CoreMap> list, int index) {
  return list.get(index).get(TimeExpression.Annotation.class);
}
/**
 * Reads a list of Entries from a mapping file and updates the given entries. Line numbers start
 * from 1.
 *
 * @return the updated list of Entries
 */
private static List<Entry> readEntries(
    String annotatorName,
    List<Entry> entries,
    TrieMap<String, Entry> seenRegexes,
    String mappingFilename,
    BufferedReader mapping,
    Set<String> noDefaultOverwriteLabels,
    boolean ignoreCase,
    boolean verbose)
    throws IOException {
  int origEntriesSize = entries.size();
  int isTokensRegex = 0;
  int lineCount = 0;
  for (String line; (line = mapping.readLine()) != null; ) {
    lineCount++;
    String[] split = line.split("\t");
    if (split.length < 2 || split.length > 5) {
      throw new IllegalArgumentException(
          "Provided mapping file is in wrong format. This line is bad: " + line);
    }
    String regex = split[0].trim();
    String tokensRegex = null;
    String[] regexes = null;
    if (regex.startsWith("( ") && regex.endsWith(" )")) {
      // Tokens regex (remove start and end parentheses)
      tokensRegex = regex.substring(1, regex.length() - 1).trim();
    } else {
      regexes = regex.split("\\s+");
    }
    String[] key = (regexes != null) ? regexes : new String[] {tokensRegex};
    if (ignoreCase) {
      String[] norm = new String[key.length];
      for (int i = 0; i < key.length; i++) {
        norm[i] = key[i].toLowerCase();
      }
      key = norm;
    }
    String type = split[1].trim();
    Set<String> overwritableTypes = Generics.newHashSet();
    double priority = 0.0;
    if (split.length >= 3) {
      overwritableTypes.addAll(Arrays.asList(split[2].trim().split("\\s*,\\s*")));
    }
    if (split.length >= 4) {
      try {
        priority = Double.parseDouble(split[3].trim());
      } catch (NumberFormatException e) {
        throw new IllegalArgumentException(
            "ERROR: Invalid priority in line " + lineCount + " in regexner file "
                + mappingFilename + ": \"" + line + "\"!", e);
      }
    }
    int annotateGroup = 0;
    // Get the annotate group from the input....
    if (split.length >= 5) {
      // Which group to take (allows for context)
      String context = split[4].trim();
      try {
        annotateGroup = Integer.parseInt(context);
      } catch (NumberFormatException e) {
        throw new IllegalArgumentException(
            "ERROR: Invalid group in line " + lineCount + " in regexner file "
                + mappingFilename + ": \"" + line + "\"!", e);
      }
    }

    // Print a warning if the entry specifies multiple types
    int commaPos = type.indexOf(',');
    if (commaPos > 0) {
      // Strip the "," and just take the first type
      String newType = type.substring(0, commaPos).trim();
      logger.warn("TokensRegexNERAnnotator " + annotatorName + ": Entry has multiple types: "
          + line + ". Taking type to be " + newType);
      type = newType;
    }
    Entry entry =
        new Entry(tokensRegex, regexes, type, overwritableTypes, priority, annotateGroup);

    if (seenRegexes.containsKey(key)) {
      Entry oldEntry = seenRegexes.get(key);
      if (priority > oldEntry.priority) {
        logger.warn("TokensRegexNERAnnotator " + annotatorName
            + ": Replace duplicate entry (higher priority): old=" + oldEntry + ", new=" + entry);
      } else {
        if (!oldEntry.type.equals(type)) {
          if (verbose) {
            logger.warn("TokensRegexNERAnnotator " + annotatorName
                + ": Ignoring duplicate entry: " + split[0] + ", old type = " + oldEntry.type
                + ", new type = " + type);
          }
          // } else {
          //   if (verbose) {
          //     logger.warn("TokensRegexNERAnnotator " + annotatorName
          //         + ": Duplicate entry [ignored]: " + split[0] + ", old type = "
          //         + oldEntry.type + ", new type = " + type);
          //   }
        }
        continue;
      }
    }

    // Print a warning if the label belongs to noDefaultOverwriteLabels but there are no
    // overwritable types
    if (entry.overwritableTypes.isEmpty() && noDefaultOverwriteLabels.contains(entry.type)) {
      logger.warn("TokensRegexNERAnnotator " + annotatorName
          + ": Entry doesn't have overwritable types " + entry
          + ", but entry type is in noDefaultOverwriteLabels");
    }
    entries.add(entry);
    seenRegexes.put(key, entry);
    if (entry.tokensRegex != null) isTokensRegex++;
  }

  logger.log("TokensRegexNERAnnotator " + annotatorName + ": Read "
      + (entries.size() - origEntriesSize) + " unique entries out of " + lineCount + " from "
      + mappingFilename + ", " + isTokensRegex + " TokensRegex patterns.");
  return entries;
}
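// Illustrative mapping file lines (tab-separated fields: regex, type, overwritable types,
// priority, annotate group; <TAB> marks a tab character, and the values are made up):
//   Los Angeles<TAB>CITY<TAB>LOCATION,MISC<TAB>1.0
//   ( /University/ /of/ [ {word:/[A-Z]\w+/} ] )<TAB>ORGANIZATION<TAB><TAB>1.0<TAB>0
// The second line uses the TokensRegex form, signaled by the "( ... )" wrapper.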
private boolean checkOrigNerTags(Entry entry, List<CoreLabel> tokens, int start, int end) {
  int prevNerEndIndex = start - 1;
  int nextNerStartIndex = end;
  // Check if we found a pattern that overlaps with existing ner labels
  // tag1 tag1 x   x   tag2 tag2
  //      tag  tag tag tag
  // Don't overwrite the old ner label if we overlap like this
  String startNer = tokens.get(start).ner();
  String endNer = tokens.get(end - 1).ner();
  if (startNer != null && !myLabels.contains(startNer)) {
    while (prevNerEndIndex >= 0) {
      // go backwards to find a different entity type
      String ner = tokens.get(prevNerEndIndex).ner();
      if (ner == null || !ner.equals(startNer)) {
        break;
      }
      prevNerEndIndex--;
    }
  }
  if (endNer != null && !myLabels.contains(endNer)) {
    while (nextNerStartIndex < tokens.size()) {
      // go forwards to find a different entity type
      String ner = tokens.get(nextNerStartIndex).ner();
      if (ner == null || !ner.equals(endNer)) {
        break;
      }
      nextNerStartIndex++;
    }
  }
  boolean overwriteOriginalNer = false;
  //noinspection StatementWithEmptyBody
  if (prevNerEndIndex != (start - 1) || nextNerStartIndex != end) {
    // Cutting across already recognized NEs; don't disturb them
  } else if (startNer == null) {
    // No old ner, okay to replace
    overwriteOriginalNer = true;
  } else {
    // Check if we have one consistent NER tag:
    // if not, overwrite;
    // if consistent, overwrite only if it is in our set of ner tags that we overwrite
    for (int i = start + 1; i < end; i++) {
      if (!startNer.equals(tokens.get(i).ner())) {
        overwriteOriginalNer = true;
        break;
      }
    }
    if (!overwriteOriginalNer) {
      // Check if the old ner type was one that was specified as explicitly overwritable
      // by this entry
      if (entry.overwritableTypes.contains(startNer)) {
        overwriteOriginalNer = true;
      } else {
        // If this ner type doesn't belong to the labels for which we don't overwrite the
        // default labels (noDefaultOverwriteLabels),
        // we check myLabels to see if we can overwrite this entry
        if ( /*entry.overwritableTypes.isEmpty() || */ !noDefaultOverwriteLabels.contains(entry.type)) {
          overwriteOriginalNer = myLabels.contains(startNer);
        }
      }
    }
  }
  return overwriteOriginalNer;
}
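// Illustrative example: with tokens tagged [PERSON, PERSON, O, O] and a match spanning
// tokens 1-3, startNer is PERSON and token 0 carries the same tag, so prevNerEndIndex
// moves below start - 1 and the match is rejected rather than splitting the PERSON entity.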