Ejemplo n.º 1
0
 // TODO: roll check into tokens regex pattern?
 // That allows for better matching because unmatched sequences will be eliminated at match time
 private boolean checkPosTags(List<CoreLabel> tokens, int start, int end) {
   if (validPosPattern != null) {
     // Need to check POS tag too...
     switch (posMatchType) {
       case MATCH_ONE_TOKEN_PHRASE_ONLY:
         if (tokens.size() > 1) return true;
         // fall through
       case MATCH_AT_LEAST_ONE_TOKEN:
         for (int i = start; i < end; i++) {
           CoreLabel token = tokens.get(i);
           String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
           if (pos != null && validPosPattern.matcher(pos).matches()) {
             return true;
           }
         }
         return false;
       case MATCH_ALL_TOKENS:
         // Checked else where
         return true;
       default:
         // Don't know this match type....
         return true;
     }
   }
   return true;
 }
 private List<CoreMap> toCoreMaps(
     CoreMap annotation, List<TimeExpression> timeExpressions, SUTime.TimeIndex timeIndex) {
   if (timeExpressions == null) return null;
   List<CoreMap> coreMaps = new ArrayList<CoreMap>(timeExpressions.size());
   for (TimeExpression te : timeExpressions) {
     CoreMap cm = te.getAnnotation();
     SUTime.Temporal temporal = te.getTemporal();
     if (temporal != null) {
       String origText = annotation.get(CoreAnnotations.TextAnnotation.class);
       String text = cm.get(CoreAnnotations.TextAnnotation.class);
       if (origText != null) {
         // Make sure the text is from original (and not from concatenated tokens)
         ChunkAnnotationUtils.annotateChunkText(cm, annotation);
         text = cm.get(CoreAnnotations.TextAnnotation.class);
       }
       Map<String, String> timexAttributes;
       try {
         timexAttributes = temporal.getTimexAttributes(timeIndex);
         if (options.includeRange) {
           SUTime.Temporal rangeTemporal = temporal.getRange();
           if (rangeTemporal != null) {
             timexAttributes.put("range", rangeTemporal.toString());
           }
         }
       } catch (Exception e) {
         logger.log(
             Level.WARNING,
             "Failed to get attributes from " + text + ", timeIndex " + timeIndex,
             e);
         continue;
       }
       Timex timex;
       try {
         timex = Timex.fromMap(text, timexAttributes);
       } catch (Exception e) {
         logger.log(
             Level.WARNING,
             "Failed to process " + text + " with attributes " + timexAttributes,
             e);
         continue;
       }
       cm.set(TimexAnnotation.class, timex);
       if (timex != null) {
         coreMaps.add(cm);
       } else {
         logger.warning("No timex expression for: " + text);
       }
     }
   }
   return coreMaps;
 }
 public SUTime.Temporal apply(MatchResult in) {
   if (in instanceof SequenceMatchResult) {
     SequenceMatchResult<CoreMap> mr = (SequenceMatchResult<CoreMap>) (in);
     if (group >= 0) {
       List<? extends CoreMap> matched = mr.groupNodes(group);
       if (matched != null) {
         int i = (nodeIndex >= 0) ? 0 : (matched.size() + nodeIndex);
         TimeExpression te = getTimeExpression(matched, i);
         if (te != null) {
           return te.getTemporal();
         }
       }
     }
   }
   return null;
 }
 private List<TimeExpression> filterInvalidTimeExpressions(List<TimeExpression> timeExprs) {
   int nfiltered = 0;
   List<TimeExpression> filtered =
       new ArrayList<TimeExpression>(timeExprs.size()); // Approximate size
   for (TimeExpression timeExpr : timeExprs) {
     if (timexPatterns.checkTimeExpression(timeExpr)) {
       filtered.add(timeExpr);
     } else {
       nfiltered++;
     }
   }
   if (nfiltered > 0) {
     logger.finest("Filtered " + nfiltered);
   }
   return filtered;
 }
Ejemplo n.º 5
0
  private void annotateMatched(List<CoreLabel> tokens) {
    List<SequenceMatchResult<CoreMap>> matched = multiPatternMatcher.findNonOverlapping(tokens);
    for (SequenceMatchResult<CoreMap> m : matched) {
      Entry entry = patternToEntry.get(m.pattern());

      // Check if we will overwrite the existing annotation with this annotation
      int g = entry.annotateGroup;
      int start = m.start(g);
      int end = m.end(g);

      boolean overwriteOriginalNer = checkPosTags(tokens, start, end);
      if (overwriteOriginalNer) {
        overwriteOriginalNer = checkOrigNerTags(entry, tokens, start, end);
      }
      if (overwriteOriginalNer) {
        for (int i = start; i < end; i++) {
          tokens.get(i).set(CoreAnnotations.NamedEntityTagAnnotation.class, entry.type);
        }
      } else {
        if (verbose) {
          System.err.println(
              "Not annotating  '"
                  + m.group(g)
                  + "': "
                  + StringUtils.joinFields(
                      m.groupNodes(g), CoreAnnotations.NamedEntityTagAnnotation.class)
                  + " with "
                  + entry.type
                  + ", sentence is '"
                  + StringUtils.joinWords(tokens, " ")
                  + '\'');
        }
      }
    }
  }
Ejemplo n.º 6
0
  /**
   * Creates a combined list of Entries using the provided mapping files.
   *
   * @param mappings List of mapping files
   * @return list of Entries
   */
  private static List<Entry> readEntries(
      String annotatorName,
      Set<String> noDefaultOverwriteLabels,
      boolean ignoreCase,
      boolean verbose,
      String... mappings) {
    // Unlike RegexNERClassifier, we don't bother sorting the entries
    // We leave it to TokensRegex NER to sort out the priorities and matches
    //   (typically after all the matches has been made since for some TokenRegex expression,
    //       we don't know how many tokens are matched until after the matching is done)
    List<Entry> entries = new ArrayList<>();
    TrieMap<String, Entry> seenRegexes = new TrieMap<>();
    Arrays.sort(mappings);
    for (String mapping : mappings) {
      BufferedReader rd = null;
      try {
        rd = IOUtils.readerFromString(mapping);
        readEntries(
            annotatorName,
            entries,
            seenRegexes,
            mapping,
            rd,
            noDefaultOverwriteLabels,
            ignoreCase,
            verbose);
      } catch (IOException e) {
        throw new RuntimeIOException("Couldn't read TokensRegexNER from " + mapping, e);
      } finally {
        IOUtils.closeIgnoringExceptions(rd);
      }
    }

    if (mappings.length != 1) {
      logger.log(
          "TokensRegexNERAnnotator "
              + annotatorName
              + ": Read "
              + entries.size()
              + " unique entries from "
              + mappings.length
              + " files");
    }
    return entries;
  }
  public List<TimeExpression> extractTimeExpressions(CoreMap annotation, String docDateStr) {
    List<CoreMap> mergedNumbers = NumberNormalizer.findAndMergeNumbers(annotation);
    annotation.set(CoreAnnotations.NumerizedTokensAnnotation.class, mergedNumbers);

    // TODO: docDate may not have century....
    SUTime.Time docDate = timexPatterns.parseDateTime(docDateStr);

    List<? extends MatchedExpression> matchedExpressions =
        expressionExtractor.extractExpressions(annotation);
    List<TimeExpression> timeExpressions = new ArrayList<TimeExpression>(matchedExpressions.size());
    for (MatchedExpression expr : matchedExpressions) {
      if (expr instanceof TimeExpression) {
        timeExpressions.add((TimeExpression) expr);
      } else {
        timeExpressions.add(new TimeExpression(expr));
      }
    }

    // Add back nested time expressions for ranges....
    // For now only one level of nesting...
    if (options.includeNested) {
      List<TimeExpression> nestedTimeExpressions = new ArrayList<TimeExpression>();
      for (TimeExpression te : timeExpressions) {
        if (te.isIncludeNested()) {
          List<? extends CoreMap> children =
              te.getAnnotation().get(TimeExpression.ChildrenAnnotation.class);
          if (children != null) {
            for (CoreMap child : children) {
              TimeExpression childTe = child.get(TimeExpression.Annotation.class);
              if (childTe != null) {
                nestedTimeExpressions.add(childTe);
              }
            }
          }
        }
      }
      timeExpressions.addAll(nestedTimeExpressions);
    }
    Collections.sort(timeExpressions, MatchedExpression.EXPR_TOKEN_OFFSETS_NESTED_FIRST_COMPARATOR);
    timeExpressions = filterInvalidTimeExpressions(timeExpressions);

    // Some resolving is done even if docDate null...
    if (
    /*docDate != null && */ timeExpressions != null) {
      resolveTimeExpressions(annotation, timeExpressions, docDate);
    }
    // Annotate timex
    return timeExpressions;
  }
Ejemplo n.º 8
0
 private MultiPatternMatcher<CoreMap> createPatternMatcher(
     Map<SequencePattern<CoreMap>, Entry> patternToEntry) {
   // Convert to tokensregex pattern
   int patternFlags = ignoreCase ? Pattern.CASE_INSENSITIVE : 0;
   int stringMatchFlags = ignoreCase ? NodePattern.CASE_INSENSITIVE : 0;
   Env env = TokenSequencePattern.getNewEnv();
   env.setDefaultStringPatternFlags(patternFlags);
   env.setDefaultStringMatchFlags(stringMatchFlags);
   NodePattern<String> posTagPattern =
       (validPosPattern != null && PosMatchType.MATCH_ALL_TOKENS.equals(posMatchType))
           ? new CoreMapNodePattern.StringAnnotationRegexPattern(validPosPattern)
           : null;
   List<TokenSequencePattern> patterns = new ArrayList<>(entries.size());
   for (Entry entry : entries) {
     TokenSequencePattern pattern;
     if (entry.tokensRegex != null) {
       // TODO: posTagPatterns...
       pattern = TokenSequencePattern.compile(env, entry.tokensRegex);
     } else {
       List<SequencePattern.PatternExpr> nodePatterns = new ArrayList<>();
       for (String p : entry.regex) {
         CoreMapNodePattern c = CoreMapNodePattern.valueOf(p, patternFlags);
         if (posTagPattern != null) {
           c.add(CoreAnnotations.PartOfSpeechAnnotation.class, posTagPattern);
         }
         nodePatterns.add(new SequencePattern.NodePatternExpr(c));
       }
       pattern =
           TokenSequencePattern.compile(new SequencePattern.SequencePatternExpr(nodePatterns));
     }
     if (entry.annotateGroup < 0 || entry.annotateGroup > pattern.getTotalGroups()) {
       throw new RuntimeException("Invalid match group for entry " + entry);
     }
     pattern.setPriority(entry.priority);
     patterns.add(pattern);
     patternToEntry.put(pattern, entry);
   }
   return TokenSequencePattern.getMultiPatternMatcher(patterns);
 }
 @SuppressWarnings("unused")
 private static String getText(List<? extends CoreMap> list, int index) {
   return list.get(index).get(CoreAnnotations.TextAnnotation.class);
 }
 private static TimeExpression getTimeExpression(List<? extends CoreMap> list, int index) {
   return list.get(index).get(TimeExpression.Annotation.class);
 }
Ejemplo n.º 11
0
  /**
   * Reads a list of Entries from a mapping file and update the given entries. Line numbers start
   * from 1.
   *
   * @return the updated list of Entries
   */
  private static List<Entry> readEntries(
      String annotatorName,
      List<Entry> entries,
      TrieMap<String, Entry> seenRegexes,
      String mappingFilename,
      BufferedReader mapping,
      Set<String> noDefaultOverwriteLabels,
      boolean ignoreCase,
      boolean verbose)
      throws IOException {
    int origEntriesSize = entries.size();
    int isTokensRegex = 0;
    int lineCount = 0;
    for (String line; (line = mapping.readLine()) != null; ) {
      lineCount++;
      String[] split = line.split("\t");
      if (split.length < 2 || split.length > 5) {
        throw new IllegalArgumentException(
            "Provided mapping file is in wrong format. This line is bad: " + line);
      }
      String regex = split[0].trim();
      String tokensRegex = null;
      String[] regexes = null;
      if (regex.startsWith("( ") && regex.endsWith(" )")) {
        // Tokens regex (remove start and end parenthesis)
        tokensRegex = regex.substring(1, regex.length() - 1).trim();
      } else {
        regexes = regex.split("\\s+");
      }
      String[] key = (regexes != null) ? regexes : new String[] {tokensRegex};
      if (ignoreCase) {
        String[] norm = new String[key.length];
        for (int i = 0; i < key.length; i++) {
          norm[i] = key[i].toLowerCase();
        }
        key = norm;
      }
      String type = split[1].trim();

      Set<String> overwritableTypes = Generics.newHashSet();
      double priority = 0.0;

      if (split.length >= 3) {
        overwritableTypes.addAll(Arrays.asList(split[2].trim().split("\\s*,\\s*")));
      }
      if (split.length >= 4) {
        try {
          priority = Double.parseDouble(split[3].trim());
        } catch (NumberFormatException e) {
          throw new IllegalArgumentException(
              "ERROR: Invalid priority in line "
                  + lineCount
                  + " in regexner file "
                  + mappingFilename
                  + ": \""
                  + line
                  + "\"!",
              e);
        }
      }
      int annotateGroup = 0;
      // Get annotate group from input....
      if (split.length >= 5) {
        // Which group to take (allow for context)
        String context = split[4].trim();
        try {
          annotateGroup = Integer.parseInt(context);
        } catch (NumberFormatException e) {
          throw new IllegalArgumentException(
              "ERROR: Invalid group in line "
                  + lineCount
                  + " in regexner file "
                  + mappingFilename
                  + ": \""
                  + line
                  + "\"!",
              e);
        }
      }

      // Print some warning about the type
      int commaPos = type.indexOf(',');
      if (commaPos > 0) {
        // Strip the "," and just take first type
        String newType = type.substring(0, commaPos).trim();
        logger.warn(
            "TokensRegexNERAnnotator "
                + annotatorName
                + ": Entry has multiple types: "
                + line
                + ".  Taking type to be "
                + newType);
        type = newType;
      }

      Entry entry =
          new Entry(tokensRegex, regexes, type, overwritableTypes, priority, annotateGroup);

      if (seenRegexes.containsKey(key)) {
        Entry oldEntry = seenRegexes.get(key);
        if (priority > oldEntry.priority) {
          logger.warn(
              "TokensRegexNERAnnotator "
                  + annotatorName
                  + ": Replace duplicate entry (higher priority): old="
                  + oldEntry
                  + ", new="
                  + entry);
        } else {
          if (!oldEntry.type.equals(type)) {
            if (verbose) {
              logger.warn(
                  "TokensRegexNERAnnotator "
                      + annotatorName
                      + ": Ignoring duplicate entry: "
                      + split[0]
                      + ", old type = "
                      + oldEntry.type
                      + ", new type = "
                      + type);
            }
            // } else {
            //   if (verbose) {
            //     logger.warn("TokensRegexNERAnnotator " + annotatorName +
            //             ": Duplicate entry [ignored]: " + split[0] + ", old type = " +
            // oldEntry.type + ", new type = " + type);
            //   }
          }
          continue;
        }
      }

      // Print some warning if label belongs to noDefaultOverwriteLabels but there is no
      // overwritable types
      if (entry.overwritableTypes.isEmpty() && noDefaultOverwriteLabels.contains(entry.type)) {
        logger.warn(
            "TokensRegexNERAnnotator "
                + annotatorName
                + ": Entry doesn't have overwriteable types "
                + entry
                + ", but entry type is in noDefaultOverwriteLabels");
      }

      entries.add(entry);
      seenRegexes.put(key, entry);
      if (entry.tokensRegex != null) isTokensRegex++;
    }

    logger.log(
        "TokensRegexNERAnnotator "
            + annotatorName
            + ": Read "
            + (entries.size() - origEntriesSize)
            + " unique entries out of "
            + lineCount
            + " from "
            + mappingFilename
            + ", "
            + isTokensRegex
            + " TokensRegex patterns.");
    return entries;
  }
Ejemplo n.º 12
0
  private boolean checkOrigNerTags(Entry entry, List<CoreLabel> tokens, int start, int end) {
    int prevNerEndIndex = start - 1;
    int nextNerStartIndex = end;

    // Check if we found a pattern that overlaps with existing ner labels
    // tag1 tag1 x   x  tag2 tag2
    //      tag tag tag tag
    // Don't overwrite the old ner label if we overlap like this
    String startNer = tokens.get(start).ner();
    String endNer = tokens.get(end - 1).ner();
    if (startNer != null && !myLabels.contains(startNer)) {
      while (prevNerEndIndex >= 0) {
        // go backwards to find different entity type
        String ner = tokens.get(prevNerEndIndex).ner();
        if (ner == null || !ner.equals(startNer)) {
          break;
        }
        prevNerEndIndex--;
      }
    }
    if (endNer != null && !myLabels.contains(endNer)) {
      while (nextNerStartIndex < tokens.size()) {
        // go backwards to find different entity type
        String ner = tokens.get(nextNerStartIndex).ner();
        if (ner == null || !ner.equals(endNer)) {
          break;
        }
        nextNerStartIndex++;
      }
    }
    boolean overwriteOriginalNer = false;
    //noinspection StatementWithEmptyBody
    if (prevNerEndIndex != (start - 1) || nextNerStartIndex != end) {
      // Cutting across already recognized NEs don't disturb
    } else if (startNer == null) {
      // No old ner, okay to replace
      overwriteOriginalNer = true;
    } else {
      // Check if we have one consistent NER tag
      // if not, overwrite
      // if consistent, overwrite only if in our set of ner tags that we overwrite
      for (int i = start + 1; i < end; i++) {
        if (!startNer.equals(tokens.get(i).ner())) {
          overwriteOriginalNer = true;
          break;
        }
      }
      if (!overwriteOriginalNer) {
        // check if old ner type was one that was specified as explicitly overwritable by this entry
        if (entry.overwritableTypes.contains(startNer)) {
          overwriteOriginalNer = true;
        } else {
          // if this ner type doesn't belong to the labels for which we don't overwrite the default
          // labels (noDefaultOverwriteLabels)
          // we check mylabels to see if we can overwrite this entry
          if (
          /*entry.overwritableTypes.isEmpty() || */ !noDefaultOverwriteLabels.contains(
              entry.type)) {
            overwriteOriginalNer = myLabels.contains(startNer);
          }
        }
      }
    }
    return overwriteOriginalNer;
  }