Exemplo n.º 1
0
 public Set<K2> secondKeySet() {
   Set<K2> keys = Generics.newHashSet();
   for (K1 k1 : map.keySet()) {
     keys.addAll(get(k1).firstKeySet());
   }
   return keys;
 }
Exemplo n.º 2
0
 @Override
 public Set<Requirement> requirementsSatisfied() {
   Set<Requirement> satisfied = Generics.newHashSet();
   for (Annotator annotator : annotators) {
     satisfied.addAll(annotator.requirementsSatisfied());
   }
   return satisfied;
 }
Exemplo n.º 3
0
 public Set<K3> thirdKeySet() {
   Set<K3> keys = Generics.newHashSet();
   for (K1 k1 : map.keySet()) {
     TwoDimensionalMap<K2, K3, V> m = map.get(k1);
     for (K2 k2 : m.firstKeySet()) {
       keys.addAll(m.get(k2).keySet());
     }
   }
   return keys;
 }
  public TokensRegexNERAnnotator(String name, Properties properties) {
    String prefix = (name != null && !name.isEmpty()) ? name + '.' : "";
    String backgroundSymbol =
        properties.getProperty(prefix + "backgroundSymbol", DEFAULT_BACKGROUND_SYMBOL);
    String[] backgroundSymbols = backgroundSymbol.split("\\s*,\\s*");
    String mappingFiles =
        properties.getProperty(prefix + "mapping", DefaultPaths.DEFAULT_REGEXNER_RULES);
    String[] mappings = mappingFiles.split("\\s*[,;]\\s*");
    String validPosRegex = properties.getProperty(prefix + "validpospattern");
    this.posMatchType =
        PosMatchType.valueOf(
            properties.getProperty(prefix + "posmatchtype", DEFAULT_POS_MATCH_TYPE.name()));

    String noDefaultOverwriteLabelsProp =
        properties.getProperty(prefix + "noDefaultOverwriteLabels");
    this.noDefaultOverwriteLabels =
        (noDefaultOverwriteLabelsProp != null)
            ? Collections.unmodifiableSet(
                CollectionUtils.asSet(noDefaultOverwriteLabelsProp.split("\\s*,\\s*")))
            : Collections.unmodifiableSet(new HashSet<>());
    this.ignoreCase = PropertiesUtils.getBool(properties, prefix + "ignorecase", false);
    this.verbose = PropertiesUtils.getBool(properties, prefix + "verbose", false);

    if (validPosRegex != null && !validPosRegex.isEmpty()) {
      validPosPattern = Pattern.compile(validPosRegex);
    } else {
      validPosPattern = null;
    }
    entries =
        Collections.unmodifiableList(
            readEntries(name, noDefaultOverwriteLabels, ignoreCase, verbose, mappings));
    IdentityHashMap<SequencePattern<CoreMap>, Entry> patternToEntry = new IdentityHashMap<>();
    multiPatternMatcher = createPatternMatcher(patternToEntry);
    this.patternToEntry = Collections.unmodifiableMap(patternToEntry);
    Set<String> myLabels = Generics.newHashSet();
    // Can always override background or none.
    Collections.addAll(myLabels, backgroundSymbols);
    myLabels.add(null);
    // Always overwrite labels
    for (Entry entry : entries) myLabels.add(entry.type);
    this.myLabels = Collections.unmodifiableSet(myLabels);
  }
  /**
   * Reads a list of Entries from a mapping file and update the given entries. Line numbers start
   * from 1.
   *
   * @return the updated list of Entries
   */
  private static List<Entry> readEntries(
      String annotatorName,
      List<Entry> entries,
      TrieMap<String, Entry> seenRegexes,
      String mappingFilename,
      BufferedReader mapping,
      Set<String> noDefaultOverwriteLabels,
      boolean ignoreCase,
      boolean verbose)
      throws IOException {
    int origEntriesSize = entries.size();
    int isTokensRegex = 0;
    int lineCount = 0;
    for (String line; (line = mapping.readLine()) != null; ) {
      lineCount++;
      String[] split = line.split("\t");
      if (split.length < 2 || split.length > 5) {
        throw new IllegalArgumentException(
            "Provided mapping file is in wrong format. This line is bad: " + line);
      }
      String regex = split[0].trim();
      String tokensRegex = null;
      String[] regexes = null;
      if (regex.startsWith("( ") && regex.endsWith(" )")) {
        // Tokens regex (remove start and end parenthesis)
        tokensRegex = regex.substring(1, regex.length() - 1).trim();
      } else {
        regexes = regex.split("\\s+");
      }
      String[] key = (regexes != null) ? regexes : new String[] {tokensRegex};
      if (ignoreCase) {
        String[] norm = new String[key.length];
        for (int i = 0; i < key.length; i++) {
          norm[i] = key[i].toLowerCase();
        }
        key = norm;
      }
      String type = split[1].trim();

      Set<String> overwritableTypes = Generics.newHashSet();
      double priority = 0.0;

      if (split.length >= 3) {
        overwritableTypes.addAll(Arrays.asList(split[2].trim().split("\\s*,\\s*")));
      }
      if (split.length >= 4) {
        try {
          priority = Double.parseDouble(split[3].trim());
        } catch (NumberFormatException e) {
          throw new IllegalArgumentException(
              "ERROR: Invalid priority in line "
                  + lineCount
                  + " in regexner file "
                  + mappingFilename
                  + ": \""
                  + line
                  + "\"!",
              e);
        }
      }
      int annotateGroup = 0;
      // Get annotate group from input....
      if (split.length >= 5) {
        // Which group to take (allow for context)
        String context = split[4].trim();
        try {
          annotateGroup = Integer.parseInt(context);
        } catch (NumberFormatException e) {
          throw new IllegalArgumentException(
              "ERROR: Invalid group in line "
                  + lineCount
                  + " in regexner file "
                  + mappingFilename
                  + ": \""
                  + line
                  + "\"!",
              e);
        }
      }

      // Print some warning about the type
      int commaPos = type.indexOf(',');
      if (commaPos > 0) {
        // Strip the "," and just take first type
        String newType = type.substring(0, commaPos).trim();
        logger.warn(
            "TokensRegexNERAnnotator "
                + annotatorName
                + ": Entry has multiple types: "
                + line
                + ".  Taking type to be "
                + newType);
        type = newType;
      }

      Entry entry =
          new Entry(tokensRegex, regexes, type, overwritableTypes, priority, annotateGroup);

      if (seenRegexes.containsKey(key)) {
        Entry oldEntry = seenRegexes.get(key);
        if (priority > oldEntry.priority) {
          logger.warn(
              "TokensRegexNERAnnotator "
                  + annotatorName
                  + ": Replace duplicate entry (higher priority): old="
                  + oldEntry
                  + ", new="
                  + entry);
        } else {
          if (!oldEntry.type.equals(type)) {
            if (verbose) {
              logger.warn(
                  "TokensRegexNERAnnotator "
                      + annotatorName
                      + ": Ignoring duplicate entry: "
                      + split[0]
                      + ", old type = "
                      + oldEntry.type
                      + ", new type = "
                      + type);
            }
            // } else {
            //   if (verbose) {
            //     logger.warn("TokensRegexNERAnnotator " + annotatorName +
            //             ": Duplicate entry [ignored]: " + split[0] + ", old type = " +
            // oldEntry.type + ", new type = " + type);
            //   }
          }
          continue;
        }
      }

      // Print some warning if label belongs to noDefaultOverwriteLabels but there is no
      // overwritable types
      if (entry.overwritableTypes.isEmpty() && noDefaultOverwriteLabels.contains(entry.type)) {
        logger.warn(
            "TokensRegexNERAnnotator "
                + annotatorName
                + ": Entry doesn't have overwriteable types "
                + entry
                + ", but entry type is in noDefaultOverwriteLabels");
      }

      entries.add(entry);
      seenRegexes.put(key, entry);
      if (entry.tokensRegex != null) isTokensRegex++;
    }

    logger.log(
        "TokensRegexNERAnnotator "
            + annotatorName
            + ": Read "
            + (entries.size() - origEntriesSize)
            + " unique entries out of "
            + lineCount
            + " from "
            + mappingFilename
            + ", "
            + isTokensRegex
            + " TokensRegex patterns.");
    return entries;
  }