/**
 * Collects, into one freshly created set, everything reported by
 * {@code requirementsSatisfied()} on each element of {@code annotators}.
 *
 * @return a new set holding the union of the requirements satisfied by the individual annotators
 */
@Override
public Set<Requirement> requirementsSatisfied() {
  final Set<Requirement> union = Generics.newHashSet();
  for (Annotator member : annotators) {
    // Each annotator contributes whatever it satisfies on its own.
    union.addAll(member.requirementsSatisfied());
  }
  return union;
}
/** * Reads a list of Entries from a mapping file and update the given entries. Line numbers start * from 1. * * @return the updated list of Entries */ private static List<Entry> readEntries( String annotatorName, List<Entry> entries, TrieMap<String, Entry> seenRegexes, String mappingFilename, BufferedReader mapping, Set<String> noDefaultOverwriteLabels, boolean ignoreCase, boolean verbose) throws IOException { int origEntriesSize = entries.size(); int isTokensRegex = 0; int lineCount = 0; for (String line; (line = mapping.readLine()) != null; ) { lineCount++; String[] split = line.split("\t"); if (split.length < 2 || split.length > 5) { throw new IllegalArgumentException( "Provided mapping file is in wrong format. This line is bad: " + line); } String regex = split[0].trim(); String tokensRegex = null; String[] regexes = null; if (regex.startsWith("( ") && regex.endsWith(" )")) { // Tokens regex (remove start and end parenthesis) tokensRegex = regex.substring(1, regex.length() - 1).trim(); } else { regexes = regex.split("\\s+"); } String[] key = (regexes != null) ? regexes : new String[] {tokensRegex}; if (ignoreCase) { String[] norm = new String[key.length]; for (int i = 0; i < key.length; i++) { norm[i] = key[i].toLowerCase(); } key = norm; } String type = split[1].trim(); Set<String> overwritableTypes = Generics.newHashSet(); double priority = 0.0; if (split.length >= 3) { overwritableTypes.addAll(Arrays.asList(split[2].trim().split("\\s*,\\s*"))); } if (split.length >= 4) { try { priority = Double.parseDouble(split[3].trim()); } catch (NumberFormatException e) { throw new IllegalArgumentException( "ERROR: Invalid priority in line " + lineCount + " in regexner file " + mappingFilename + ": \"" + line + "\"!", e); } } int annotateGroup = 0; // Get annotate group from input.... 
if (split.length >= 5) { // Which group to take (allow for context) String context = split[4].trim(); try { annotateGroup = Integer.parseInt(context); } catch (NumberFormatException e) { throw new IllegalArgumentException( "ERROR: Invalid group in line " + lineCount + " in regexner file " + mappingFilename + ": \"" + line + "\"!", e); } } // Print some warning about the type int commaPos = type.indexOf(','); if (commaPos > 0) { // Strip the "," and just take first type String newType = type.substring(0, commaPos).trim(); logger.warn( "TokensRegexNERAnnotator " + annotatorName + ": Entry has multiple types: " + line + ". Taking type to be " + newType); type = newType; } Entry entry = new Entry(tokensRegex, regexes, type, overwritableTypes, priority, annotateGroup); if (seenRegexes.containsKey(key)) { Entry oldEntry = seenRegexes.get(key); if (priority > oldEntry.priority) { logger.warn( "TokensRegexNERAnnotator " + annotatorName + ": Replace duplicate entry (higher priority): old=" + oldEntry + ", new=" + entry); } else { if (!oldEntry.type.equals(type)) { if (verbose) { logger.warn( "TokensRegexNERAnnotator " + annotatorName + ": Ignoring duplicate entry: " + split[0] + ", old type = " + oldEntry.type + ", new type = " + type); } // } else { // if (verbose) { // logger.warn("TokensRegexNERAnnotator " + annotatorName + // ": Duplicate entry [ignored]: " + split[0] + ", old type = " + // oldEntry.type + ", new type = " + type); // } } continue; } } // Print some warning if label belongs to noDefaultOverwriteLabels but there is no // overwritable types if (entry.overwritableTypes.isEmpty() && noDefaultOverwriteLabels.contains(entry.type)) { logger.warn( "TokensRegexNERAnnotator " + annotatorName + ": Entry doesn't have overwriteable types " + entry + ", but entry type is in noDefaultOverwriteLabels"); } entries.add(entry); seenRegexes.put(key, entry); if (entry.tokensRegex != null) isTokensRegex++; } logger.log( "TokensRegexNERAnnotator " + annotatorName + ": Read " + 
(entries.size() - origEntriesSize) + " unique entries out of " + lineCount + " from " + mappingFilename + ", " + isTokensRegex + " TokensRegex patterns."); return entries; }