// TODO avoid duplicate tags if there are multiple taggers for the same type @SuppressWarnings("unchecked") public static String[] tagNes(String[] sentences) { String[] results = new String[sentences.length]; for (int s = 0; s < results.length; s++) results[s] = ""; // initialize prevTokenMaps Map[] prevTokenMaps = new HashMap[finders.length]; for (int i = 0; i < finders.length; i++) prevTokenMaps[i] = new HashMap(); for (int s = 0; s < sentences.length; s++) { // tokenize sentence Span[] spans = NameFinder.tokenizeToSpans(sentences[s]); String[] tokens = tokenize(sentences[s]); // find named entities String[][] finderTags = new String[finders.length][]; for (int i = 0; i < finders.length; i++) finderTags[i] = finders[i].find(tokens, prevTokenMaps[i]); // update prevTokenMaps for (int i = 0; i < prevTokenMaps.length; i++) for (int j = 0; j < tokens.length; j++) prevTokenMaps[i].put(tokens[j], finderTags[i][j]); // apply regular expressions String[][] regExTags = new String[patterns.length + 1 + quantityUnitPatterns.length][]; // don't tag NEproperName here regExTags[0] = new String[tokens.length]; for (int i = 0; i < tokens.length; i++) regExTags[0][i] = NameFinderME.OTHER; for (int i = 1; i < patterns.length; i++) regExTags[i] = RegExMatcher.markAllMatches(tokens, patterns[i], patternMaxTokens[i]); String[] numberMarkers = RegExMatcher.extractNumbers(tokens); regExTags[patterns.length] = numberMarkers; for (int i = 0; i < quantityUnitPatterns.length; i++) regExTags[patterns.length + i + 1] = RegExMatcher.extractQuantities( tokens, numberMarkers, quantityUnitPatterns[i], quantityUnitPatternMaxTokens[i]); // apply lists String[][] listTags = new String[lists.length][]; for (int i = 0; i < lists.length; i++) listTags[i] = RegExMatcher.markAllContained( tokens, RegExMatcher.getDictionary(lists[i]), fuzzyListLookupThreshold); for (int i = 0; i < tokens.length; i++) { // check for end tags for (int j = 0; j < finders.length; j++) if (i != 0) if 
((finderTags[j][i].equals(NameFinderME.START) || finderTags[j][i].equals(NameFinderME.OTHER)) && (finderTags[j][i - 1].equals(NameFinderME.START) || finderTags[j][i - 1].equals(NameFinderME.CONTINUE))) results[s] += "</" + finderNames[j] + ">"; // check for end tags for (int j = 0; j < allPatternNames.length; j++) if (i != 0) if ((regExTags[j][i].equals(NameFinderME.START) || regExTags[j][i].equals(NameFinderME.OTHER)) && (regExTags[j][i - 1].equals(NameFinderME.START) || regExTags[j][i - 1].equals(NameFinderME.CONTINUE))) results[s] += "</" + allPatternNames[j] + ">"; // check for end tags for (int j = 0; j < listNames.length; j++) if (i != 0) if ((regExTags[j][i].equals(NameFinderME.START) || regExTags[j][i].equals(NameFinderME.OTHER)) && (regExTags[j][i - 1].equals(NameFinderME.START) || regExTags[j][i - 1].equals(NameFinderME.CONTINUE))) results[s] += "</" + listNames[j] + ">"; if (i > 0 && spans[i - 1].getEnd() < spans[i].getStart()) results[s] += sentences[s].substring(spans[i - 1].getEnd(), spans[i].getStart()); // check for start tags for (int j = 0; j < finders.length; j++) if (finderTags[j][i].equals(NameFinderME.START)) results[s] += "<" + finderNames[j] + ">"; // check for start tags for (int j = 0; j < allPatternNames.length; j++) if (regExTags[j][i].equals(NameFinderME.START)) results[s] += "<" + allPatternNames[j] + ">"; // check for start tags for (int j = 0; j < listNames.length; j++) if (regExTags[j][i].equals(NameFinderME.START)) results[s] += "<" + listNames[j] + ">"; results[s] += tokens[i]; } if (tokens.length != 0) { int last = tokens.length - 1; // final end tags for (int i = 0; i < finders.length; i++) if (finderTags[i][last].equals(NameFinderME.START) || finderTags[i][last].equals(NameFinderME.CONTINUE)) results[s] += "</" + finderNames[i] + ">"; // final end tags for (int i = 0; i < allPatternNames.length; i++) if (regExTags[i][last].equals(NameFinderME.START) || regExTags[i][last].equals(NameFinderME.CONTINUE)) results[s] += "</" + 
allPatternNames[i] + ">"; // final end tags for (int i = 0; i < listNames.length; i++) if (regExTags[i][last].equals(NameFinderME.START) || regExTags[i][last].equals(NameFinderME.CONTINUE)) results[s] += "</" + listNames[i] + ">"; if (spans[last].getEnd() < sentences[s].length()) results[s] += sentences[s].substring(spans[last].getEnd()); } } return results; }
/**
 * Splits a text into the tokens used for NE extraction.
 *
 * <p>Token boundaries are obtained from {@code NameFinder.tokenizeToSpans}; the resulting spans
 * are then converted into strings by {@code NameFinder.spansToStrings}.
 *
 * @param text text to tokenize
 * @return array of tokens
 */
public static String[] tokenize(String text) {
    return NameFinder.spansToStrings(NameFinder.tokenizeToSpans(text), text);
}