Пример #1
0
  // TODO avoid duplicate tags if there are multiple taggers for the same type
  /**
   * Annotates named entities in the given sentences with inline tags.
   *
   * <p>Each sentence is tokenized and run through three groups of taggers: the model-based
   * {@code finders}, the regular expression {@code patterns} (plus number and quantity
   * extraction) and the dictionary {@code lists}. Every entity found is wrapped as
   * {@code <name>...</name>}, with the name taken from {@code finderNames},
   * {@code allPatternNames} or {@code listNames} respectively.
   *
   * <p>Text between two tokens and text after the last token is copied from the original
   * sentence. Text preceding the first token is not copied.
   *
   * <p>The tags each finder assigned to the tokens of a sentence are handed to that finder again
   * for all following sentences of the same call.
   *
   * @param sentences sentences to tag
   * @return array of the same length as {@code sentences} holding the tagged sentences; a
   *     sentence without tokens yields the empty string
   */
  @SuppressWarnings("unchecked")
  public static String[] tagNes(String[] sentences) {
    String[] results = new String[sentences.length];

    // one map per finder, remembering the latest tag of every token seen so far
    Map[] prevTokenMaps = new HashMap[finders.length];
    for (int i = 0; i < finders.length; i++) {
      prevTokenMaps[i] = new HashMap();
    }

    for (int s = 0; s < sentences.length; s++) {
      String sentence = sentences[s];

      // tokenize sentence (spans are needed to restore the text between tokens)
      Span[] spans = NameFinder.tokenizeToSpans(sentence);
      String[] tokens = tokenize(sentence);

      // find named entities
      String[][] finderTags = new String[finders.length][];
      for (int i = 0; i < finders.length; i++) {
        finderTags[i] = finders[i].find(tokens, prevTokenMaps[i]);
      }

      // update prevTokenMaps
      for (int i = 0; i < prevTokenMaps.length; i++) {
        for (int j = 0; j < tokens.length; j++) {
          prevTokenMaps[i].put(tokens[j], finderTags[i][j]);
        }
      }

      // apply regular expressions
      // slot layout: [0] placeholder, [1 .. patterns.length - 1] patterns,
      // [patterns.length] numbers, followed by one slot per quantity unit pattern
      String[][] regExTags = new String[patterns.length + 1 + quantityUnitPatterns.length][];

      // slot 0 is deliberately left untagged (don't tag NEproperName here)
      regExTags[0] = new String[tokens.length];
      for (int i = 0; i < tokens.length; i++) {
        regExTags[0][i] = NameFinderME.OTHER;
      }

      for (int i = 1; i < patterns.length; i++) {
        regExTags[i] = RegExMatcher.markAllMatches(tokens, patterns[i], patternMaxTokens[i]);
      }

      String[] numberMarkers = RegExMatcher.extractNumbers(tokens);
      regExTags[patterns.length] = numberMarkers;

      for (int i = 0; i < quantityUnitPatterns.length; i++) {
        regExTags[patterns.length + i + 1] =
            RegExMatcher.extractQuantities(
                tokens, numberMarkers, quantityUnitPatterns[i], quantityUnitPatternMaxTokens[i]);
      }

      // apply lists
      String[][] listTags = new String[lists.length][];
      for (int i = 0; i < lists.length; i++) {
        listTags[i] =
            RegExMatcher.markAllContained(
                tokens, RegExMatcher.getDictionary(lists[i]), fuzzyListLookupThreshold);
      }

      // Assemble the tagged sentence. The list tags are read from listTags; the previous code
      // indexed regExTags in the list loops, so the list lookups above were never used.
      // NOTE(review): assumes listNames is parallel to lists and that markAllContained uses the
      // same START/CONTINUE/OTHER markers as the other taggers - confirm.
      StringBuilder tagged = new StringBuilder();

      for (int i = 0; i < tokens.length; i++) {
        // close entities that ended with the previous token
        appendEndTags(tagged, finderTags, finderNames, finders.length, i);
        appendEndTags(tagged, regExTags, allPatternNames, allPatternNames.length, i);
        appendEndTags(tagged, listTags, listNames, listNames.length, i);

        // restore the original text between the previous token and this one
        if (i > 0 && spans[i - 1].getEnd() < spans[i].getStart()) {
          tagged.append(sentence, spans[i - 1].getEnd(), spans[i].getStart());
        }

        // open entities that start with this token
        appendStartTags(tagged, finderTags, finderNames, finders.length, i);
        appendStartTags(tagged, regExTags, allPatternNames, allPatternNames.length, i);
        appendStartTags(tagged, listTags, listNames, listNames.length, i);

        tagged.append(tokens[i]);
      }

      if (tokens.length != 0) {
        int last = tokens.length - 1;

        // close entities that reach the end of the sentence
        appendFinalEndTags(tagged, finderTags, finderNames, finders.length, last);
        appendFinalEndTags(tagged, regExTags, allPatternNames, allPatternNames.length, last);
        appendFinalEndTags(tagged, listTags, listNames, listNames.length, last);

        // restore the original text after the last token
        if (spans[last].getEnd() < sentence.length()) {
          tagged.append(sentence, spans[last].getEnd(), sentence.length());
        }
      }

      results[s] = tagged.toString();
    }

    return results;
  }

  /**
   * Appends a closing tag for each of the first {@code count} taggers whose entity ended with
   * the token before position {@code pos}. Does nothing for the first token.
   */
  private static void appendEndTags(
      StringBuilder out, String[][] tags, String[] names, int count, int pos) {
    if (pos == 0) {
      return;
    }
    for (int j = 0; j < count; j++) {
      String current = tags[j][pos];
      String previous = tags[j][pos - 1];
      boolean notContinued =
          current.equals(NameFinderME.START) || current.equals(NameFinderME.OTHER);
      if (notContinued
          && (previous.equals(NameFinderME.START) || previous.equals(NameFinderME.CONTINUE))) {
        out.append("</").append(names[j]).append(">");
      }
    }
  }

  /**
   * Appends an opening tag for each of the first {@code count} taggers that marked the token at
   * position {@code pos} as the start of an entity.
   */
  private static void appendStartTags(
      StringBuilder out, String[][] tags, String[] names, int count, int pos) {
    for (int j = 0; j < count; j++) {
      if (tags[j][pos].equals(NameFinderME.START)) {
        out.append("<").append(names[j]).append(">");
      }
    }
  }

  /**
   * Appends a closing tag for each of the first {@code count} taggers whose entity is still open
   * at the last token (position {@code last}).
   */
  private static void appendFinalEndTags(
      StringBuilder out, String[][] tags, String[] names, int count, int last) {
    for (int j = 0; j < count; j++) {
      String tag = tags[j][last];
      if (tag.equals(NameFinderME.START) || tag.equals(NameFinderME.CONTINUE)) {
        out.append("</").append(names[j]).append(">");
      }
    }
  }
Пример #2
0
 /**
  * Splits a sentence into tokens with the rule-based tokenizer, as preparation for NE
  * extraction.
  *
  * <p>The token boundaries are computed by {@code NameFinder.tokenizeToSpans} and then turned
  * into strings by {@code NameFinder.spansToStrings}.
  *
  * @param text text to tokenize
  * @return array of tokens
  */
 public static String[] tokenize(String text) {
   return NameFinder.spansToStrings(NameFinder.tokenizeToSpans(text), text);
 }