/**
 * Merges each token range described by {@code spans} into a single token, editing {@code toks}
 * in place.
 *
 * <p>The merged token's lexeme is the lexemes of the covered tokens joined with {@code "_"}, and
 * its span runs from the start of the first covered token to the end of the last one. When a
 * range is a {@code TypedSpan}, its type is stored as the merged token's original POS tag.
 * Ranges whose length is not positive are skipped.
 *
 * @param text not read by this method
 * @param toks tokens to regroup; modified in place
 * @param spans token-index ranges (end exclusive) to merge; presumably ordered by position and
 *     non-overlapping, since they are applied back to front — verify against callers
 * @param additionalContext set on every merged token when not {@code null}
 * @return the same {@code toks} list that was passed in
 */
public static List<Token> groupTokens(
      String text, List<Token> toks, List<? extends Span> spans, String additionalContext) {
    // Walk the ranges back to front so that collapsing one range never shifts the token indices
    // of the ranges that are still waiting to be processed.
    int idx = spans.size();
    while (--idx >= 0) {
      Span range = spans.get(idx);
      if (range.length() <= 0) {
        continue;
      }

      int first = range.getStart();
      int last = range.getEnd() - 1;

      int charStart = toks.get(first).getSpan().getStart();
      int charEnd = toks.get(last).getSpan().getEnd();

      StringBuilder joined = new StringBuilder();
      for (int k = first; k <= last; k++) {
        if (k > first) {
          joined.append("_");
        }
        joined.append(toks.get(k).getLexeme());
      }

      // Drop the covered tokens in one step, then put the merged token where the range began.
      toks.subList(first, last + 1).clear();

      Token merged = new TokenCogroo(joined.toString(), new Span(charStart, charEnd));
      if (range instanceof TypedSpan) {
        merged.setOriginalPOSTag(((TypedSpan) range).getType());
      }
      if (additionalContext != null) {
        merged.setAdditionalContext(additionalContext);
      }
      toks.add(first, merged);
    }
    return toks;
  }
  /**
   * Merges tokens according to character-offset spans: every character span is translated into
   * the range of token indices it covers, and the ranges are then merged by
   * {@link #groupTokens}.
   *
   * <p>A token range begins at the first token that does not start before the character span and
   * ends at the first token whose end is not before the span's end. The scan only moves forward,
   * so {@code charSpans} is expected in ascending order. A character span that reaches past the
   * last token is clamped to the last token; character spans that start after every token are
   * ignored instead of failing with an {@code IndexOutOfBoundsException}.
   *
   * <p>The token ranges handed to {@code groupTokens} are plain {@code Span} instances built from
   * the two indices only.
   *
   * @param text passed through to {@code groupTokens}
   * @param toks tokens to regroup; modified in place by {@code groupTokens}
   * @param charSpans character-offset spans to merge; {@code null} or empty leaves {@code toks}
   *     untouched
   * @param additionalContext passed through to {@code groupTokens}
   * @return the {@code toks} list
   */
  public static List<Token> groupTokensChar(
      String text, List<Token> toks, List<Span> charSpans, String additionalContext) {
    if (charSpans == null || charSpans.size() == 0) {
      return toks;
    }

    final int tokenCount = toks.size();
    int lastVisitedTok = 0;
    List<Span> spans = new ArrayList<Span>(charSpans.size());

    for (Span ch : charSpans) {
      // Skip tokens that begin before the character span.
      while (lastVisitedTok < tokenCount
          && toks.get(lastVisitedTok).getSpan().getStart() < ch.getStart()) {
        lastVisitedTok++;
      }
      if (lastVisitedTok >= tokenCount) {
        // Ran out of tokens: neither this span nor any later one can be matched, because the
        // scan never moves backwards.
        break;
      }
      int start = lastVisitedTok;

      // Extend to the token that closes the character span, stopping at the last token at most.
      while (lastVisitedTok < tokenCount - 1
          && toks.get(lastVisitedTok).getSpan().getEnd() < ch.getEnd()) {
        lastVisitedTok++;
      }
      int end = lastVisitedTok + 1; // exclusive token index

      spans.add(new Span(start, end));
    }

    return groupTokens(text, toks, spans, additionalContext);
  }
 /**
  * Runs sentence detection over the text of {@code at} and registers every detected sentence on
  * it.
  *
  * @param at the analysed text whose span is split into sentences
  * @param language language used to look up the sentence detector
  * @return the sentences added to {@code at}, in detection order, or {@code null} when no
  *     sentence detector is available for {@code language}
  */
 private List<Section> detectSentences(AnalysedText at, String language) {
   SentenceDetector detector = getSentenceDetector(language);
   if (detector == null) {
     // null (not an empty list) tells the caller that no detector was available
     return null;
   }

   List<Section> detected = new ArrayList<Section>();
   for (opennlp.tools.util.Span boundary : detector.sentPosDetect(at.getSpan())) {
     int from = boundary.getStart();
     int to = boundary.getEnd();
     Sentence added = at.addSentence(from, to);
     log.trace(" > add {}", added);
     detected.add(added);
   }
   return detected;
 }
  /**
   * Extracts location names from unstructured text using the named entity recognizer (NER) feature
   * provided by the Apache OpenNLP Name Finder.
   *
   * <p>The text is split into sentences and each sentence is processed on its own. The name
   * finder's adaptive data is cleared before this method returns, whether it completes normally or
   * throws, so that repeated runs on the same data give consistent results.
   *
   * @param plainText Contents of text document
   * @return List of location names and positions (character offsets into {@code plainText})
   * @throws IllegalArgumentException if {@code plainText} is {@code null}
   */
  public List<LocationOccurrence> extractLocationNames(String plainText) {
    if (plainText == null) {
      throw new IllegalArgumentException(
          "plaintext input to extractLocationNames should not be null");
    }

    List<LocationOccurrence> nerResults = new ArrayList<LocationOccurrence>();

    try {
      // The values used in these Spans are string character offsets
      Span[] sentenceSpans = sentenceDetector.sentPosDetect(plainText);

      // Each sentence gets processed on its own
      for (Span sentenceSpan : sentenceSpans) {

        // find the start and end position of this sentence in the document
        int sentenceStart = sentenceSpan.getStart();
        String sentence = plainText.substring(sentenceStart, sentenceSpan.getEnd());

        // the values used in these Spans are string character offsets of each token from the
        // sentence beginning
        Span[] tokenPositionsWithinSentence = tokenizer.tokenizePos(sentence);

        // Cut the token strings out of the sentence using the position spans, so the sentence is
        // tokenized only once and the strings always line up one-to-one with the offsets.
        String[] tokens = Span.spansToStrings(tokenPositionsWithinSentence, sentence);

        // find the location names in the tokenized text
        // the values used in these Spans are NOT string character offsets, they are indices into
        // the 'tokens' array
        Span[] names = nameFinder.find(tokens);

        // for each name that got found, create our corresponding occurrence
        for (Span name : names) {

          // find offsets relative to the start of the sentence
          int beginningOfFirstWord = tokenPositionsWithinSentence[name.getStart()].getStart();
          // -1 because the high end of a Span is non-inclusive
          int endOfLastWord = tokenPositionsWithinSentence[name.getEnd() - 1].getEnd();

          // to get offsets relative to the document as a whole, just add the offset for the
          // sentence itself
          int startOffsetInDoc = sentenceStart + beginningOfFirstWord;
          int endOffsetInDoc = sentenceStart + endOfLastWord;

          // look back into the original input string to figure out what the text is that we got
          // a hit on
          String nameInDocument = plainText.substring(startOffsetInDoc, endOffsetInDoc);

          // add to List of results to return
          nerResults.add(new LocationOccurrence(nameInDocument, startOffsetInDoc));
        }
      }
    } finally {
      // this is necessary to maintain consistent results across multiple runs on the same data,
      // which is what we want; done in finally so a failure part-way through a document cannot
      // leak adaptive state into the next call
      nameFinder.clearAdaptiveData();
    }

    return nerResults;
  }
  /**
   * Tokenizes the document content and adds one annotation per token to the annotation set named
   * by {@code annotationSetName}.
   *
   * <p>Each annotation spans the token's character offsets, has the type
   * {@code ANNIEConstants.TOKEN_ANNOTATION_TYPE}, and carries the token text under the feature
   * {@code ANNIEConstants.TOKEN_STRING_FEATURE_NAME}.
   *
   * @throws ExecutionException if anything fails while the annotations are being created
   */
  public void execute() throws ExecutionException {
    AnnotationSet target = document.getAnnotations(annotationSetName);
    String content = document.getContent().toString();
    Span[] tokenSpans = tokenizer.getTokens(content);

    try {
      for (int i = 0; i < tokenSpans.length; i++) {
        Span current = tokenSpans[i];

        FeatureMap tokenFeatures = Factory.newFeatureMap();
        String tokenText = content.substring(current.getStart(), current.getEnd());
        tokenFeatures.put(ANNIEConstants.TOKEN_STRING_FEATURE_NAME, tokenText);

        // offsets are widened to long for the annotation set API
        long from = current.getStart();
        long to = current.getEnd();
        target.add(from, to, ANNIEConstants.TOKEN_ANNOTATION_TYPE, tokenFeatures);
      }
    } catch (Exception e) {
      throw new ExecutionException("error running tokenizer", e);
    }
  }
  /**
   * Adds named entity information to parses.
   *
   * <p>For every name span, the common parent of the span's first and last token parse is looked
   * up. A new constituent labelled {@code tag} is inserted into that parent when the name matches
   * the parent's span exactly, or when the name does not cross any of the parent's children. When
   * the name does cross children and the parent is an {@code NP}, the whole parent span is
   * labelled instead, provided the parent's first child has more than one child and the name
   * contains the last of them. Names without a common parent are skipped.
   *
   * @param tag named entity type
   * @param names spans of tokens that are named entities (elements are cast to {@code Span})
   * @param tokens parses for the tokens
   */
  private static void addNames(String tag, List names, Parse[] tokens) {
    for (int i = 0; i < names.size(); i++) {
      Span nameTokenSpan = (Span) names.get(i);
      Parse startToken = tokens[nameTokenSpan.getStart()];
      // NOTE(review): this indexes with getEnd() directly, i.e. it treats the span end as the
      // index of the last token (inclusive). Confirm against the Span implementation in use; with
      // an exclusive end this would need getEnd() - 1.
      Parse endToken = tokens[nameTokenSpan.getEnd()];
      Parse commonP = startToken.getCommonParent(endToken);

      if (commonP == null) {
        continue;
      }

      Span nameSpan = new Span(startToken.getSpan().getStart(), endToken.getSpan().getEnd());

      if (nameSpan.equals(commonP.getSpan())) {
        // common parent matches exactly the named entity
        commonP.insert(new Parse(commonP.getText(), nameSpan, tag, 1.0));
        continue;
      }

      // common parent includes the named entity
      Parse[] kids = commonP.getChildren();
      boolean crossingKids = false;
      for (int j = 0; j < kids.length; j++) {
        if (nameSpan.crosses(kids[j].getSpan())) {
          crossingKids = true;
          break; // one crossing child is enough to decide
        }
      }

      if (!crossingKids) {
        // named entity does not cross children
        commonP.insert(new Parse(commonP.getText(), nameSpan, tag, 1.0));
      } else if (commonP.getType().equals("NP")) {
        // NE crosses children; kids is non-empty here because a child was found to cross
        Parse[] grandKids = kids[0].getChildren();

        // Check the length before touching the last element so that a first child without
        // children cannot trigger an ArrayIndexOutOfBoundsException.
        if (grandKids.length > 1
            && nameSpan.contains(grandKids[grandKids.length - 1].getSpan())) {
          commonP.insert(new Parse(commonP.getText(), commonP.getSpan(), tag, 1.0));
        }
      }
    }
  }
Beispiel #7
0
 /**
  * Splits {@code s} into token strings by locating the token spans with {@code tokenizePos} and
  * converting those spans to strings over {@code s}.
  *
  * @param s the text to tokenize
  * @return the token strings produced by {@code Span.spansToStrings}
  */
 public String[] tokenize(String s) {
   Span[] tokenSpans = tokenizePos(s);
   return Span.spansToStrings(tokenSpans, s);
 }