Пример #1
0
  /**
   * Walks the tree depth-first with an explicit stack and records, every time a pre-terminal is
   * reached, the chain of ancestor labels that leads down to it.
   *
   * <p>Nodes are processed before their children (pre-order). Children are pushed onto the work
   * stack in the order given by {@code children()}, so they are popped, and their lineages
   * recorded, in the reverse of that order.
   *
   * <p>This implementation uses the index of each label to store the depth of its node, so the
   * index of every visited label is overwritten. If the labels in the tree do not implement
   * {@link HasIndex}, a {@link ClassCastException} will result.
   *
   * @param t The tree; may be {@code null}
   * @return A list of lineages, one per pre-terminal reached. Each lineage holds copies of the
   *     ancestor labels ordered from the root downwards and does not include the pre-terminal's
   *     own label. Returns {@code null} if {@code t} is {@code null}.
   */
  private static List<List<CoreLabel>> makeLineages(final Tree t) {
    if (t == null) {
      return null;
    }

    // The root is at depth 0; every other node receives its depth when its parent is expanded.
    ((HasIndex) t.label()).setIndex(0);

    final Stack<Tree> treeStack = new Stack<>();
    treeStack.push(t);

    // Labels of the ancestors of the node being processed, root at the bottom. It starts out
    // empty: a root label pushed here would be discarded by the unwind loop on the very first
    // iteration (its index 0 never equals nodeDepth - 1 == -1), and the root's label is pushed
    // when the root itself is expanded below.
    final Stack<CoreLabel> labelStack = new Stack<>();

    final List<List<CoreLabel>> lineages = new ArrayList<>();

    while (!treeStack.isEmpty()) {
      final Tree node = treeStack.pop();
      final int nodeDepth = ((HasIndex) node.label()).index();

      // Unwind to this node's parent by dropping labels left over from the previous branch.
      while (!labelStack.isEmpty() && labelStack.peek().index() != nodeDepth - 1) {
        labelStack.pop();
      }

      if (node.isPreTerminal()) {
        // Snapshot of the current ancestors; a Stack is copied bottom-to-top, i.e. root first.
        final List<CoreLabel> lineage = new ArrayList<>(labelStack);
        lineages.add(lineage);
      } else {
        for (Tree kid : node.children()) {
          ((HasIndex) kid.label()).setIndex(nodeDepth + 1);
          treeStack.push(kid);
        }
        // Store a copy so the lineage entry is independent of the tree's own label object.
        final CoreLabel nodeLabel = new CoreLabel(node.label());
        nodeLabel.setIndex(nodeDepth);
        labelStack.push(nodeLabel);
      }
    }

    if (DEBUG) {
      System.out.println("Lineages:");
      for (List<CoreLabel> lin : lineages) {
        for (CoreLabel cl : lin) {
          System.out.print(cl.value() + " <- ");
        }
        System.out.println();
      }
    }

    return lineages;
  }
 /**
  * Builds a synthetic node that can be added to the dependency tree even though it is not part
  * of the original sentence.
  *
  * @param toCopy The CoreLabel used to seed the new label.
  * @param word The word for the new node; it is also used as the node's lemma and value.
  * @param POS The part-of-speech tag for the new node.
  * @return A CoreLabel constructed from toCopy whose word, lemma, value and tag are replaced,
  *     whose NER label is "O", and whose index is sentenceLength + 5.
  */
 @SuppressWarnings("UnusedDeclaration")
 private CoreLabel mockNode(CoreLabel toCopy, String word, String POS) {
   final CoreLabel synthetic = new CoreLabel(toCopy);

   // NOTE(review): the setters are deliberately kept in their original order; some CoreLabel
   // setters may affect other fields, so confirm before reordering.
   synthetic.setWord(word);
   synthetic.setLemma(word);
   synthetic.setValue(word);
   synthetic.setNER("O");
   synthetic.setTag(POS);

   // Index derived from the sentence length so that it lies past the sentence's own tokens.
   final int syntheticIndex = sentenceLength + 5;
   synthetic.setIndex(syntheticIndex);

   return synthetic;
 }
Пример #3
0
 /**
  * Assigns consecutive indices to the labels of the leaves below (or at) {@code t}, visiting
  * children in the order returned by {@code children()}.
  *
  * @param t The (sub)tree to re-index; the label of every leaf is cast to CoreLabel.
  * @param startIndex The index given to the first leaf encountered.
  * @return The next unused index, i.e. {@code startIndex} plus the number of leaves indexed.
  */
 private static int reIndexLeaves(Tree t, int startIndex) {
   if (!t.isLeaf()) {
     // Internal node: thread the running index through each subtree in turn.
     int nextIndex = startIndex;
     for (Tree subtree : t.children()) {
       nextIndex = reIndexLeaves(subtree, nextIndex);
     }
     return nextIndex;
   }

   // Leaf: stamp it with the current index and hand back the following one.
   final CoreLabel leafLabel = (CoreLabel) t.label();
   leafLabel.setIndex(startIndex);
   return startIndex + 1;
 }
  /**
   * Splits the document's tokens into sentences and stores the resulting list on the document
   * under {@code CoreAnnotations.SentencesAnnotation}.
   *
   * <p>Each sentence is annotated with its text, character offsets, tokens, token offsets and
   * sentence index, plus any unset annotations of the section it falls in and the document id
   * when one is present. Each token receives its 1-based position within the sentence, the
   * sentence index, and the document id when one is present.
   *
   * <p>If setCountLineNumbers is set to true, we count line numbers by telling the underlying
   * splitter to return empty lists of tokens and then treating those empty lists as empty lines.
   * Empty sentences are counted but never included in the annotation.
   *
   * @param annotation The document to annotate; it must carry a tokens annotation.
   * @throws IllegalArgumentException if the document has no tokens annotation
   * @throws IllegalStateException if the splitter returns an empty sentence while line numbers
   *     are not being counted
   */
  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      System.err.print("Sentence splitting ...");
    }
    if (!annotation.has(CoreAnnotations.TokensAnnotation.class)) {
      throw new IllegalArgumentException(
          "WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
    }

    // Document-level inputs.
    final String docText = annotation.get(CoreAnnotations.TextAnnotation.class);
    final List<CoreLabel> docTokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    final String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
    final boolean hasDocID = docID != null;

    // Running state while the sentences are assembled.
    final List<CoreMap> sentences = new ArrayList<CoreMap>();
    int tokensConsumed = 0;
    int linesSeen = 0;
    // Section whose annotations are currently being applied to sentences; null when outside one.
    CoreMap activeSection = null;

    for (List<CoreLabel> sentenceTokens : this.wts.process(docTokens)) {
      if (countLineNumbers) {
        linesSeen++;
      }
      if (sentenceTokens.isEmpty()) {
        // An empty list is only legitimate as an "empty line" marker when counting lines.
        if (countLineNumbers) {
          continue;
        }
        throw new IllegalStateException("unexpected empty sentence: " + sentenceTokens);
      }

      final int sentenceIndex = sentences.size();
      final CoreLabel firstToken = sentenceTokens.get(0);
      final CoreLabel lastToken = sentenceTokens.get(sentenceTokens.size() - 1);

      // The sentence text spans from the first token's start to the last token's end.
      final int begin = firstToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      final int end = lastToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);

      // Create the sentence annotation with its text, character offsets and token offsets.
      final Annotation sentence = new Annotation(docText.substring(begin, end));
      sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
      sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
      sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
      sentence.set(CoreAnnotations.TokenBeginAnnotation.class, tokensConsumed);
      tokensConsumed += sentenceTokens.size();
      sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokensConsumed);
      sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex);

      if (countLineNumbers) {
        sentence.set(CoreAnnotations.LineNumberAnnotation.class, linesSeen);
      }

      // Section handling assumes that a section start appears on the first token of a sentence
      // and a section end on the last token of a sentence.
      final CoreMap sectionStart = firstToken.get(CoreAnnotations.SectionStartAnnotation.class);
      if (sectionStart != null) {
        activeSection = sectionStart;
      }
      if (activeSection != null) {
        // Copy the section's annotations onto the sentence without overwriting what is set.
        ChunkAnnotationUtils.copyUnsetAnnotations(activeSection, sentence);
      }
      if (lastToken.get(CoreAnnotations.SectionEndAnnotation.class) != null) {
        // The section closes with this sentence; later sentences are outside it.
        activeSection = null;
      }

      if (hasDocID) {
        sentence.set(CoreAnnotations.DocIDAnnotation.class, docID);
      }

      // Token positions within a sentence are 1-based.
      int position = 0;
      for (CoreLabel token : sentenceTokens) {
        position++;
        token.setIndex(position);
        token.setSentIndex(sentenceIndex);
        if (hasDocID) {
          token.setDocID(docID);
        }
      }

      sentences.add(sentence);
    }

    // Note: no check is made that tokensConsumed equals the number of document tokens, because
    // the two can differ when sentenceBoundaryToDiscard is initialized.

    // Attach the sentence annotations to the document.
    annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
  }