/**
 * Collects the ancestor chain (the "lineage") of every pre-terminal node in a tree.
 *
 * <p>The tree is walked iteratively with an explicit stack. A node is handled when it is popped,
 * i.e. before any of its children, and children are pushed in the order {@code children()} yields
 * them, so they are popped in the reverse of that order. Whenever a pre-terminal is reached, a
 * snapshot of the current ancestor stack (root first, pre-terminal itself excluded) is recorded.
 *
 * <p>Node depth is stored in the index of each node's label, overwriting whatever index was
 * there. Labels must therefore implement {@code HasIndex}; otherwise the casts below fail with a
 * {@code ClassCastException}. The lineages contain {@code CoreLabel}s newly constructed from the
 * node labels, not the tree's own label objects.
 *
 * @param t The tree; may be {@code null}
 * @return One lineage per pre-terminal, in visiting order, or {@code null} if {@code t} is
 *     {@code null}
 */
private static List<List<CoreLabel>> makeLineages(final Tree t) {
  if (t == null) {
    return null;
  }

  // The root is at depth 0; every node's depth is kept in its label's index.
  ((HasIndex) t.label()).setIndex(0);

  final Stack<Tree> pending = new Stack<>();
  pending.push(t);

  // Seed entry for the root. It is discarded by the unwinding loop on the first iteration
  // (its index 0 never equals -1); the root is pushed again below if it has children to expand.
  final Stack<CoreLabel> ancestors = new Stack<>();
  final CoreLabel rootCopy = new CoreLabel(t.label());
  rootCopy.setIndex(0);
  ancestors.push(rootCopy);

  final List<List<CoreLabel>> lineages = new ArrayList<>();
  while (!pending.isEmpty()) {
    final Tree current = pending.pop();
    final int depth = ((HasIndex) current.label()).index();
    final int parentDepth = depth - 1;

    // Unwind until the top of the ancestor stack sits exactly one level above this node.
    while (!ancestors.isEmpty()) {
      if (ancestors.peek().index() == parentDepth) {
        break;
      }
      ancestors.pop();
    }

    if (!current.isPreTerminal()) {
      // Internal node: schedule its children one level deeper, then make it the newest ancestor.
      for (Tree child : current.children()) {
        ((HasIndex) child.label()).setIndex(depth + 1);
        pending.push(child);
      }
      final CoreLabel currentCopy = new CoreLabel(current.label());
      currentCopy.setIndex(depth);
      ancestors.push(currentCopy);
      continue;
    }

    // Pre-terminal: freeze the current ancestor chain (bottom of the stack, i.e. root, first).
    final List<CoreLabel> snapshot = new ArrayList<>(ancestors);
    lineages.add(snapshot);
  }

  if (DEBUG) {
    System.out.println("Lineages:");
    for (List<CoreLabel> lineage : lineages) {
      final StringBuilder line = new StringBuilder();
      for (CoreLabel ancestor : lineage) {
        line.append(ancestor.value()).append(" <- ");
      }
      System.out.println(line.toString());
    }
  }

  return lineages;
}
/**
 * Builds a synthetic node that can be attached to the dependency tree even though it does not
 * correspond to any token of the original sentence.
 *
 * <p>The result starts as a copy of {@code toCopy}; its word, lemma and value are then all set to
 * {@code word}, its NER tag to {@code "O"}, its part of speech to {@code POS}, and its index to
 * {@code sentenceLength + 5}.
 *
 * @param toCopy The CoreLabel whose fields seed the new node.
 * @param word The surface form for the new node (also used as its lemma and value).
 * @param POS The part-of-speech tag for the new node.
 * @return A new CoreLabel based on {@code toCopy} with the overrides described above.
 */
@SuppressWarnings("UnusedDeclaration")
private CoreLabel mockNode(CoreLabel toCopy, String word, String POS) {
  final int syntheticIndex = sentenceLength + 5;

  final CoreLabel synthetic = new CoreLabel(toCopy);
  // Setter order is kept as-is on purpose: CoreLabel setters are not guaranteed to be independent.
  synthetic.setWord(word);
  synthetic.setLemma(word);
  synthetic.setValue(word);
  synthetic.setNER("O");
  synthetic.setTag(POS);
  synthetic.setIndex(syntheticIndex);
  return synthetic;
}
/**
 * Assigns consecutive indices to the labels of all leaves below (and including) {@code t}.
 *
 * <p>Leaves are numbered in the order they are reached by recursing through {@code children()},
 * starting at {@code startIndex}. Leaf labels are cast to {@code CoreLabel}, so any other label
 * type results in a {@code ClassCastException}.
 *
 * @param t The subtree whose leaves are re-indexed.
 * @param startIndex The index to give to the first leaf encountered.
 * @return The next unused index, i.e. {@code startIndex} plus the number of leaves visited.
 */
private static int reIndexLeaves(Tree t, int startIndex) {
  if (t.isLeaf()) {
    ((CoreLabel) t.label()).setIndex(startIndex);
    return startIndex + 1;
  }

  // Thread the running index through each child subtree in turn.
  int nextIndex = startIndex;
  for (Tree subtree : t.children()) {
    nextIndex = reIndexLeaves(subtree, nextIndex);
  }
  return nextIndex;
}
/** * If setCountLineNumbers is set to true, we count line numbers by telling the underlying splitter * to return empty lists of tokens and then treating those empty lists as empty lines. We don't * actually include empty sentences in the annotation, though. */ @Override public void annotate(Annotation annotation) { if (VERBOSE) { System.err.print("Sentence splitting ..."); } if (!annotation.has(CoreAnnotations.TokensAnnotation.class)) { throw new IllegalArgumentException( "WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation); } // get text and tokens from the document String text = annotation.get(CoreAnnotations.TextAnnotation.class); List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class); String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class); // System.err.println("Tokens are: " + tokens); // assemble the sentence annotations int tokenOffset = 0; int lineNumber = 0; // section annotations to mark sentences with CoreMap sectionAnnotations = null; List<CoreMap> sentences = new ArrayList<CoreMap>(); for (List<CoreLabel> sentenceTokens : this.wts.process(tokens)) { if (countLineNumbers) { ++lineNumber; } if (sentenceTokens.isEmpty()) { if (!countLineNumbers) { throw new IllegalStateException("unexpected empty sentence: " + sentenceTokens); } else { continue; } } // get the sentence text from the first and last character offsets int begin = sentenceTokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); int last = sentenceTokens.size() - 1; int end = sentenceTokens.get(last).get(CoreAnnotations.CharacterOffsetEndAnnotation.class); String sentenceText = text.substring(begin, end); // create a sentence annotation with text and token offsets Annotation sentence = new Annotation(sentenceText); sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin); sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end); sentence.set(CoreAnnotations.TokensAnnotation.class, 
sentenceTokens); sentence.set(CoreAnnotations.TokenBeginAnnotation.class, tokenOffset); tokenOffset += sentenceTokens.size(); sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokenOffset); sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentences.size()); if (countLineNumbers) { sentence.set(CoreAnnotations.LineNumberAnnotation.class, lineNumber); } // Annotate sentence with section information. // Assume section start and end appear as first and last tokens of sentence CoreLabel sentenceStartToken = sentenceTokens.get(0); CoreLabel sentenceEndToken = sentenceTokens.get(sentenceTokens.size() - 1); CoreMap sectionStart = sentenceStartToken.get(CoreAnnotations.SectionStartAnnotation.class); if (sectionStart != null) { // Section is started sectionAnnotations = sectionStart; } if (sectionAnnotations != null) { // transfer annotations over to sentence ChunkAnnotationUtils.copyUnsetAnnotations(sectionAnnotations, sentence); } String sectionEnd = sentenceEndToken.get(CoreAnnotations.SectionEndAnnotation.class); if (sectionEnd != null) { sectionAnnotations = null; } if (docID != null) { sentence.set(CoreAnnotations.DocIDAnnotation.class, docID); } int index = 1; for (CoreLabel token : sentenceTokens) { token.setIndex(index++); token.setSentIndex(sentences.size()); if (docID != null) { token.setDocID(docID); } } // add the sentence to the list sentences.add(sentence); } // the condition below is possible if sentenceBoundaryToDiscard is initialized! /* if (tokenOffset != tokens.size()) { throw new RuntimeException(String.format( "expected %d tokens, found %d", tokens.size(), tokenOffset)); } */ // add the sentences annotations to the document annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences); }