/**
 * Construct a fall-through tree in case we can't parse this sentence.
 *
 * <p>Every token becomes a leaf under its own "X" preterminal, and all
 * preterminals hang off a single "X" root.
 *
 * @param words the tokens the tree must cover
 * @return a tree with X for all the internal nodes
 */
public static Tree xTree(List<? extends HasWord> words) {
  TreeFactory factory = new LabeledScoredTreeFactory();
  List<Tree> preterminals = new ArrayList<Tree>();
  for (HasWord token : words) {
    Tree leaf = factory.newLeaf(token.word());
    preterminals.add(factory.newTreeNode("X", Collections.singletonList(leaf)));
  }
  return factory.newTreeNode("X", preterminals);
}
/**
 * Extracts the surface text from a token, which may be a {@code HasWord},
 * a plain {@code String}, or a {@code CoreMap} carrying a
 * {@code CoreAnnotations.TextAnnotation}.
 *
 * @param o the token object
 * @return the token's text
 * @throws RuntimeException if the token is none of the supported types
 */
@SuppressWarnings("OverlyStrongTypeCast")
private static String getString(Object o) {
  if (o instanceof HasWord) {
    HasWord h = (HasWord) o;
    return h.word();
  } else if (o instanceof String) {
    return (String) o;
  } else if (o instanceof CoreMap) {
    return ((CoreMap) o).get(CoreAnnotations.TextAnnotation.class);
  } else {
    // Bug fix: the old message omitted CoreMap even though it is accepted
    // above, and gave no hint about the actual offending type.
    throw new RuntimeException(
        "Expected token to be a HasWord, String, or CoreMap, but got "
            + (o == null ? "null" : o.getClass().getName()));
  }
}
/**
 * Turns a sentence into a flat phrasal tree. The structure is S -> tag*, and
 * each tag node dominates exactly one word. The tag is taken from the token if
 * it is a {@code TaggedWord}, and is "WD" otherwise. The tag and phrasal nodes
 * carry a StringLabel.
 *
 * @param s The Sentence to make the Tree from
 * @param lf The LabelFactory with which to create the new Tree labels
 * @return The one phrasal level Tree
 */
public static Tree toFlatTree(Sentence<?> s, LabelFactory lf) {
  List<Tree> children = new ArrayList<Tree>(s.length());
  for (HasWord token : s) {
    Tree leaf = new LabeledScoredTreeLeaf(lf.newLabel(token.word()));
    Tree preterminal;
    if (token instanceof TaggedWord) {
      // Use the token's own tag when one is available.
      preterminal = new LabeledScoredTreeNode(
          new StringLabel(((TaggedWord) token).tag()),
          Collections.singletonList(leaf));
    } else {
      preterminal = new LabeledScoredTreeNode(
          lf.newLabel("WD"), Collections.singletonList(leaf));
    }
    children.add(preterminal);
  }
  return new LabeledScoredTreeNode(new StringLabel("S"), children);
}
/** * Returns a List of Lists where each element is built from a run of Words in the input Document. * Specifically, reads through each word in the input document and breaks off a sentence after * finding a valid sentence boundary token or end of file. Note that for this to work, the words * in the input document must have been tokenized with a tokenizer that makes sentence boundary * tokens their own tokens (e.g., {@link PTBTokenizer}). * * @param words A list of already tokenized words (must implement HasWord or be a String) * @return A list of Sentence * @see #WordToSentenceProcessor(String, Set, Set, Pattern, Pattern) */ public List<List<IN>> wordsToSentences(List<? extends IN> words) { List<List<IN>> sentences = Generics.newArrayList(); List<IN> currentSentence = null; List<IN> lastSentence = null; boolean insideRegion = false; for (IN o : words) { String word; if (o instanceof HasWord) { HasWord h = (HasWord) o; word = h.word(); } else if (o instanceof String) { word = (String) o; } else if (o instanceof CoreMap) { word = ((CoreMap) o).get(CoreAnnotations.TextAnnotation.class); } else { throw new RuntimeException("Expected token to be either Word or String."); } boolean forcedEnd = false; if (o instanceof CoreMap) { Boolean forcedEndValue = ((CoreMap) o).get(CoreAnnotations.ForcedSentenceEndAnnotation.class); if (forcedEndValue != null) forcedEnd = forcedEndValue; } if (DEBUG) { EncodingPrintWriter.err.println("Word is " + word, "UTF-8"); } if (currentSentence == null) { currentSentence = new ArrayList<IN>(); } if (sentenceRegionBeginPattern != null && !insideRegion) { if (sentenceRegionBeginPattern.matcher(word).matches()) { insideRegion = true; } if (DEBUG) { System.err.println(" outside region"); } continue; } if (sentenceBoundaryFollowers.contains(word) && lastSentence != null && currentSentence.isEmpty()) { lastSentence.add(o); if (DEBUG) { System.err.println(" added to last"); } } else { boolean newSent = false; if 
(matchesSentenceBoundaryToDiscard(word)) { newSent = true; } else if (sentenceRegionEndPattern != null && sentenceRegionEndPattern.matcher(word).matches()) { insideRegion = false; newSent = true; } else if (sentenceBoundaryTokenPattern.matcher(word).matches()) { currentSentence.add(o); if (DEBUG) { System.err.println(" is sentence boundary; added to current"); } newSent = true; } else if (forcedEnd) { currentSentence.add(o); newSent = true; if (DEBUG) { System.err.println(" annotated to be the end of a sentence"); } } else { currentSentence.add(o); if (DEBUG) { System.err.println(" added to current"); } } if (newSent && currentSentence.size() > 0) { if (DEBUG) { System.err.println(" beginning new sentence"); } sentences.add(currentSentence); // adds this sentence now that it's complete lastSentence = currentSentence; currentSentence = null; // clears the current sentence } } } // add any words at the end, even if there isn't a sentence // terminator at the end of file if (currentSentence != null && currentSentence.size() > 0) { sentences.add(currentSentence); // adds last sentence } return sentences; }