/**
 * Returns the sentence from its tree representation.
 *
 * @param t the tree representation of the sentence
 * @return the sentence
 */
public static String tree2Words(Tree t) {
  StringBuilder buffer = new StringBuilder();

  List<Tree> leaves = t.getLeaves();
  for (Tree leaf : leaves) {
    String word = ((CoreLabel) leaf.label()).get(CoreAnnotations.ValueAnnotation.class);

    // TODO maybe double-check the preceding whitespace, because transformations could have
    // resulted in a situation where the trailing whitespace of our last token is not the same
    // as the preceding whitespace of our current token.
    // BUT: this would also have to be done in getTokenListFromTree(...)

    // now add the trailing whitespace
    String trailingWhitespaces =
        ((CoreLabel) leaf.label()).get(CoreAnnotations.AfterAnnotation.class);

    // if no whitespace info is available, insert a single space; this may happen for nodes
    // inserted by Tsurgeon operations
    if (trailingWhitespaces == null) {
      trailingWhitespaces = " ";
    }

    buffer.append(word).append(trailingWhitespaces);
  }

  return buffer.toString();
}
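A minimal usage sketch, not part of the original code: it assumes a standard StanfordCoreNLP pipeline (edu.stanford.nlp.pipeline.StanfordCoreNLP and Annotation, java.util.Properties) running the tokenize, ssplit, pos and parse annotators, whose leaf CoreLabels typically carry the ValueAnnotation/AfterAnnotation that tree2Words consumes; if AfterAnnotation is not retained on the leaves, tree2Words simply falls back to single spaces. The method name demoTree2Words and the configuration are illustrative only.

// Hedged usage sketch (assumed setup, not from the original code).
public static String demoTree2Words(String text) {
  Properties props = new Properties();
  // tokenize/ssplit/pos/parse are the annotators needed to obtain a constituency Tree;
  // the default (invertible) tokenizer records the whitespace used as AfterAnnotation
  props.setProperty("annotators", "tokenize,ssplit,pos,parse");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

  Annotation document = new Annotation(text);
  pipeline.annotate(document);

  CoreMap sentence = document.get(CoreAnnotations.SentencesAnnotation.class).get(0);
  Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);

  // should approximately reproduce the surface form of the first sentence
  return tree2Words(tree);
}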
/**
 * Terse representation of a (sub-)tree: NP[the white dog] -vs- (NP (DT the) (JJ white) (NN dog))
 */
public static String abbrevTree(Tree tree) {
  ArrayList<String> toks = new ArrayList<String>();
  for (Tree L : tree.getLeaves()) {
    toks.add(L.label().toString());
  }
  return tree.label().toString() + "[" + StringUtils.join(toks, " ") + "]";
}
private static void printSentenceParseTree(String sentence) {
  System.out.println(
      "ParsedSentence being analyzed: \"" + sentence + "\"\n---------------------------");
  sentence = sentence.replaceAll("\\.", "");

  final ParsedSentence parsedSentence = StanfordCoreNlpClient.parseSentence(sentence, false);
  final Tree tree = parsedSentence.getPosTree();

  final List<Tree> trees = tree.getChild(0).getChildrenAsList();
  for (final Tree part : trees) {
    System.out.print(part.label());
    System.out.println(part);
  }

  tree.indentedListPrint();
  System.out.println();

  final List<Tree> leaves = tree.getLeaves();
  for (final Tree leaf : leaves) {
    System.out.printf("(%s - %s), ", leaf.parent(tree).label(), leaf);
  }
  System.out.println("\n");

  System.out.println(parsedSentence.getDependencies());
}
/**
 * This method creates a string which represents the part of the sentence this <code>tree</code>
 * stands for.
 *
 * @param tree A (partial) syntax tree
 * @return The original sentence part
 */
public static String printTree(Tree tree) {
  final StringBuilder sb = new StringBuilder();
  for (final Tree t : tree.getLeaves()) {
    sb.append(t.toString()).append(" ");
  }
  return sb.toString().trim();
}
private static String toString(Tree tree, boolean plainPrint) {
  if (!plainPrint) {
    return tree.toString();
  }

  StringBuilder sb = new StringBuilder();
  List<Tree> leaves = tree.getLeaves();
  for (Tree leaf : leaves) {
    sb.append(((CoreLabel) leaf.label()).value()).append(' ');
  }
  return sb.toString();
}
public Tense calculateTense(String clause) {
  final Tree posTree = getPosTree(clause);
  final Tree word = posTree.getLeaves().get(0);
  final String pos = word.parent(posTree).label().value().toLowerCase();

  if (pos.equals("md")) {
    return Tense.FUTURE;
  }
  if (pos.equals("vbd") || pos.equals("vbn")) {
    return Tense.PAST;
  }
  return Tense.PRESENT;
}
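For clarity, a hedged illustration of the heuristic above, not taken from the original class: only the POS tag of the first leaf is consulted, so a modal yields FUTURE, a past or participle form yields PAST, and anything else falls through to PRESENT. The expected tags in the comments assume standard Penn Treebank tagging of these example clauses by getPosTree.

// Hypothetical calls from within the same class (calculateTense is an instance method);
// the expected tags are assumptions about how these clauses would be tagged.
Tense t1 = calculateTense("will leave soon");     // first tag MD  -> Tense.FUTURE
Tense t2 = calculateTense("left the building");   // first tag VBD -> Tense.PAST
Tense t3 = calculateTense("leaves the building"); // first tag VBZ -> falls through to Tense.PRESENT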
private LabeledSentence generateSupersenseTaggingInput(Tree sentence) {
  LabeledSentence res = new LabeledSentence();
  List<Tree> leaves = sentence.getLeaves();

  for (int i = 0; i < leaves.size(); i++) {
    String word = leaves.get(i).label().toString();
    Tree preterm = leaves.get(i).parent(sentence);
    String pos = preterm.label().toString();
    String stem = AnalysisUtilities.getInstance().getLemma(word, pos);
    res.addToken(word, stem, pos, "0");
  }

  return res;
}
public static ArrayList<ArrayList<TaggedWord>> getPhrases(Tree parse, int phraseSizeLimit) {
  ArrayList<ArrayList<TaggedWord>> newList = new ArrayList<ArrayList<TaggedWord>>();
  List<Tree> leaves = parse.getLeaves();

  if (leaves.size() <= phraseSizeLimit) {
    // ArrayList<TaggedWord> phraseElements = PreprocessPhrase(parse.taggedYield());
    ArrayList<TaggedWord> phraseElements = Preprocess(parse.taggedYield());
    if (phraseElements.size() > 0) {
      newList.add(phraseElements);
    }
  } else {
    Tree[] childrenNodes = parse.children();
    for (int i = 0; i < childrenNodes.length; i++) {
      Tree currentParse = childrenNodes[i];
      newList.addAll(getPhrases(currentParse, phraseSizeLimit));
    }
  }

  return newList;
}
/**
 * Returns a list of Token annotations from a Tree object.
 *
 * @param aJCas a JCas.
 * @param t a tree.
 * @return the tokens.
 */
public static List<Token> getTokenListFromTree(JCas aJCas, Tree t) {
  List<Token> tokenList = new ArrayList<Token>();

  int index = 0;
  for (Tree leaf : t.getLeaves()) {
    String word = ((CoreLabel) leaf.label()).get(CoreAnnotations.ValueAnnotation.class);
    tokenList.add(new Token(aJCas, index, index + word.length()));

    // get the trailing whitespace to calculate the next index
    String whiteSpaces = ((CoreLabel) leaf.label()).get(CoreAnnotations.AfterAnnotation.class);
    if (whiteSpaces == null) {
      whiteSpaces = " ";
    }
    index += word.length() + whiteSpaces.length();
  }

  return tokenList;
}
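Because getTokenListFromTree(...) advances its offset with the same word + AfterAnnotation scheme (single-space fallback included) that tree2Words(...) uses to rebuild the sentence, the resulting spans index directly into that rebuilt string. Below is a small consistency-check sketch under that assumption; it presumes the UIMA Token type exposes the usual getBegin()/getEnd() accessors and that tree2Words from the first snippet is visible here.

// Hypothetical sanity check (assumption, not part of the original code): each Token span
// should cover exactly its leaf's word in the sentence string rebuilt by tree2Words.
public static void checkTokenOffsets(JCas aJCas, Tree t) {
  String sentence = tree2Words(t);
  List<Tree> leaves = t.getLeaves();
  List<Token> tokens = getTokenListFromTree(aJCas, t);
  for (int i = 0; i < tokens.size(); i++) {
    Token token = tokens.get(i);
    String covered = sentence.substring(token.getBegin(), token.getEnd());
    String word = ((CoreLabel) leaves.get(i).label()).get(CoreAnnotations.ValueAnnotation.class);
    assert covered.equals(word) : "offset mismatch at token " + i;
  }
}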
void initRandomWordVectors(List<Tree> trainingTrees) {
  if (op.numHid == 0) {
    throw new RuntimeException("Cannot create random word vectors for an unknown numHid");
  }

  Set<String> words = Generics.newHashSet();
  words.add(UNKNOWN_WORD);
  for (Tree tree : trainingTrees) {
    List<Tree> leaves = tree.getLeaves();
    for (Tree leaf : leaves) {
      String word = leaf.label().value();
      if (op.lowercaseWordVectors) {
        word = word.toLowerCase();
      }
      words.add(word);
    }
  }

  this.wordVectors = Generics.newTreeMap();
  for (String word : words) {
    SimpleMatrix vector = randomWordVector();
    wordVectors.put(word, vector);
  }
}
private String getHeadNoun(String uri) {
  String[] tokens = lexicalize(uri);

  // if we have multiple tokens, get the head noun
  String head;
  if (tokens.length > 1) {
    head = Joiner.on(" ").join(tokens);

    Annotation document = new Annotation(head);
    pipeline.annotate(document);

    CoreMap sentence = document.get(SentencesAnnotation.class).get(0);
    Tree tree = sentence.get(TreeAnnotation.class);
    Tree headTree = headFinder.determineHead(tree);

    // we assume that the last occurring NN is the head noun
    List<Tree> leaves = headTree.getLeaves();
    head = leaves.get(leaves.size() - 1).label().value();
  } else {
    head = tokens[0];
  }

  return head;
}
/** TODO: clearly this should be a default method in ParserQuery once Java 8 comes out */
@Override
public void restoreOriginalWords(Tree tree) {
  if (originalSentence == null || tree == null) {
    return;
  }

  List<Tree> leaves = tree.getLeaves();
  if (leaves.size() != originalSentence.size()) {
    throw new IllegalStateException(
        "originalWords and sentence of different sizes: "
            + originalSentence.size() + " vs. " + leaves.size()
            + "\n Orig: " + Sentence.listToString(originalSentence)
            + "\n Pars: " + Sentence.listToString(leaves));
  }

  // TODO: get rid of this cast
  Iterator<? extends Label> wordsIterator =
      (Iterator<? extends Label>) originalSentence.iterator();
  for (Tree leaf : leaves) {
    leaf.setLabel(wordsIterator.next());
  }
}
public Tree getSyntacticHeadTree() {
  Tree tree = sentence.get(TreeAnnotation.class);
  return tree.getLeaves().get(syntacticHeadTokenPosition);
}
/**
 * This method searches for an indexed word in a sentence tree.
 *
 * @param wordToFind the word to look for
 * @param treeToSearch the sentence tree to search in
 * @param expectedPOS The expected POS tag for the result. If this is NULL, the method tries to
 *     find a phrase.
 * @param canGoUp If TRUE the method will walk up the tree to find a phrase.
 * @param skip Set to "1" if you want to find the phrase for "in front of". Set to "0" otherwise.
 * @return The largest matching tree.
 */
public static Tree match(
    IndexedWord wordToFind, Tree treeToSearch, String expectedPOS, boolean canGoUp, int skip) {
  int end = wordToFind.get(EndIndexAnnotation.class);
  int begin = wordToFind.get(BeginIndexAnnotation.class);

  // first, find whatever is at the word's index
  for (Tree tree : treeToSearch) {
    CoreLabel lbl = ((CoreLabel) tree.label());
    if (lbl != null
        && lbl.get(EndIndexAnnotation.class) != null
        && lbl.get(EndIndexAnnotation.class) == end) {
      if (lbl.get(BeginIndexAnnotation.class) == begin) {
        // we found the first subtree at the word's index
        // now, check if the word here is our search word
        if (tree.getLeaves().get(0).label().value().equals(wordToFind.value())) {
          // we have found the label
          Tree candidate = tree;

          if (expectedPOS != null) {
            // if we know our desired POS, just keep walking up the tree to find the first
            // instance of the expected POS
            while (!expectedPOS.equals(candidate.value())) {
              // if we don't have the right POS, just try our parent
              candidate = candidate.parent(treeToSearch);
              if (candidate == null) {
                return null;
              }
            }
            candidate = skip(candidate, treeToSearch, expectedPOS, skip);
          } else {
            // else walk up the tree again to find the corresponding phrase
            while (!candidate.isPhrasal()) {
              candidate = candidate.parent(treeToSearch); // edu.stanford.nlp.trees.Tree.parent(Tree root)
              if (candidate == null) {
                return null;
              }
            }
          }

          if (canGoUp) {
            // now keep walking up as long as the phrase label does not change; this should yield
            // the largest representative phrase for this word.
            // guard against a null parent at the root to avoid a NullPointerException
            String phrase = candidate.value();
            Tree parent = candidate.parent(treeToSearch);
            while (parent != null && phrase.equals(parent.value())) {
              candidate = parent;
              parent = candidate.parent(treeToSearch);
            }
          }

          return candidate;
        }
      }
    }
  }
  return null;
}
public List<String> annotateSentenceWithSupersenses(Tree sentence) {
  List<String> result = new ArrayList<String>();

  int numleaves = sentence.getLeaves().size();
  if (numleaves <= 1) {
    return result;
  }
  LabeledSentence labeled = generateSupersenseTaggingInput(sentence);

  // see if a NER socket server is available
  int port = Integer.parseInt(ARKref.getProperties().getProperty("supersenseServerPort", "5557"));
  String host = "127.0.0.1";
  Socket client;
  PrintWriter pw;
  BufferedReader br;
  String line;
  try {
    client = new Socket(host, port);
    pw = new PrintWriter(client.getOutputStream());
    br = new BufferedReader(new InputStreamReader(client.getInputStream()));

    String inputStr = "";
    for (int i = 0; i < labeled.length(); i++) {
      String token = labeled.getTokens().get(i);
      String stem = labeled.getStems().get(i);
      String pos = labeled.getPOS().get(i);
      inputStr += token + "\t" + stem + "\t" + pos + "\n";
    }
    pw.println(inputStr);
    pw.flush(); // flush to complete the transmission

    while ((line = br.readLine()) != null) {
      String[] parts = line.split("\\t");
      result.add(parts[2]);
    }

    br.close();
    pw.close();
    client.close();
  } catch (Exception ex) {
    if (ARKref.Opts.debug) System.err.println("Could not connect to SST server.");
    // ex.printStackTrace();
  }

  // if the socket server is not available, then use a local NER object
  if (result.size() == 0) {
    try {
      if (sst == null) {
        DiscriminativeTagger.loadProperties(ARKref.getPropertiesPath());
        sst = DiscriminativeTagger.loadModel(
            ARKref.getProperties()
                .getProperty("supersenseModelFile", "config/supersenseModel.ser.gz"));
      }
      sst.findBestLabelSequenceViterbi(labeled, sst.getWeights());
      for (String pred : labeled.getPredictions()) {
        result.add(pred);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  // add a bunch of blanks if necessary
  while (result.size() < numleaves) {
    result.add("0");
  }

  if (ARKref.Opts.debug) System.err.println("annotateSentenceSST: " + result);
  return result;
}
private PropertyList addLexicoSyntacticFeatures(
    PropertyList pl,
    Document doc,
    Pair<Integer, Integer> candidate,
    int arg2Line,
    int arg2HeadPos,
    int connStart,
    int connEnd) {
  int arg1Line = candidate.first();
  Tree root = doc.getTree(arg1Line);
  int arg1HeadPos = candidate.second();

  boolean attributive = false;
  String head = root.getLeaves().get(arg1HeadPos).value();
  for (String verb : attributiveVerb) {
    if (head.matches(verb)) {
      attributive = true;
      break;
    }
  }
  pl = PropertyList.add("U=" + attributive, 1.0, pl);

  SimpleDepGraph depGraph = doc.getDepGraph(arg1Line);
  boolean hasClausalComp = false;
  List<SimpleDependency> govDependencies = depGraph.getGovDependencies(arg1HeadPos);
  for (SimpleDependency dep : govDependencies) {
    if (dep.reln().equals("ccomp")) {
      hasClausalComp = true;
      break;
    }
  }
  pl = PropertyList.add("V=" + hasClausalComp, 1.0, pl);
  pl = PropertyList.add("W=" + attributive + "&" + hasClausalComp, 1.0, pl);

  boolean isClausalComp = false;
  List<SimpleDependency> depDependencies = depGraph.getDepDependencies(arg1HeadPos);
  SimpleDependency clausalComp = null;
  for (SimpleDependency dep : depDependencies) {
    if (dep.reln().equals("ccomp")) {
      isClausalComp = true;
      clausalComp = dep;
      break;
    }
  }
  pl = PropertyList.add("X=" + isClausalComp, 1.0, pl);

  if (isClausalComp) {
    int gov = clausalComp.gov();
    String govWord = root.getLeaves().get(gov).value();
    boolean isGovAttributive = false;
    for (String verb : attributiveVerb) {
      if (govWord.matches(verb)) {
        isGovAttributive = true;
        break;
      }
    }
    pl = PropertyList.add("Y=" + isClausalComp + "&" + isGovAttributive, 1.0, pl);
  }

  return pl;
}
private PropertyList addConstituentFeatures(
    PropertyList pl,
    Document doc,
    Pair<Integer, Integer> candidate,
    int arg2Line,
    int arg2HeadPos,
    int connStart,
    int connEnd) {
  Sentence arg2Sentence = doc.getSentence(arg2Line);
  String conn = arg2Sentence.toString(connStart, connEnd);
  int connHeadPos = connAnalyzer.getHeadWord(arg2Sentence.getParseTree(), connStart, connEnd);

  int arg1Line = candidate.first();
  Tree arg1Tree = doc.getTree(arg1Line);
  int arg1HeadPos = candidate.second();

  List<String> path = new ArrayList<String>();
  List<String> pathWithoutPOS = new ArrayList<String>();

  if (arg1Line == arg2Line) {
    Tree root = arg1Tree;
    List<Tree> leaves = root.getLeaves();
    List<Tree> treePath = root.pathNodeToNode(leaves.get(connHeadPos), leaves.get(arg1HeadPos));
    if (treePath != null) {
      for (Tree t : treePath) {
        if (!t.isLeaf()) {
          path.add(t.value());
          if (!t.isPreTerminal()) {
            pathWithoutPOS.add(t.value());
          }
        }
      }
    }
  } else {
    Tree arg2Root = arg2Sentence.getParseTree();
    Tree mainHead = headAnalyzer.getCollinsHead(arg2Root.getChild(0));
    List<Tree> leaves = arg2Root.getLeaves();
    int mainHeadPos = treeAnalyzer.getLeafPosition(arg2Root, mainHead);
    if (mainHeadPos != -1) {
      List<Tree> treePath =
          arg2Root.pathNodeToNode(leaves.get(connHeadPos), leaves.get(mainHeadPos));
      if (treePath != null) {
        for (Tree t : treePath) {
          if (!t.isLeaf()) {
            path.add(t.value());
            if (!t.isPreTerminal()) {
              pathWithoutPOS.add(t.value());
            }
          }
        }
      }
    }

    for (int i = 0; i < Math.abs(arg1Line - arg2Line); i++) {
      path.add("SENT");
      pathWithoutPOS.add("SENT");
    }

    Tree arg1Root = arg1Tree;
    mainHead = headAnalyzer.getCollinsHead(arg1Root.getChild(0));
    leaves = arg1Root.getLeaves();
    mainHeadPos = treeAnalyzer.getLeafPosition(arg1Root, mainHead);
    if (mainHeadPos != -1) {
      List<Tree> treePath =
          arg1Root.pathNodeToNode(leaves.get(mainHeadPos), leaves.get(arg1HeadPos));
      if (treePath != null) {
        for (Tree t : treePath) {
          if (!t.isLeaf()) {
            path.add(t.value());
            if (!t.isPreTerminal()) {
              pathWithoutPOS.add(t.value());
            }
          }
        }
      }
    }
  }

  // H - full path
  // L - CONN & H
  StringBuilder fullPath = new StringBuilder();
  for (String node : path) {
    fullPath.append(node).append(":");
  }
  pl = PropertyList.add("H=" + fullPath.toString(), 1.0, pl);
  pl = PropertyList.add("L=CONN-" + conn + "&" + "H-" + fullPath.toString(), 1.0, pl);

  // I - length of path
  pl = PropertyList.add("I=" + path.size(), 1.0, pl);

  // J - collapsed path without part of speech
  // K - collapsed path without repetitions
  fullPath = new StringBuilder();
  StringBuilder collapsedPath = new StringBuilder();
  String prev = "";
  for (String node : pathWithoutPOS) {
    fullPath.append(node).append(":");
    if (!node.equals(prev)) {
      collapsedPath.append(node).append(":");
    }
    prev = node;
  }
  pl = PropertyList.add("J=" + fullPath.toString(), 1.0, pl);
  pl = PropertyList.add("K=" + collapsedPath.toString(), 1.0, pl);

  return pl;
}