public Tree transformTree(Tree tree) { Label lab = tree.label(); if (tree.isLeaf()) { Tree leaf = tf.newLeaf(lab); leaf.setScore(tree.score()); return leaf; } String s = lab.value(); s = treebankLanguagePack().basicCategory(s); int numKids = tree.numChildren(); List<Tree> children = new ArrayList<Tree>(numKids); for (int cNum = 0; cNum < numKids; cNum++) { Tree child = tree.getChild(cNum); Tree newChild = transformTree(child); // cdm 2007: for just subcategory stripping, null shouldn't happen // if (newChild != null) { children.add(newChild); // } } // if (children.isEmpty()) { // return null; // } CategoryWordTag newLabel = new CategoryWordTag(lab); newLabel.setCategory(s); if (lab instanceof HasTag) { String tag = ((HasTag) lab).tag(); tag = treebankLanguagePack().basicCategory(tag); newLabel.setTag(tag); } Tree node = tf.newTreeNode(newLabel, children); node.setScore(tree.score()); return node; }
/**
 * Rebuilds {@code tree} so that each internal node carries a
 * {@link CategoryWordTag} whose category has been run through the language
 * pack's {@code basicCategory} and then {@code stripGF}. When the original
 * label implements {@link HasTag}, its tag gets the same two-step treatment.
 * Leaves are recreated with their original label, and scores are copied.
 *
 * @param tree the tree to transform
 * @return the newly built tree
 */
public Tree transformTree(Tree tree) {
  final Label sourceLabel = tree.label();

  // Leaf: same label, same score, fresh node.
  if (tree.isLeaf()) {
    Tree copiedLeaf = tf.newLeaf(sourceLabel);
    copiedLeaf.setScore(tree.score());
    return copiedLeaf;
  }

  final String basic = treebankLanguagePack().basicCategory(sourceLabel.value());
  final String category = treebankLanguagePack().stripGF(basic);

  // Transform the children depth-first, preserving their order.
  final int childCount = tree.numChildren();
  final List<Tree> newKids = new ArrayList<Tree>(childCount);
  for (int k = 0; k < childCount; k++) {
    newKids.add(transformTree(tree.getChild(k)));
  }

  final CategoryWordTag cleanLabel = new CategoryWordTag(sourceLabel);
  cleanLabel.setCategory(category);
  if (sourceLabel instanceof HasTag) {
    final String basicTag = treebankLanguagePack().basicCategory(((HasTag) sourceLabel).tag());
    cleanLabel.setTag(treebankLanguagePack().stripGF(basicTag));
  }

  final Tree rebuilt = tf.newTreeNode(cleanLabel, newKids);
  rebuilt.setScore(tree.score());
  return rebuilt;
}
/**
 * Builds a flat fall-through tree for a sentence that could not be parsed.
 * Each word becomes a leaf under its own {@code "X"} node, and all of those
 * nodes hang off a single {@code "X"} root.
 *
 * @param words the words of the sentence, in order
 * @return a tree with X for all the internal nodes
 */
public static Tree xTree(List<? extends HasWord> words) {
  final TreeFactory factory = new LabeledScoredTreeFactory();
  final List<Tree> wordNodes = new ArrayList<Tree>();

  for (HasWord token : words) {
    Tree leaf = factory.newLeaf(token.word());
    wordNodes.add(factory.newTreeNode("X", Collections.singletonList(leaf)));
  }

  return factory.newTreeNode("X", wordNodes);
}
/**
 * Builds the parse tree node for an elliptic node in the parse XML: a
 * constituent named after the XML element, with a single leaf whose value is
 * {@link SpanishTreeNormalizer#EMPTY_LEAF_VALUE}.
 *
 * @param root the XML node describing the elliptic element
 * @return the constituent node wrapping the empty leaf
 */
private Tree buildEllipticNode(Node root) {
  final Element element = (Element) root;
  final String nodeName = element.getNodeName();

  final List<Tree> children = new ArrayList<>();
  final Tree emptyLeaf = treeFactory.newLeaf(SpanishTreeNormalizer.EMPTY_LEAF_VALUE);
  // Labels that track a word separately get the same placeholder value.
  if (emptyLeaf.label() instanceof HasWord) {
    HasWord wordLabel = (HasWord) emptyLeaf.label();
    wordLabel.setWord(SpanishTreeNormalizer.EMPTY_LEAF_VALUE);
  }
  children.add(emptyLeaf);

  return treeFactory.newTreeNode(nodeName, children);
}
public static void main(String[] args) { if (args.length < minArgs) { System.out.println(usage()); System.exit(-1); } Properties options = StringUtils.argsToProperties(args, argDefs()); Language language = PropertiesUtils.get(options, "l", Language.English, Language.class); TreebankLangParserParams tlpp = language.params; DiskTreebank tb = null; String encoding = options.getProperty("l", "UTF-8"); boolean removeBracket = PropertiesUtils.getBool(options, "b", false); tlpp.setInputEncoding(encoding); tlpp.setOutputEncoding(encoding); tb = tlpp.diskTreebank(); String[] files = options.getProperty("", "").split("\\s+"); if (files.length != 0) { for (String filename : files) { tb.loadPath(filename); } } else { log.info(usage()); System.exit(-1); } PrintWriter pwo = tlpp.pw(); String startSymbol = tlpp.treebankLanguagePack().startSymbol(); TreeFactory tf = new LabeledScoredTreeFactory(); int nTrees = 0; for (Tree t : tb) { if (removeBracket) { if (t.value().equals(startSymbol)) { t = t.firstChild(); } } else if (!t.value().equals(startSymbol)) { // Add a bracket if it isn't already there t = tf.newTreeNode(startSymbol, Collections.singletonList(t)); } pwo.println(t.toString()); nTrees++; } pwo.close(); System.err.printf("Processed %d trees.%n", nTrees); }
/**
 * Collapses per-character preterminals back into a single word, in place.
 *
 * <p>When {@code tree} is a pre-preterminal whose first child's label value
 * matches {@code ".*_."}, the leaf values under all of its children are
 * concatenated in order and the children are replaced by one leaf holding
 * the result. Any other pre-preterminal is left untouched. For every other
 * node the method recurses into each child.
 *
 * @param tree the tree to modify
 * @return the same {@code tree} instance that was passed in
 */
public static Tree untransformTree(Tree tree) {
  final TreeFactory tf = tree.treeFactory();

  if (!tree.isPrePreTerminal()) {
    for (Tree kid : tree.children()) {
      untransformTree(kid);
    }
    return tree;
  }

  final String firstChildTag = tree.firstChild().label().value();
  if (firstChildTag.matches(".*_.")) {
    final StringBuilder joined = new StringBuilder();
    for (Tree charNode : tree.children()) {
      joined.append(charNode.firstChild().label().value());
    }
    final Tree mergedLeaf = tf.newLeaf(joined.toString());
    tree.setChildren(Collections.singletonList(mergedLeaf));
  }
  return tree;
}
/**
 * Rebuilds {@code tree} so that every preterminal is expanded into one
 * preterminal per {@code char} of its word.
 *
 * <p>Each new preterminal is labelled with the original tag plus a position
 * suffix. With {@code useTwoCharTags} the first character gets {@code _S}
 * and every later one {@code _M}. Otherwise a one-character word gets
 * {@code _S}, and longer words get {@code _B} for the first character,
 * {@code _E} for the last and {@code _M} in between. Nodes that are not
 * preterminals are rebuilt with the same label value and transformed children.
 *
 * @param tree the tree to transform
 * @return a new tree built with {@code tree}'s own factory
 */
public Tree transformTree(Tree tree) {
  final TreeFactory tf = tree.treeFactory();
  final String tag = tree.label().value();

  if (!tree.isPreTerminal()) {
    final List<Tree> rebuiltKids = new ArrayList<>();
    for (Tree kid : tree.children()) {
      rebuiltKids.add(transformTree(kid));
    }
    return tf.newTreeNode(tag, rebuiltKids);
  }

  final String word = tree.firstChild().label().value();
  final int length = word.length();
  final int lastIndex = length - 1;
  final List<Tree> charPreterms = new ArrayList<>();

  for (int pos = 0; pos < length; pos++) {
    final String suffix;
    if (useTwoCharTags) {
      // A one-character word only ever has position 0, so testing the
      // position alone covers both "_S" cases.
      suffix = (pos == 0) ? "_S" : "_M";
    } else if (length == 1) {
      suffix = "_S";
    } else if (pos == 0) {
      suffix = "_B";
    } else if (pos == lastIndex) {
      suffix = "_E";
    } else {
      suffix = "_M";
    }

    final Tree charLeaf = tf.newLeaf(String.valueOf(word.charAt(pos)));
    charPreterms.add(tf.newTreeNode(tag + suffix, Collections.<Tree>singletonList(charLeaf)));
  }

  return tf.newTreeNode(tag, charPreterms);
}
/**
 * Builds the parse tree node for a constituent.
 *
 * <p>The label starts as the trimmed XML element name. When
 * {@code detailedAnnotations} is on, {@code "-coord"} is appended if the
 * coordinating attribute equals {@code "yes"}; otherwise, if the clause-type
 * attribute is present, {@code '-'} plus its value is appended. The label is
 * then passed through {@code treeNormalizer.normalizeNonterminal}.
 *
 * @param root Node describing the constituent
 * @param children Collected child nodes, already parsed
 * @return the constituent node holding {@code children}
 */
private Tree buildConstituentNode(Node root, List<Tree> children) {
  final Element element = (Element) root;
  String category = element.getNodeName().trim();

  if (detailedAnnotations) {
    final boolean coordinating = element.getAttribute(ATTR_COORDINATING).equals("yes");
    if (coordinating) {
      category = category + "-coord";
    } else if (element.hasAttribute(ATTR_CLAUSE_TYPE)) {
      category = category + '-' + element.getAttribute(ATTR_CLAUSE_TYPE);
    }
  }

  final String normalized = treeNormalizer.normalizeNonterminal(category);
  return treeFactory.newTreeNode(normalized, children);
}
/**
 * Builds the parse tree node for the word in the given XML node: a
 * preterminal labelled with the normalized part of speech, over a leaf
 * holding the normalized word.
 *
 * <p>When the labels support it, the leaf label also receives the word and
 * the lemma attribute, and the preterminal label receives the tag.
 *
 * @param root the XML node describing the word
 * @return the preterminal node with the word leaf as its only child
 */
private Tree buildWordNode(Node root) {
  final Element element = (Element) root;

  final String pos = treeNormalizer.normalizeNonterminal(getPOS(element));
  final String lemma = element.getAttribute(ATTR_LEMMA);
  final String surface = treeNormalizer.normalizeTerminal(getWord(element));

  final Tree leaf = treeFactory.newLeaf(surface);
  if (leaf.label() instanceof HasWord) {
    ((HasWord) leaf.label()).setWord(surface);
  }
  if (leaf.label() instanceof HasLemma && lemma != null) {
    ((HasLemma) leaf.label()).setLemma(lemma);
  }

  final List<Tree> children = new ArrayList<>();
  children.add(leaf);

  final Tree preterminal = treeFactory.newTreeNode(pos, children);
  if (preterminal.label() instanceof HasTag) {
    ((HasTag) preterminal.label()).setTag(pos);
  }
  return preterminal;
}
@Override public Tree normalizeWholeTree(Tree tree, TreeFactory tf) { tree = tree.prune(hebrewEmptyFilter, tf).spliceOut(aOverAFilter, tf); // Add start symbol so that the root has only one sub-state. Escape any enclosing brackets. // If the "tree" consists entirely of enclosing brackets e.g. ((())) then this method // will return null. In this case, readers e.g. PennTreeReader will try to read the next tree. while (tree != null && (tree.value() == null || tree.value().equals("")) && tree.numChildren() <= 1) tree = tree.firstChild(); if (tree != null && !tree.value().equals(tlp.startSymbol())) tree = tf.newTreeNode(tlp.startSymbol(), Collections.singletonList(tree)); return tree; }
/**
 * Recursively converts an annotation into a tree built with {@code tFact}.
 *
 * <p>A {@code Constituent} that is not a leaf becomes an internal node
 * labelled with its constituent type, with one subtree per child annotation.
 * Anything else is treated as a {@code Token}: it becomes a leaf holding the
 * covered text, placed under a preterminal labelled with the value of the
 * POS annotation covered by the token. The leaf label additionally records,
 * as {@code BeforeAnnotation} and {@code AfterAnnotation}, one space per
 * character offset between the token and its immediately preceding and
 * following token.
 *
 * <p>The leaf label is cast to {@code CoreLabel}, so {@code tFact} must
 * produce leaves with such labels.
 *
 * @param root the annotation to convert
 * @param tFact factory used to create the tree nodes
 * @return the tree corresponding to {@code root}
 * @throws IllegalStateException if the JCas cannot be obtained from the
 *     annotation's CAS
 */
public static Tree createStanfordTree(Annotation root, TreeFactory tFact) {
  JCas aJCas;
  try {
    aJCas = root.getCAS().getJCas();
  } catch (CASException e) {
    // Keep the original exception as the cause so the failure stays diagnosable.
    throw new IllegalStateException("Unable to get JCas from JCas wrapper", e);
  }

  // define the new (root) node
  Tree rootNode;

  // before we can create a node, we must check if we have any children (we have to know
  // whether to create a node or a leaf - not very dynamic)
  if (root instanceof Constituent && !isLeaf((Constituent) root)) {
    Constituent node = (Constituent) root;
    List<Tree> childNodes = new ArrayList<Tree>();

    // get childNodes from child annotations
    FSArray children = node.getChildren();
    for (int i = 0; i < children.size(); i++) {
      childNodes.add(createStanfordTree(node.getChildren(i), tFact));
    }

    // now create the node with its children
    rootNode = tFact.newTreeNode(node.getConstituentType(), childNodes);
  } else {
    // Handle leaf annotations.
    // Leaves are always Token annotations. A preterminal node with the value of the
    // POS annotation on the token has to be inserted as well, because the POS is not
    // stored directly within the tree.
    Token wordAnnotation = (Token) root;

    // create leaf-node for the tree
    Tree wordNode = tFact.newLeaf(wordAnnotation.getCoveredText());

    // create information about preceding and trailing whitespaces in the leaf node
    StringBuilder preWhitespaces = new StringBuilder();
    StringBuilder trailWhitespaces = new StringBuilder();

    List<Token> precedingTokenList = selectPreceding(aJCas, Token.class, wordAnnotation, 1);
    List<Token> followingTokenList = selectFollowing(aJCas, Token.class, wordAnnotation, 1);

    if (precedingTokenList.size() > 0) {
      Token precedingToken = precedingTokenList.get(0);
      int precedingWhitespaces = wordAnnotation.getBegin() - precedingToken.getEnd();
      for (int i = 0; i < precedingWhitespaces; i++) {
        preWhitespaces.append(" ");
      }
    }

    if (followingTokenList.size() > 0) {
      Token followingToken = followingTokenList.get(0);
      int trailingWhitespaces = followingToken.getBegin() - wordAnnotation.getEnd();
      for (int i = 0; i < trailingWhitespaces; i++) {
        trailWhitespaces.append(" ");
      }
    }

    // write the whitespace information as CoreAnnotations.BeforeAnnotation and
    // CoreAnnotations.AfterAnnotation to the leaf's label
    CoreLabel wordLabel = (CoreLabel) wordNode.label();
    wordLabel.set(CoreAnnotations.BeforeAnnotation.class, preWhitespaces.toString());
    wordLabel.set(CoreAnnotations.AfterAnnotation.class, trailWhitespaces.toString());

    // get the POS annotation covered by the token
    List<POS> coveredPos = JCasUtil.selectCovered(aJCas, POS.class, wordAnnotation);
    // the POS should only cover one token
    assert coveredPos.size() == 1;
    POS pos = coveredPos.get(0);

    // create POS-Node in the tree and attach word-node to it
    rootNode = tFact.newTreeNode(pos.getPosValue(), Arrays.asList((new Tree[] {wordNode})));
  }

  return rootNode;
}