/**
 * Computes the category string for a tree.
 *
 * <p>The result is the root label, except for an {@code S} root whose single child is a
 * {@code VP}: if that VP's first child is labeled {@code VBG} the category is {@code "NP"},
 * and if it is labeled {@code VBN} the category is {@code "VP"}.
 *
 * @param t the tree to categorize
 * @return the root label of {@code t}, or the {@code "NP"} / {@code "VP"} override described above
 */
private static String getTreeCategory(Tree t) {
  final String rootLabel = t.label().toString();

  // Only an S with exactly one child can be re-categorized.
  if (!rootLabel.equals("S") || t.numChildren() != 1) {
    return rootLabel;
  }

  final Tree onlyChild = t.getChild(0);
  if (!onlyChild.label().toString().equals("VP")) {
    return rootLabel;
  }

  // The label of the VP's first child decides the override.
  final String firstTag = onlyChild.getChild(0).label().toString();
  if (firstTag.equals("VBG")) {
    return "NP";
  }
  if (firstTag.equals("VBN")) {
    return "VP";
  }
  return rootLabel;
}
/**
 * Rebuilds {@code tree} through the tree factory {@code tf}, reducing every non-leaf label to
 * its basic category with grammatical functions stripped.
 *
 * <p>Leaves are recreated with their original label. Each internal node gets a new
 * {@link CategoryWordTag} label copied from the original, whose category (and tag, when the
 * original label implements {@link HasTag}) is passed through
 * {@code basicCategory} and then {@code stripGF} of the treebank language pack. Scores are
 * carried over to the new nodes.
 *
 * @param tree the tree to transform; it is not modified
 * @return the newly built tree
 */
public Tree transformTree(Tree tree) {
  final Label original = tree.label();

  if (tree.isLeaf()) {
    Tree leafCopy = tf.newLeaf(original);
    leafCopy.setScore(tree.score());
    return leafCopy;
  }

  String category = original.value();
  category = treebankLanguagePack().basicCategory(category);
  category = treebankLanguagePack().stripGF(category);

  // Transform the subtrees first, preserving their order.
  List<Tree> newKids = new ArrayList<Tree>(tree.numChildren());
  for (Tree kid : tree.children()) {
    newKids.add(transformTree(kid));
  }

  CategoryWordTag stripped = new CategoryWordTag(original);
  stripped.setCategory(category);
  if (original instanceof HasTag) {
    String posTag = ((HasTag) original).tag();
    posTag = treebankLanguagePack().basicCategory(posTag);
    posTag = treebankLanguagePack().stripGF(posTag);
    stripped.setTag(posTag);
  }

  Tree result = tf.newTreeNode(stripped, newKids);
  result.setScore(tree.score());
  return result;
}
public Tree transformTree(Tree tree) { Label lab = tree.label(); if (tree.isLeaf()) { Tree leaf = tf.newLeaf(lab); leaf.setScore(tree.score()); return leaf; } String s = lab.value(); s = treebankLanguagePack().basicCategory(s); int numKids = tree.numChildren(); List<Tree> children = new ArrayList<Tree>(numKids); for (int cNum = 0; cNum < numKids; cNum++) { Tree child = tree.getChild(cNum); Tree newChild = transformTree(child); // cdm 2007: for just subcategory stripping, null shouldn't happen // if (newChild != null) { children.add(newChild); // } } // if (children.isEmpty()) { // return null; // } CategoryWordTag newLabel = new CategoryWordTag(lab); newLabel.setCategory(s); if (lab instanceof HasTag) { String tag = ((HasTag) lab).tag(); tag = treebankLanguagePack().basicCategory(tag); newLabel.setTag(tag); } Tree node = tf.newTreeNode(newLabel, children); node.setScore(tree.score()); return node; }
List<Tree> prune(List<Tree> treeList, Label label, int start, int end) { // get reference tree if (treeList.size() == 1) { return treeList; } Tree testTree = treeList.get(0).treeFactory().newTreeNode(label, treeList); int goal = Numberer.getGlobalNumberer("states").number(label.value()); Tree tempTree = parser.extractBestParse(goal, start, end); // parser.restoreUnaries(tempTree); Tree pcfgTree = debinarizer.transformTree(tempTree); Set<Constituent> pcfgConstituents = pcfgTree.constituents(new LabeledScoredConstituentFactory()); // delete child labels that are not in reference but do not cross reference List<Tree> prunedChildren = new ArrayList<Tree>(); int childStart = 0; for (int c = 0, numCh = testTree.numChildren(); c < numCh; c++) { Tree child = testTree.getChild(c); boolean isExtra = true; int childEnd = childStart + child.yield().size(); Constituent childConstituent = new LabeledScoredConstituent(childStart, childEnd, child.label(), 0); if (pcfgConstituents.contains(childConstituent)) { isExtra = false; } if (childConstituent.crosses(pcfgConstituents)) { isExtra = false; } if (child.isLeaf() || child.isPreTerminal()) { isExtra = false; } if (pcfgTree.yield().size() != testTree.yield().size()) { isExtra = false; } if (!label.value().startsWith("NP^NP")) { isExtra = false; } if (isExtra) { System.err.println( "Pruning: " + child.label() + " from " + (childStart + start) + " to " + (childEnd + start)); System.err.println("Was: " + testTree + " vs " + pcfgTree); prunedChildren.addAll(child.getChildrenAsList()); } else { prunedChildren.add(child); } childStart = childEnd; } return prunedChildren; }
// STATIC METHODS private static String dropTrailingNP(String description, Tree t) { if (t.numChildren() == 2 && t.getChild(0).label().toString().startsWith("VB") && t.getChild(1).label().toString().equals("PP") && t.getChild(1).numChildren() == 2 && t.getChild(1).getChild(0).label().toString().equals("IN") && t.getChild(1).getChild(1).label().toString().equals("NP")) { description = yield(t.getChild(0)) + " " + yield(t.getChild(1).getChild(0)); } return description; }
@Override public Tree normalizeWholeTree(Tree tree, TreeFactory tf) { tree = tree.prune(hebrewEmptyFilter, tf).spliceOut(aOverAFilter, tf); // Add start symbol so that the root has only one sub-state. Escape any enclosing brackets. // If the "tree" consists entirely of enclosing brackets e.g. ((())) then this method // will return null. In this case, readers e.g. PennTreeReader will try to read the next tree. while (tree != null && (tree.value() == null || tree.value().equals("")) && tree.numChildren() <= 1) tree = tree.firstChild(); if (tree != null && !tree.value().equals(tlp.startSymbol())) tree = tf.newTreeNode(tlp.startSymbol(), Collections.singletonList(tree)); return tree; }
/** * transformTree does all language-specific tree transformations. Any parameterizations should be * inside the specific TreebankLangParserParams class. */ @Override public Tree transformTree(Tree t, Tree root) { if (t == null || t.isLeaf()) { return t; } String parentStr; String grandParentStr; Tree parent; Tree grandParent; if (root == null || t.equals(root)) { parent = null; parentStr = ""; } else { parent = t.parent(root); parentStr = parent.label().value(); } if (parent == null || parent.equals(root)) { grandParent = null; grandParentStr = ""; } else { grandParent = parent.parent(root); grandParentStr = grandParent.label().value(); } String baseParentStr = ctlp.basicCategory(parentStr); String baseGrandParentStr = ctlp.basicCategory(grandParentStr); CoreLabel lab = (CoreLabel) t.label(); String word = lab.word(); String tag = lab.tag(); String baseTag = ctlp.basicCategory(tag); String category = lab.value(); String baseCategory = ctlp.basicCategory(category); if (t.isPreTerminal()) { // it's a POS tag List<String> leftAunts = listBasicCategories(SisterAnnotationStats.leftSisterLabels(parent, grandParent)); List<String> rightAunts = listBasicCategories(SisterAnnotationStats.rightSisterLabels(parent, grandParent)); // Chinese-specific punctuation splits if (chineseSplitPunct && baseTag.equals("PU")) { if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().accept(word)) { tag = tag + "-DOU"; // System.out.println("Punct: Split dou hao"); // debugging } else if (ChineseTreebankLanguagePack.chineseCommaAcceptFilter().accept(word)) { tag = tag + "-COMMA"; // System.out.println("Punct: Split comma"); // debugging } else if (ChineseTreebankLanguagePack.chineseColonAcceptFilter().accept(word)) { tag = tag + "-COLON"; // System.out.println("Punct: Split colon"); // debugging } else if (ChineseTreebankLanguagePack.chineseQuoteMarkAcceptFilter().accept(word)) { if (chineseSplitPunctLR) { if 
(ChineseTreebankLanguagePack.chineseLeftQuoteMarkAcceptFilter().accept(word)) { tag += "-LQUOTE"; } else { tag += "-RQUOTE"; } } else { tag = tag + "-QUOTE"; } // System.out.println("Punct: Split quote"); // debugging } else if (ChineseTreebankLanguagePack.chineseEndSentenceAcceptFilter().accept(word)) { tag = tag + "-ENDSENT"; // System.out.println("Punct: Split end sent"); // debugging } else if (ChineseTreebankLanguagePack.chineseParenthesisAcceptFilter().accept(word)) { if (chineseSplitPunctLR) { if (ChineseTreebankLanguagePack.chineseLeftParenthesisAcceptFilter().accept(word)) { tag += "-LPAREN"; } else { tag += "-RPAREN"; } } else { tag += "-PAREN"; // printlnErr("Just used -PAREN annotation"); // printlnErr(word); // throw new RuntimeException(); } // System.out.println("Punct: Split paren"); // debugging } else if (ChineseTreebankLanguagePack.chineseDashAcceptFilter().accept(word)) { tag = tag + "-DASH"; // System.out.println("Punct: Split dash"); // debugging } else if (ChineseTreebankLanguagePack.chineseOtherAcceptFilter().accept(word)) { tag = tag + "-OTHER"; } else { printlnErr("Unknown punct (you should add it to CTLP): " + tag + " |" + word + "|"); } } else if (chineseSplitDouHao) { // only split DouHao if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().accept(word) && baseTag.equals("PU")) { tag = tag + "-DOU"; } } // Chinese-specific POS tag splits (non-punctuation) if (tagWordSize) { int l = word.length(); tag += "-" + l + "CHARS"; } if (mergeNNVV && baseTag.equals("NN")) { tag = "VV"; } if ((chineseSelectiveTagPA || chineseVerySelectiveTagPA) && (baseTag.equals("CC") || baseTag.equals("P"))) { tag += "-" + baseParentStr; } if (chineseSelectiveTagPA && (baseTag.equals("VV"))) { tag += "-" + baseParentStr; } if (markMultiNtag && tag.startsWith("N")) { for (int i = 0; i < parent.numChildren(); i++) { if (parent.children()[i].label().value().startsWith("N") && parent.children()[i] != t) { tag += "=N"; // System.out.println("Found multi=N 
rewrite"); } } } if (markVVsisterIP && baseTag.equals("VV")) { boolean seenIP = false; for (int i = 0; i < parent.numChildren(); i++) { if (parent.children()[i].label().value().startsWith("IP")) { seenIP = true; } } if (seenIP) { tag += "-IP"; // System.out.println("Found VV with IP sister"); // testing } } if (markPsisterIP && baseTag.equals("P")) { boolean seenIP = false; for (int i = 0; i < parent.numChildren(); i++) { if (parent.children()[i].label().value().startsWith("IP")) { seenIP = true; } } if (seenIP) { tag += "-IP"; } } if (markADgrandchildOfIP && baseTag.equals("AD") && baseGrandParentStr.equals("IP")) { tag += "~IP"; // System.out.println("Found AD with IP grandparent"); // testing } if (gpaAD && baseTag.equals("AD")) { tag += "~" + baseGrandParentStr; // System.out.println("Found AD with grandparent " + grandParentStr); // testing } if (markPostverbalP && leftAunts.contains("VV") && baseTag.equals("P")) { // System.out.println("Found post-verbal P"); tag += "^=lVV"; } // end Chinese-specific tag splits Label label = new CategoryWordTag(tag, word, tag); t.setLabel(label); } else { // it's a phrasal category Tree[] kids = t.children(); // Chinese-specific category splits List<String> leftSis = listBasicCategories(SisterAnnotationStats.leftSisterLabels(t, parent)); List<String> rightSis = listBasicCategories(SisterAnnotationStats.rightSisterLabels(t, parent)); if (paRootDtr && baseParentStr.equals("ROOT")) { category += "^ROOT"; } if (markIPsisterBA && baseCategory.equals("IP")) { if (leftSis.contains("BA")) { category += "=BA"; // System.out.println("Found IP sister of BA"); } } if (dominatesV && hasV(t.preTerminalYield())) { // mark categories containing a verb category += "-v"; } if (markIPsisterVVorP && baseCategory.equals("IP")) { // todo: cdm: is just looking for "P" here selective enough?? 
if (leftSis.contains("VV") || leftSis.contains("P")) { category += "=VVP"; } } if (markIPsisDEC && baseCategory.equals("IP")) { if (rightSis.contains("DEC")) { category += "=DEC"; // System.out.println("Found prenominal IP"); } } if (baseCategory.equals("VP")) { // cdm 2008: this used to just check that it startsWith("VP"), but // I think that was bad because it also matched VPT verb compounds if (chineseSplitVP == 3) { boolean hasCC = false; boolean hasPU = false; boolean hasLexV = false; for (Tree kid : kids) { if (kid.label().value().startsWith("CC")) { hasCC = true; } else if (kid.label().value().startsWith("PU")) { hasPU = true; } else if (StringUtils.lookingAt( kid.label().value(), "(V[ACEV]|VCD|VCP|VNV|VPT|VRD|VSB)")) { hasLexV = true; } } if (hasCC || (hasPU && !hasLexV)) { category += "-CRD"; // System.out.println("Found coordinate VP"); // testing } else if (hasLexV) { category += "-COMP"; // System.out.println("Found complementing VP"); // testing } else { category += "-ADJT"; // System.out.println("Found adjoining VP"); // testing } } else if (chineseSplitVP >= 1) { boolean hasBA = false; for (Tree kid : kids) { if (kid.label().value().startsWith("BA")) { hasBA = true; } else if (chineseSplitVP == 2 && tlp.basicCategory(kid.label().value()).equals("VP")) { for (Tree kidkid : kid.children()) { if (kidkid.label().value().startsWith("BA")) { hasBA = true; } } } } if (hasBA) { category += "-BA"; } } } if (markVPadjunct && baseParentStr.equals("VP")) { // cdm 2008: This used to use startsWith("VP") but changed to baseCat Tree[] sisters = parent.children(); boolean hasVPsister = false; boolean hasCC = false; boolean hasPU = false; boolean hasLexV = false; for (Tree sister : sisters) { if (tlp.basicCategory(sister.label().value()).equals("VP")) { hasVPsister = true; } if (sister.label().value().startsWith("CC")) { hasCC = true; } if (sister.label().value().startsWith("PU")) { hasPU = true; } if (StringUtils.lookingAt(sister.label().value(), 
"(V[ACEV]|VCD|VCP|VNV|VPT|VRD|VSB)")) { hasLexV = true; } } if (hasVPsister && !(hasCC || hasPU || hasLexV)) { category += "-VPADJ"; // System.out.println("Found adjunct of VP"); // testing } } if (markNPmodNP && baseCategory.equals("NP") && baseParentStr.equals("NP")) { if (rightSis.contains("NP")) { category += "=MODIFIERNP"; // System.out.println("Found NP modifier of NP"); // testing } } if (markModifiedNP && baseCategory.equals("NP") && baseParentStr.equals("NP")) { if (rightSis.isEmpty() && (leftSis.contains("ADJP") || leftSis.contains("NP") || leftSis.contains("DNP") || leftSis.contains("QP") || leftSis.contains("CP") || leftSis.contains("PP"))) { category += "=MODIFIEDNP"; // System.out.println("Found modified NP"); // testing } } if (markNPconj && baseCategory.equals("NP") && baseParentStr.equals("NP")) { if (rightSis.contains("CC") || rightSis.contains("PU") || leftSis.contains("CC") || leftSis.contains("PU")) { category += "=CONJ"; // System.out.println("Found NP conjunct"); // testing } } if (markIPconj && baseCategory.equals("IP") && baseParentStr.equals("IP")) { Tree[] sisters = parent.children(); boolean hasCommaSis = false; boolean hasIPSis = false; for (Tree sister : sisters) { if (ctlp.basicCategory(sister.label().value()).equals("PU") && ChineseTreebankLanguagePack.chineseCommaAcceptFilter() .accept(sister.children()[0].label().toString())) { hasCommaSis = true; // System.out.println("Found CommaSis"); // testing } if (ctlp.basicCategory(sister.label().value()).equals("IP") && sister != t) { hasIPSis = true; } } if (hasCommaSis && hasIPSis) { category += "-CONJ"; // System.out.println("Found IP conjunct"); // testing } } if (unaryIP && baseCategory.equals("IP") && t.numChildren() == 1) { category += "-U"; // System.out.println("Found unary IP"); //testing } if (unaryCP && baseCategory.equals("CP") && t.numChildren() == 1) { category += "-U"; // System.out.println("Found unary CP"); //testing } if (splitBaseNP && baseCategory.equals("NP")) { if 
(t.isPrePreTerminal()) { category = category + "-B"; } } // if (Test.verbose) printlnErr(baseCategory + " " + leftSis.toString()); //debugging if (markPostverbalPP && leftSis.contains("VV") && baseCategory.equals("PP")) { // System.out.println("Found post-verbal PP"); category += "=lVV"; } if ((markADgrandchildOfIP || gpaAD) && listBasicCategories(SisterAnnotationStats.kidLabels(t)).contains("AD")) { category += "^ADVP"; } if (markCC) { // was: for (int i = 0; i < kids.length; i++) { // This second version takes an idea from Collins: don't count // marginal conjunctions which don't conjoin 2 things. for (int i = 1; i < kids.length - 1; i++) { String cat2 = kids[i].label().value(); if (cat2.startsWith("CC")) { category += "-CC"; } } } Label label = new CategoryWordTag(category, word, tag); t.setLabel(label); } return t; }
/** * This method has all the ruls for generating a question * * @param bInfo * @param binaryDesc * @param posTags * @param t * @param isReversed * @param isCvt * @return */ private String generateNonCvtQuestion( FormulaGenerationInfo fgInfo, String binaryDesc, List<String> posTags, Tree t, boolean isReversed) { String res; String type1Desc = getType1Desc(fgInfo); if (type1Desc == null) // TODO hack return null; String entityDesc = fgInfo.entityInfo1.desc; String qWord = fgInfo.getQuestionWord(); binaryDesc = binaryDesc.toLowerCase(); String category = getTreeCategory(t); if (binaryDesc.endsWith("here")) { // special type of description that behaves weirdly res = handleHere(binaryDesc, isReversed, type1Desc, entityDesc, qWord); } else if (category.equals("NP") || category.equals("X") || category.contains("SBARQ") || category.equals("ADVP")) { res = handleNP(binaryDesc, isReversed, type1Desc, entityDesc, qWord); } else if (category.equals("VP") || category.equals("ADJP") || category.equals("SBAR") || category.equals("SINV")) { res = handleVP(binaryDesc, posTags, t, isReversed, type1Desc, entityDesc, qWord); } else if (category.equals("PP")) { res = handlePP(binaryDesc, isReversed, type1Desc, entityDesc, qWord); } else if (category.equals("S")) { String NP, VP; if (t.children().length == 2 && t.getChild(0).label().toString().equals("NP") && (t.getChild(1).label().toString().equals("VP") || t.getChild(1).label().toString().equals("ADJP"))) { NP = yield(t.getChild(0)); VP = yield(t.getChild(1)); } else if (t.children().length == 1 && t.getChild(0).label().toString().equals("NP") && t.getChild(0).getChild(0).label().toString().equals("NP") && t.getChild(0).numChildren() == 2) { NP = yield(t.getChild(0).getChild(0)); VP = yield(t.getChild(0).getChild(1)); } else if (t.getChild(0).label().toString().equals("VP")) { NP = ""; VP = yield(t); } else throw new RuntimeException("Unhandled S node: " + t); res = handleS(binaryDesc, isReversed, type1Desc, entityDesc, NP, VP, 
qWord); } else if (category.equals("FRAG")) { if (t.getChild(t.numChildren() - 1).label().toString().equals("PP")) { res = handleFinalPP(fgInfo.bInfo, binaryDesc, isReversed, type1Desc, entityDesc, qWord); } else { if (t.numChildren() == 1 && t.getChild(0).label().toString().equals("NP")) { res = handleNP(binaryDesc, isReversed, type1Desc, entityDesc, qWord); } else if (t.numChildren() == 1 && t.getChild(0).label().toString().equals("VP")) { res = handleVP( binaryDesc, posTags, t.getChild(0), isReversed, type1Desc, entityDesc, qWord); } else if (t.numChildren() == 2 && t.getChild(0).label().toString().equals("NP") && t.getChild(1).label().toString().equals("VP")) { res = handleS( binaryDesc, isReversed, type1Desc, entityDesc, yield(t.getChild(0)), yield(t.getChild(1)), qWord); } else if (posTags.contains("NP")) res = handleNP(binaryDesc, isReversed, type1Desc, entityDesc, qWord); else res = handleVP(binaryDesc, posTags, t, isReversed, type1Desc, entityDesc, qWord); } } else throw new RuntimeException("Not handling " + fgInfo.bInfo + ", category=" + category); return res; }