/**
 * Prunes each tree in a left-to-right sequence, tracking the position at which each one starts.
 *
 * <p>The first tree is taken to begin at {@code start}; each later tree begins where the previous
 * tree's yield ended. The yield length is measured before the tree is handed to {@code prune}.
 *
 * @param treeList the trees to prune, in order
 * @param start the position of the first tree's first leaf
 * @return a new list containing the result of {@code prune(tree, position)} for every input tree,
 *     in the same order
 */
private List<Tree> helper(List<Tree> treeList, int start) {
  List<Tree> pruned = new ArrayList<Tree>(treeList.size());
  int position = start;
  for (Tree subtree : treeList) {
    // Measure the span first so the offset does not depend on what prune returns.
    int width = subtree.yield().size();
    pruned.add(prune(subtree, position));
    position += width;
  }
  return pruned;
}
// TODO: Rewrite this as general matching predicate private static boolean hasV(List tags) { for (int i = 0, tsize = tags.size(); i < tsize; i++) { String str = tags.get(i).toString(); if (str.startsWith("V")) { return true; } } return false; }
/**
 * Builds a copy of a tagged-word list in which every tag has been passed through
 * {@code tlp.basicCategory(...)}.
 *
 * @param twList a list whose elements are all {@code TaggedWord}s (declared raw for existing
 *     callers)
 * @param tlp the language pack used to reduce each tag
 * @return a new list of newly created {@code TaggedWord}s with the same words, in the same order
 * @throws ClassCastException if an element of {@code twList} is not a {@code TaggedWord}
 */
private static List<TaggedWord> cleanTags(List twList, TreebankLanguagePack tlp) {
  List<TaggedWord> cleaned = new ArrayList<TaggedWord>(twList.size());
  for (Object element : twList) {
    TaggedWord original = (TaggedWord) element;
    String reducedTag = tlp.basicCategory(original.tag());
    cleaned.add(new TaggedWord(original.word(), reducedTag));
  }
  return cleaned;
}
List<Tree> prune(List<Tree> treeList, Label label, int start, int end) { // get reference tree if (treeList.size() == 1) { return treeList; } Tree testTree = treeList.get(0).treeFactory().newTreeNode(label, treeList); int goal = Numberer.getGlobalNumberer("states").number(label.value()); Tree tempTree = parser.extractBestParse(goal, start, end); // parser.restoreUnaries(tempTree); Tree pcfgTree = debinarizer.transformTree(tempTree); Set<Constituent> pcfgConstituents = pcfgTree.constituents(new LabeledScoredConstituentFactory()); // delete child labels that are not in reference but do not cross reference List<Tree> prunedChildren = new ArrayList<Tree>(); int childStart = 0; for (int c = 0, numCh = testTree.numChildren(); c < numCh; c++) { Tree child = testTree.getChild(c); boolean isExtra = true; int childEnd = childStart + child.yield().size(); Constituent childConstituent = new LabeledScoredConstituent(childStart, childEnd, child.label(), 0); if (pcfgConstituents.contains(childConstituent)) { isExtra = false; } if (childConstituent.crosses(pcfgConstituents)) { isExtra = false; } if (child.isLeaf() || child.isPreTerminal()) { isExtra = false; } if (pcfgTree.yield().size() != testTree.yield().size()) { isExtra = false; } if (!label.value().startsWith("NP^NP")) { isExtra = false; } if (isExtra) { System.err.println( "Pruning: " + child.label() + " from " + (childStart + start) + " to " + (childEnd + start)); System.err.println("Was: " + testTree + " vs " + pcfgTree); prunedChildren.addAll(child.getChildrenAsList()); } else { prunedChildren.add(child); } childStart = childEnd; } return prunedChildren; }
/**
 * Renders a history as a string: each element, in order, preceded by {@code '^'}.
 *
 * <p>Results are memoised in the {@code historyToString} cache. The cache is keyed on a private
 * copy of the list, never on the caller's list: callers pass {@code subList} views, and keeping a
 * view as a map key would pin its backing list in memory and make the key fail if that backing
 * list were later structurally modified. Lookups with a view still hit, because list equality
 * and hashing depend only on the elements.
 *
 * @param history the history elements to render; an empty list yields the empty string
 * @return the concatenation of {@code '^'} + element for every element of {@code history}
 */
protected String historyToString(List history) {
  String cached = (String) historyToString.get(history);
  if (cached != null) {
    return cached;
  }
  StringBuilder sb = new StringBuilder();
  for (Object element : history) {
    sb.append('^').append(element);
  }
  String rendered = sb.toString();
  // Defensive copy: do not retain the caller's (possibly sub-list) instance as the key.
  historyToString.put(new ArrayList<Object>(history), rendered);
  return rendered;
}
public Object formResult() { Set brs = new HashSet(); Set urs = new HashSet(); // scan each rule / history pair int ruleCount = 0; for (Iterator pairI = rulePairs.keySet().iterator(); pairI.hasNext(); ) { if (ruleCount % 100 == 0) { System.err.println("Rules multiplied: " + ruleCount); } ruleCount++; Pair rulePair = (Pair) pairI.next(); Rule baseRule = (Rule) rulePair.first; String baseLabel = (String) ruleToLabel.get(baseRule); List history = (List) rulePair.second; double totalProb = 0; for (int depth = 1; depth <= HISTORY_DEPTH() && depth <= history.size(); depth++) { List subHistory = history.subList(0, depth); double c_label = labelPairs.getCount(new Pair(baseLabel, subHistory)); double c_rule = rulePairs.getCount(new Pair(baseRule, subHistory)); // System.out.println("Multiplying out "+baseRule+" with history "+subHistory); // System.out.println("Count of "+baseLabel+" with "+subHistory+" is "+c_label); // System.out.println("Count of "+baseRule+" with "+subHistory+" is "+c_rule ); double prob = (1.0 / HISTORY_DEPTH()) * (c_rule) / (c_label); totalProb += prob; for (int childDepth = 0; childDepth <= Math.min(HISTORY_DEPTH() - 1, depth); childDepth++) { Rule rule = specifyRule(baseRule, subHistory, childDepth); rule.score = (float) Math.log(totalProb); // System.out.println("Created "+rule+" with score "+rule.score); if (rule instanceof UnaryRule) { urs.add(rule); } else { brs.add(rule); } } } } System.out.println("Total states: " + stateNumberer.total()); BinaryGrammar bg = new BinaryGrammar(stateNumberer.total()); UnaryGrammar ug = new UnaryGrammar(stateNumberer.total()); for (Iterator brI = brs.iterator(); brI.hasNext(); ) { BinaryRule br = (BinaryRule) brI.next(); bg.addRule(br); } for (Iterator urI = urs.iterator(); urI.hasNext(); ) { UnaryRule ur = (UnaryRule) urI.next(); ug.addRule(ur); } return new Pair(ug, bg); }
protected void tallyInternalNode(Tree lt, List parents) { // form base rule String label = lt.label().value(); Rule baseR = ltToRule(lt); ruleToLabel.put(baseR, label); // act on each history depth for (int depth = 0, maxDepth = Math.min(HISTORY_DEPTH(), parents.size()); depth <= maxDepth; depth++) { List history = new ArrayList(parents.subList(0, depth)); // tally each history level / rewrite pair rulePairs.incrementCount(new Pair(baseR, history), 1); labelPairs.incrementCount(new Pair(label, history), 1); } }
public static ArrayList<ArrayList<TaggedWord>> getPhrases(Tree parse, int phraseSizeLimit) { ArrayList<ArrayList<TaggedWord>> newList = new ArrayList<ArrayList<TaggedWord>>(); List<Tree> leaves = parse.getLeaves(); if (leaves.size() <= phraseSizeLimit) { // ArrayList<TaggedWord> phraseElements = PreprocessPhrase(parse.taggedYield()); ArrayList<TaggedWord> phraseElements = Preprocess(parse.taggedYield()); if (phraseElements.size() > 0) newList.add(phraseElements); } else { Tree[] childrenNodes = parse.children(); for (int i = 0; i < childrenNodes.length; i++) { Tree currentParse = childrenNodes[i]; newList.addAll(getPhrases(currentParse, phraseSizeLimit)); } } return newList; }
/** * Assumes the given reader has only tsurgeon operations (not a tregex pattern), and parses these * out, collecting them into one operation. Stops on a whitespace line. * * @throws IOException */ public static TsurgeonPattern getTsurgeonOperationsFromReader(BufferedReader reader) throws IOException { List<TsurgeonPattern> operations = new ArrayList<TsurgeonPattern>(); for (String thisLine; (thisLine = reader.readLine()) != null; ) { if (emptyLinePattern.matcher(thisLine).matches()) { break; } thisLine = removeComments(thisLine); if (emptyLinePattern.matcher(thisLine).matches()) { continue; } // System.err.println("Read tsurgeon op: " + thisLine); operations.add(parseOperation(thisLine)); } if (operations.size() == 0) throw new TsurgeonParseException("No Tsurgeon operation provided."); return collectOperations(operations); }
/**
 * Creates a copy of a rule whose states are annotated with history strings.
 *
 * <p>Two strings are derived from {@code history}: the "top" string from the elements after the
 * first ({@code history.subList(1, size)}) and the "bottom" string from the first
 * {@code childDepth} elements. The parent state is always re-numbered with the top string
 * appended. Each child state is re-numbered with the top string if it is synthetic, left
 * unchanged if it is a tag, and re-numbered with the bottom string otherwise. States are numbered
 * in the order parent, then child (unary) or left child, right child (binary).
 *
 * @param rule the base rule; treated as a {@code BinaryRule} unless it is a {@code UnaryRule}
 * @param history the history to annotate with; must contain at least one element, because the
 *     top string is taken from index 1 onward
 * @param childDepth how many leading history elements annotate ordinary children
 * @return a new rule of the same arity as {@code rule} with annotated states
 */
protected Rule specifyRule(Rule rule, List history, int childDepth) {
  String topHistoryStr = historyToString(history.subList(1, history.size()));
  String bottomHistoryStr = historyToString(history.subList(0, childDepth));

  if (rule instanceof UnaryRule) {
    UnaryRule source = (UnaryRule) rule;
    UnaryRule specified = new UnaryRule();
    specified.parent = stateNumberer.number(stateNumberer.object(source.parent) + topHistoryStr);
    specified.child = specifyChildState(source.child, topHistoryStr, bottomHistoryStr);
    return specified;
  }

  BinaryRule source = (BinaryRule) rule;
  BinaryRule specified = new BinaryRule();
  specified.parent = stateNumberer.number(stateNumberer.object(source.parent) + topHistoryStr);
  specified.leftChild = specifyChildState(source.leftChild, topHistoryStr, bottomHistoryStr);
  specified.rightChild = specifyChildState(source.rightChild, topHistoryStr, bottomHistoryStr);
  return specified;
}

/**
 * Chooses the annotated state for one child: synthetic states take the top history string, tags
 * are returned unchanged, and all other states take the bottom history string.
 */
private int specifyChildState(int child, String topHistoryStr, String bottomHistoryStr) {
  if (isSynthetic(child)) {
    return stateNumberer.number(stateNumberer.object(child) + topHistoryStr);
  }
  if (isTag(child)) {
    return child;
  }
  return stateNumberer.number(stateNumberer.object(child) + bottomHistoryStr);
}
/**
 * Combines a list of operation patterns into a single sequence of operations to be applied.
 *
 * <p>Collecting is required whenever operations share global properties. For example, to insert a
 * named node and then coindex it with another node, the insertion and the coindexation must be
 * collected into one {@link TsurgeonPattern}, so that tsurgeon knows the name of the new node and
 * coindexation becomes possible.
 *
 * @param patterns the {@link TsurgeonPattern} operations to collect into a single compound
 *     operation
 * @return a new {@link TsurgeonPattern} that performs all the operations, in the order given by
 *     {@code patterns}
 */
public static TsurgeonPattern collectOperations(List<TsurgeonPattern> patterns) {
  TsurgeonPattern[] sequence = patterns.toArray(new TsurgeonPattern[patterns.size()]);
  return new TsurgeonPatternRoot(sequence);
}