/** * Loads treebank grammar from first argument and prints it. Just a demonstration of * functionality. <br> * <code>usage: java MemoryTreebank treebankFilesPath</code> * * @param args array of command-line arguments */ public static void main(String[] args) { Timing.startTime(); Treebank treebank = new MemoryTreebank( new TreeReaderFactory() { public TreeReader newTreeReader(Reader in) { return new PennTreeReader(in); } }); treebank.loadPath(args[0]); Timing.endTime(); System.out.println(treebank); }
/** For testing: loads a treebank and prints the trees. */ public static void main(String[] args) { TreebankLangParserParams tlpp = new ChineseTreebankParserParams(); System.out.println("Default encoding is: " + tlpp.diskTreebank().encoding()); if (args.length < 2) { printlnErr( "Usage: edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams treesPath fileRange"); } else { Treebank m = tlpp.diskTreebank(); m.loadPath(args[0], new NumberRangesFileFilter(args[1], false)); for (Tree t : m) { t.pennPrint(tlpp.pw()); } System.out.println("There were " + m.size() + " trees."); } }
/** * Loads treebank grammar from first argument and prints it. Just a demonstration of * functionality. <br> * <code>usage: java MemoryTreebank treebankFilesPath</code> * * @param args array of command-line arguments */ public static void main(String[] args) { Timing.startTime(); Treebank treebank = new DiskTreebank( new TreeReaderFactory() { public TreeReader newTreeReader(Reader in) { return new PennTreeReader(in); } }); Treebank treebank2 = new MemoryTreebank( new TreeReaderFactory() { public TreeReader newTreeReader(Reader in) { return new PennTreeReader(in); } }); treebank.loadPath(args[0]); treebank2.loadPath(args[0]); CompositeTreebank c = new CompositeTreebank(treebank, treebank2); Timing.endTime(); TreeTransformer myTransformer = new MyTreeTransformer(); TreeTransformer myTransformer2 = new MyTreeTransformer2(); TreeTransformer myTransformer3 = new MyTreeTransformer3(); Treebank tf1 = c.transform(myTransformer).transform(myTransformer2).transform(myTransformer3); Treebank tf2 = new TransformingTreebank( new TransformingTreebank(new TransformingTreebank(c, myTransformer), myTransformer2), myTransformer3); TreeTransformer[] tta = {myTransformer, myTransformer2, myTransformer3}; TreeTransformer tt3 = new CompositeTreeTransformer(Arrays.asList(tta)); Treebank tf3 = c.transform(tt3); System.out.println("-------------------------"); System.out.println("COMPOSITE (DISK THEN MEMORY REPEATED VERSION OF) INPUT TREEBANK"); System.out.println(c); System.out.println("-------------------------"); System.out.println("SLOWLY TRANSFORMED TREEBANK, USING TransformingTreebank() CONSTRUCTOR"); Treebank tx1 = new TransformingTreebank(c, myTransformer); System.out.println(tx1); System.out.println("-----"); Treebank tx2 = new TransformingTreebank(tx1, myTransformer2); System.out.println(tx2); System.out.println("-----"); Treebank tx3 = new TransformingTreebank(tx2, myTransformer3); System.out.println(tx3); System.out.println("-------------------------"); System.out.println("TRANSFORMED TREEBANK, USING Treebank.transform()"); System.out.println(tf1); System.out.println("-------------------------"); System.out.println("PRINTING AGAIN TRANSFORMED TREEBANK, USING Treebank.transform()"); System.out.println(tf1); System.out.println("-------------------------"); System.out.println("TRANSFORMED TREEBANK, USING TransformingTreebank() CONSTRUCTOR"); System.out.println(tf2); System.out.println("-------------------------"); System.out.println("TRANSFORMED TREEBANK, USING CompositeTreeTransformer"); System.out.println(tf3); System.out.println("-------------------------"); System.out.println("COMPOSITE (DISK THEN MEMORY REPEATED VERSION OF) INPUT TREEBANK"); System.out.println(c); System.out.println("-------------------------"); } // end main
/** * Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile file-with-trees [-po * matching-pattern operation] operation-file-1 operation-file-2 ... operation-file-n * * <h4>Arguments:</h4> * * Each argument should be the name of a transformation file that contains a list of pattern and * transformation operation list pairs. That is, it is a sequence of pairs of a {@link * TregexPattern} pattern on one or more lines, then a blank line (empty or whitespace), then a * list of transformation operations one per line (as specified by <b>Legal operation syntax</b> * below) to apply when the pattern is matched, and then another blank line (empty or whitespace). * Note the need for blank lines: The code crashes if they are not present as separators (although * the blank line at the end of the file can be omitted). The script file can include comment * lines, either whole comment lines or trailing comments introduced by %, which extend to the end * of line. A needed percent mark can be escaped by a preceding backslash. * * <p>For example, if you want to excise an SBARQ node whenever it is the parent of an SQ node, * and relabel the SQ node to S, your transformation file would look like this: * * <blockquote> * * <code> * SBARQ=n1 < SQ=n2<br> * <br> * excise n1 n1<br> * relabel n2 S * </code> * * </blockquote> * * <p> * * <h4>Options:</h4> * * <ul> * <li><code>-treeFile <filename></code> specify the name of the file that has the trees * you want to transform. * <li><code>-po <matchPattern> <operation></code> Apply a single operation to * every tree using the specified match pattern and the specified operation. Use this option * when you want to quickly try the effect of one pattern/surgery combination, and are too * lazy to write a transformation file. * <li><code>-s</code> Print each output tree on one line (default is pretty-printing). * <li><code>-m</code> For every tree that had a matching pattern, print "before" (prepended as * "Operated on:") and "after" (prepended as "Result:"). Unoperated trees just pass through * the transducer as usual. * <li><code>-encoding X</code> Uses character set X for input and output of trees. * <li><code>-macros <filename></code> A file of macros to use on the tregex pattern. * Macros should be one per line, with original and replacement separated by tabs. * <li><code>-hf <headfinder-class-name></code> use the specified {@link HeadFinder} class * to determine headship relations. * <li><code>-hfArg <string></code> pass a string argument in to the {@link HeadFinder} * class's constructor. <code>-hfArg</code> can be used multiple times to pass in multiple * arguments. * <li><code>-trf <TreeReaderFactory-class-name></code> use the specified {@link * TreeReaderFactory} class to read trees from files. * </ul> * * <h4>Legal operation syntax:</h4> * * <ul> * <li><code>delete <name></code> deletes the node and everything below it. * <li><code>prune <name></code> Like delete, but if, after the pruning, the parent has * no children anymore, the parent is pruned too. Pruning continues to affect all ancestors * until one is found with remaining children. This may result in a null tree. * <li><code>excise <name1> <name2></code> The name1 node should either dominate * or be the same as the name2 node. This excises out everything from name1 to name2. All * the children of name2 go into the parent of name1, where name1 was. * <li><code>relabel <name> <new-label></code> Relabels the node to have the new * label. <br> * There are three possible forms: <br> * <code>relabel nodeX VP</code> - for changing a node label to an alphanumeric string <br> * <code>relabel nodeX /''/</code> - for relabeling a node to something that isn't a valid * identifier without quoting <br> * <code>relabel nodeX /^VB(.*)$/verb\\/$1/</code> - for regular expression based * relabeling. In this case, all matches of the regular expression against the node label * are replaced with the replacement String. This has the semantics of Java/Perl's * replaceAll: you may use capturing groups and put them in replacements with $n. For * example, if the pattern is /foo/bar/ and the node matched is "foo", the replaceAll * semantics result in "barbar". If the pattern is /^foo(.*)$/bar$1/ and node matched is * "foofoo", relabel will result in "barfoo". <br> * When using the regex replacement method, you can also use the sequences ={node} and * %{var} in the replacement string to use captured nodes or variable strings in the * replacement string. For example, if the Tregex pattern was "duck=bar" and the relabel is * /foo/={bar}/, "foofoo" will be replaced with "duckduck". <br> * To concatenate two nodes named in the tregex pattern, for example, you can use the * pattern /^.*$/={foo}={bar}/. Note that the ^.*$ is necessary to make sure the regex * pattern only matches and replaces once on the entire node name. <br> * To get an "=" or a "%" in the replacement, using \ escaping. Also, as in the example you * can escape a slash in the middle of the second and third forms with \\/ and \\\\. <br> * <li><code>insert <name> <position></code> or <code> * insert <tree> <position></code> inserts the named node or tree into the * position specified. * <li><code>move <name> <position></code> moves the named node into the * specified position. * <p>Right now the only ways to specify position are: * <p><code>$+ <name></code> the left sister of the named node<br> * <code>$- <name></code> the right sister of the named node<br> * <code>>i <name></code> the i_th daughter of the named node<br> * <code>>-i <name></code> the i_th daughter, counting from the right, of the * named node. * <li><code>replace <name1> <name2></code> deletes name1 and inserts a copy of * name2 in its place. * <li><code>replace <name> <tree> <tree2>...</code> deletes name and * inserts the new tree(s) in its place. If more than one replacement tree is given, each of * the new subtrees will be added in order where the old tree was. Multiple subtrees at the * root is an illegal operation and will throw an exception. * <li>{@code createSubtree <new-label> <name1> [<name2>]} Create a subtree out of all the nodes * from {@code <name1>} through {@code <name2>} and puts the new subtree where that span * used to be. To limit the operation to just one node, elide {@code <name2>}. * <li><code>adjoin <auxiliary_tree> <name></code> Adjoins the specified auxiliary * tree into the named node. The daughters of the target node will become the daughters of * the foot of the auxiliary tree. * <li><code>adjoinH <auxiliary_tree> <name></code> Similar to adjoin, but * preserves the target node and makes it the root of <tree>. (It is still accessible * as <code>name</code>. The root of the auxiliary tree is ignored.) * <li><code>adjoinF <auxiliary_tree> <name></code> Similar to adjoin, but * preserves the target node and makes it the foot of <tree>. (It is still accessible * as <code>name</code>, and retains its status as parent of its children. The root of the * auxiliary tree is ignored.) * <li> * <dt><code>coindex <name1> <name2> ... <nameM> </code> Puts a (Penn * Treebank style) coindexation suffix of the form "-N" on each of nodes name_1 through * name_m. The value of N will be automatically generated in reference to the existing * coindexations in the tree, so that there is never an accidental clash of indices across * things that are not meant to be coindexed. * </ul> * * <p>In the context of <code>adjoin</code>, <code>adjoinH</code>, and <code>adjoinF</code>, an * auxiliary tree is a tree in Penn Treebank format with <code>@</code> on exactly one of the * leaves denoting the foot of the tree. The operations which use the foot use the labeled node. * For example: <br> * Tsurgeon: <code>adjoin (FOO (BAR@)) foo</code> <br> * Tregex: <code>B=foo</code> <br> * Input: <code>(A (B 1 2))</code> Output: <code>(A (FOO (BAR 1 2)))</code> * * <p>Tsurgeon applies the same operation to the same tree for as long as the given tregex * operation matches. This means that infinite loops are very easy to cause. One common situation * where this comes up is with an insert operation will repeats infinitely many times unless you * add an expression to the tregex that matches against the inserted pattern. For example, this * pattern will infinite loop: * * <blockquote> * * <code> * TregexPattern tregex = TregexPattern.compile("S=node << NP"); <br> * TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) >-1 node"); * </code> * * </blockquote> * * This pattern, though, will terminate: * * <blockquote> * * <code> * TregexPattern tregex = TregexPattern.compile("S=node << NP !<< foo"); <br> * TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) >-1 node"); * </code> * * </blockquote> * * <p>Tsurgeon has (very) limited support for conditional statements. If a pattern is prefaced * with <code>if exists <name></code>, the rest of the pattern will only execute if the * named node was found in the corresponding TregexMatcher. * * @param args a list of names of files each of which contains a single tregex matching pattern * plus a list, one per line, of transformation operations to apply to the matched pattern. * @throws Exception If an I/O or pattern syntax error */ public static void main(String[] args) throws Exception { String headFinderClassName = null; String headFinderOption = "-hf"; String[] headFinderArgs = null; String headFinderArgOption = "-hfArg"; String encoding = "UTF-8"; String encodingOption = "-encoding"; if (args.length == 0) { System.err.println( "Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile <file-with-trees> [-po <matching-pattern> <operation>] <operation-file-1> <operation-file-2> ... <operation-file-n>"); System.exit(0); } String treePrintFormats; String singleLineOption = "-s"; String verboseOption = "-v"; String matchedOption = "-m"; // if set, then print original form of trees that are matched & thus operated on String patternOperationOption = "-po"; String treeFileOption = "-treeFile"; String trfOption = "-trf"; String macroOption = "-macros"; String macroFilename = ""; Map<String, Integer> flagMap = Generics.newHashMap(); flagMap.put(patternOperationOption, 2); flagMap.put(treeFileOption, 1); flagMap.put(trfOption, 1); flagMap.put(singleLineOption, 0); flagMap.put(encodingOption, 1); flagMap.put(headFinderOption, 1); flagMap.put(macroOption, 1); Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap); args = argsMap.get(null); if (argsMap.containsKey(headFinderOption)) headFinderClassName = argsMap.get(headFinderOption)[0]; if (argsMap.containsKey(headFinderArgOption)) headFinderArgs = argsMap.get(headFinderArgOption); if (argsMap.containsKey(verboseOption)) verbose = true; if (argsMap.containsKey(singleLineOption)) treePrintFormats = "oneline,"; else treePrintFormats = "penn,"; if (argsMap.containsKey(encodingOption)) encoding = argsMap.get(encodingOption)[0]; if (argsMap.containsKey(macroOption)) macroFilename = argsMap.get(macroOption)[0]; TreePrint tp = new TreePrint(treePrintFormats, new PennTreebankLanguagePack()); PrintWriter pwOut = new PrintWriter(new OutputStreamWriter(System.out, encoding), true); TreeReaderFactory trf; if (argsMap.containsKey(trfOption)) { String trfClass = argsMap.get(trfOption)[0]; trf = ReflectionLoading.loadByReflection(trfClass); } else { trf = new TregexPattern.TRegexTreeReaderFactory(); } Treebank trees = new DiskTreebank(trf, encoding); if (argsMap.containsKey(treeFileOption)) { trees.loadPath(argsMap.get(treeFileOption)[0]); } List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); TregexPatternCompiler compiler; if (headFinderClassName == null) { compiler = new TregexPatternCompiler(); } else { HeadFinder hf; if (headFinderArgs == null) { hf = ReflectionLoading.loadByReflection(headFinderClassName); } else { hf = ReflectionLoading.loadByReflection(headFinderClassName, (Object[]) headFinderArgs); } compiler = new TregexPatternCompiler(hf); } Macros.addAllMacros(compiler, macroFilename, encoding); if (argsMap.containsKey(patternOperationOption)) { TregexPattern matchPattern = compiler.compile(argsMap.get(patternOperationOption)[0]); TsurgeonPattern p = parseOperation(argsMap.get(patternOperationOption)[1]); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); } else { for (String arg : args) { List<Pair<TregexPattern, TsurgeonPattern>> pairs = getOperationsFromFile(arg, encoding, compiler); for (Pair<TregexPattern, TsurgeonPattern> pair : pairs) { if (verbose) { System.err.println(pair.second()); } ops.add(pair); } } } for (Tree t : trees) { Tree original = t.deepCopy(); Tree result = processPatternsOnTree(ops, t); if (argsMap.containsKey(matchedOption) && matchedOnTree) { pwOut.println("Operated on: "); displayTree(original, tp, pwOut); pwOut.println("Result: "); } displayTree(result, tp, pwOut); } }
public static void main(String[] args) { Options op = new Options(new EnglishTreebankParserParams()); // op.tlpParams may be changed to something else later, so don't use it till // after options are parsed. System.out.println("Currently " + new Date()); System.out.print("Invoked with arguments:"); for (String arg : args) { System.out.print(" " + arg); } System.out.println(); String path = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj"; int trainLow = 200, trainHigh = 2199, testLow = 2200, testHigh = 2219; String serializeFile = null; int i = 0; while (i < args.length && args[i].startsWith("-")) { if (args[i].equalsIgnoreCase("-path") && (i + 1 < args.length)) { path = args[i + 1]; i += 2; } else if (args[i].equalsIgnoreCase("-train") && (i + 2 < args.length)) { trainLow = Integer.parseInt(args[i + 1]); trainHigh = Integer.parseInt(args[i + 2]); i += 3; } else if (args[i].equalsIgnoreCase("-test") && (i + 2 < args.length)) { testLow = Integer.parseInt(args[i + 1]); testHigh = Integer.parseInt(args[i + 2]); i += 3; } else if (args[i].equalsIgnoreCase("-serialize") && (i + 1 < args.length)) { serializeFile = args[i + 1]; i += 2; } else if (args[i].equalsIgnoreCase("-tLPP") && (i + 1 < args.length)) { try { op.tlpParams = (TreebankLangParserParams) Class.forName(args[i + 1]).newInstance(); } catch (ClassNotFoundException e) { System.err.println("Class not found: " + args[i + 1]); } catch (InstantiationException e) { System.err.println("Couldn't instantiate: " + args[i + 1] + ": " + e.toString()); } catch (IllegalAccessException e) { System.err.println("illegal access" + e); } i += 2; } else if (args[i].equals("-encoding")) { // sets encoding for TreebankLangParserParams op.tlpParams.setInputEncoding(args[i + 1]); op.tlpParams.setOutputEncoding(args[i + 1]); i += 2; } else { i = op.setOptionOrWarn(args, i); } } // System.out.println(tlpParams.getClass()); TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack(); Train.sisterSplitters = new HashSet(Arrays.asList(op.tlpParams.sisterSplitters())); // BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams); PrintWriter pw = op.tlpParams.pw(); Test.display(); Train.display(); op.display(); op.tlpParams.display(); // setup tree transforms Treebank trainTreebank = op.tlpParams.memoryTreebank(); MemoryTreebank testTreebank = op.tlpParams.testMemoryTreebank(); // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank(); // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/"; // blippTreebank.loadPath(blippPath, "", true); Timing.startTime(); System.err.print("Reading trees..."); testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true)); if (Test.increasingLength) { Collections.sort(testTreebank, new TreeLengthComparator()); } trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true)); Timing.tick("done."); System.err.print("Binarizing trees..."); TreeAnnotatorAndBinarizer binarizer = null; if (!Train.leftToRight) { binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !Train.outsideFactor(), true); } else { binarizer = new TreeAnnotatorAndBinarizer( op.tlpParams.headFinder(), new LeftHeadFinder(), op.tlpParams, op.forceCNF, !Train.outsideFactor(), true); } CollinsPuncTransformer collinsPuncTransformer = null; if (Train.collinsPunc) { collinsPuncTransformer = new CollinsPuncTransformer(tlp); } TreeTransformer debinarizer = new Debinarizer(op.forceCNF); List<Tree> binaryTrainTrees = new ArrayList<Tree>(); if (Train.selectiveSplit) { Train.splitters = ParentAnnotationStats.getSplitCategories( trainTreebank, Train.tagSelectiveSplit, 0, Train.selectiveSplitCutOff, Train.tagSelectiveSplitCutOff, op.tlpParams.treebankLanguagePack()); if (Train.deleteSplitters != null) { List<String> deleted = new ArrayList<String>(); for (String del : Train.deleteSplitters) { String baseDel = tlp.basicCategory(del); boolean checkBasic = del.equals(baseDel); for (Iterator<String> it = Train.splitters.iterator(); it.hasNext(); ) { String elem = it.next(); String baseElem = tlp.basicCategory(elem); boolean delStr = checkBasic && baseElem.equals(baseDel) || elem.equals(del); if (delStr) { it.remove(); deleted.add(elem); } } } System.err.println("Removed from vertical splitters: " + deleted); } } if (Train.selectivePostSplit) { TreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams); Treebank annotatedTB = trainTreebank.transform(myTransformer); Train.postSplitters = ParentAnnotationStats.getSplitCategories( annotatedTB, true, 0, Train.selectivePostSplitCutOff, Train.tagSelectivePostSplitCutOff, op.tlpParams.treebankLanguagePack()); } if (Train.hSelSplit) { binarizer.setDoSelectiveSplit(false); for (Tree tree : trainTreebank) { if (Train.collinsPunc) { tree = collinsPuncTransformer.transformTree(tree); } // tree.pennPrint(tlpParams.pw()); tree = binarizer.transformTree(tree); // binaryTrainTrees.add(tree); } binarizer.setDoSelectiveSplit(true); } for (Tree tree : trainTreebank) { if (Train.collinsPunc) { tree = collinsPuncTransformer.transformTree(tree); } tree = binarizer.transformTree(tree); binaryTrainTrees.add(tree); } if (Test.verbose) { binarizer.dumpStats(); } List<Tree> binaryTestTrees = new ArrayList<Tree>(); for (Tree tree : testTreebank) { if (Train.collinsPunc) { tree = collinsPuncTransformer.transformTree(tree); } tree = binarizer.transformTree(tree); binaryTestTrees.add(tree); } Timing.tick("done."); // binarization BinaryGrammar bg = null; UnaryGrammar ug = null; DependencyGrammar dg = null; // DependencyGrammar dgBLIPP = null; Lexicon lex = null; // extract grammars Extractor bgExtractor = new BinaryGrammarExtractor(); // Extractor bgExtractor = new SmoothedBinaryGrammarExtractor();//new BinaryGrammarExtractor(); // Extractor lexExtractor = new LexiconExtractor(); // Extractor dgExtractor = new DependencyMemGrammarExtractor(); Extractor dgExtractor = new MLEDependencyGrammarExtractor(op); if (op.doPCFG) { System.err.print("Extracting PCFG..."); Pair bgug = null; if (Train.cheatPCFG) { List allTrees = new ArrayList(binaryTrainTrees); allTrees.addAll(binaryTestTrees); bgug = (Pair) bgExtractor.extract(allTrees); } else { bgug = (Pair) bgExtractor.extract(binaryTrainTrees); } bg = (BinaryGrammar) bgug.second; bg.splitRules(); ug = (UnaryGrammar) bgug.first; ug.purgeRules(); Timing.tick("done."); } System.err.print("Extracting Lexicon..."); lex = op.tlpParams.lex(op.lexOptions); lex.train(binaryTrainTrees); Timing.tick("done."); if (op.doDep) { System.err.print("Extracting Dependencies..."); binaryTrainTrees.clear(); // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new // TransformTreeDependency(tlpParams,true)); DependencyGrammar dg1 = (DependencyGrammar) dgExtractor.extract( trainTreebank.iterator(), new TransformTreeDependency(op.tlpParams, true)); // dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new // TransformTreeDependency(tlpParams)); // dg = (DependencyGrammar) dgExtractor.extract(new // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new // TransformTreeDependency(tlpParams)); // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2); // dg = (DependencyGrammar) dgExtractor.extract(binaryTrainTrees); //uses information whether // the words are known or not, discards unknown words Timing.tick("done."); // System.out.print("Extracting Unknown Word Model..."); // UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees); // Timing.tick("done."); System.out.print("Tuning Dependency Model..."); dg.tune(binaryTestTrees); // System.out.println("TUNE DEPS: "+tuneDeps); Timing.tick("done."); } BinaryGrammar boundBG = bg; UnaryGrammar boundUG = ug; GrammarProjection gp = new NullGrammarProjection(bg, ug); // serialization if (serializeFile != null) { System.err.print("Serializing parser..."); LexicalizedParser.saveParserDataToSerialized( new ParserData(lex, bg, ug, dg, Numberer.getNumberers(), op), serializeFile); Timing.tick("done."); } // test: pcfg-parse and output ExhaustivePCFGParser parser = null; if (op.doPCFG) { parser = new ExhaustivePCFGParser(boundBG, boundUG, lex, op); } ExhaustiveDependencyParser dparser = ((op.doDep && !Test.useFastFactored) ? new ExhaustiveDependencyParser(dg, lex, op) : null); Scorer scorer = (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser, gp), dparser) : null); // Scorer scorer = parser; BiLexPCFGParser bparser = null; if (op.doPCFG && op.doDep) { bparser = (Test.useN5) ? new BiLexPCFGParser.N5BiLexPCFGParser( scorer, parser, dparser, bg, ug, dg, lex, op, gp) : new BiLexPCFGParser(scorer, parser, dparser, bg, ug, dg, lex, op, gp); } LabeledConstituentEval pcfgPE = new LabeledConstituentEval("pcfg PE", true, tlp); LabeledConstituentEval comboPE = new LabeledConstituentEval("combo PE", true, tlp); AbstractEval pcfgCB = new LabeledConstituentEval.CBEval("pcfg CB", true, tlp); AbstractEval pcfgTE = new AbstractEval.TaggingEval("pcfg TE"); AbstractEval comboTE = new AbstractEval.TaggingEval("combo TE"); AbstractEval pcfgTEnoPunct = new AbstractEval.TaggingEval("pcfg nopunct TE"); AbstractEval comboTEnoPunct = new AbstractEval.TaggingEval("combo nopunct TE"); AbstractEval depTE = new AbstractEval.TaggingEval("depnd TE"); AbstractEval depDE = new AbstractEval.DependencyEval("depnd DE", true, tlp.punctuationWordAcceptFilter()); AbstractEval comboDE = new AbstractEval.DependencyEval("combo DE", true, tlp.punctuationWordAcceptFilter()); if (Test.evalb) { EvalB.initEVALBfiles(op.tlpParams); } // int[] countByLength = new int[Test.maxLength+1]; // use a reflection ruse, so one can run this without needing the tagger // edu.stanford.nlp.process.SentenceTagger tagger = (Test.preTag ? new // edu.stanford.nlp.process.SentenceTagger("/u/nlp/data/tagger.params/wsj0-21.holder") : null); SentenceProcessor tagger = null; if (Test.preTag) { try { Class[] argsClass = new Class[] {String.class}; Object[] arguments = new Object[] {"/u/nlp/data/pos-tagger/wsj3t0-18-bidirectional/train-wsj-0-18.holder"}; tagger = (SentenceProcessor) Class.forName("edu.stanford.nlp.tagger.maxent.MaxentTagger") .getConstructor(argsClass) .newInstance(arguments); } catch (Exception e) { System.err.println(e); System.err.println("Warning: No pretagging of sentences will be done."); } } for (int tNum = 0, ttSize = testTreebank.size(); tNum < ttSize; tNum++) { Tree tree = testTreebank.get(tNum); int testTreeLen = tree.yield().size(); if (testTreeLen > Test.maxLength) { continue; } Tree binaryTree = binaryTestTrees.get(tNum); // countByLength[testTreeLen]++; System.out.println("-------------------------------------"); System.out.println("Number: " + (tNum + 1)); System.out.println("Length: " + testTreeLen); // tree.pennPrint(pw); // System.out.println("XXXX The binary tree is"); // binaryTree.pennPrint(pw); // System.out.println("Here are the tags in the lexicon:"); // System.out.println(lex.showTags()); // System.out.println("Here's the tagnumberer:"); // System.out.println(Numberer.getGlobalNumberer("tags").toString()); long timeMil1 = System.currentTimeMillis(); Timing.tick("Starting parse."); if (op.doPCFG) { // System.err.println(Test.forceTags); if (Test.forceTags) { if (tagger != null) { // System.out.println("Using a tagger to set tags"); // System.out.println("Tagged sentence as: " + // tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false)); parser.parse(addLast(tagger.processSentence(cutLast(wordify(binaryTree.yield()))))); } else { // System.out.println("Forcing tags to match input."); parser.parse(cleanTags(binaryTree.taggedYield(), tlp)); } } else { // System.out.println("XXXX Parsing " + binaryTree.yield()); parser.parse(binaryTree.yield()); } // Timing.tick("Done with pcfg phase."); } if (op.doDep) { dparser.parse(binaryTree.yield()); // Timing.tick("Done with dependency phase."); } boolean bothPassed = false; if (op.doPCFG && op.doDep) { bothPassed = bparser.parse(binaryTree.yield()); // Timing.tick("Done with combination phase."); } long timeMil2 = System.currentTimeMillis(); long elapsed = timeMil2 - timeMil1; System.err.println("Time: " + ((int) (elapsed / 100)) / 10.00 + " sec."); // System.out.println("PCFG Best Parse:"); Tree tree2b = null; Tree tree2 = null; // System.out.println("Got full best parse..."); if (op.doPCFG) { tree2b = parser.getBestParse(); tree2 = debinarizer.transformTree(tree2b); } // System.out.println("Debinarized parse..."); // tree2.pennPrint(); // System.out.println("DepG Best Parse:"); Tree tree3 = null; Tree tree3db = null; if (op.doDep) { tree3 = dparser.getBestParse(); // was: but wrong Tree tree3db = debinarizer.transformTree(tree2); tree3db = debinarizer.transformTree(tree3); tree3.pennPrint(pw); } // tree.pennPrint(); // ((Tree)binaryTrainTrees.get(tNum)).pennPrint(); // System.out.println("Combo Best Parse:"); Tree tree4 = null; if (op.doPCFG && op.doDep) { try { tree4 = bparser.getBestParse(); if (tree4 == null) { tree4 = tree2b; } } catch (NullPointerException e) { System.err.println("Blocked, using PCFG parse!"); tree4 = tree2b; } } if (op.doPCFG && !bothPassed) { tree4 = tree2b; } // tree4.pennPrint(); if (op.doDep) { depDE.evaluate(tree3, binaryTree, pw); depTE.evaluate(tree3db, tree, pw); } TreeTransformer tc = op.tlpParams.collinizer(); TreeTransformer tcEvalb = op.tlpParams.collinizerEvalb(); Tree tree4b = null; if (op.doPCFG) { // System.out.println("XXXX Best PCFG was: "); // tree2.pennPrint(); // System.out.println("XXXX Transformed best PCFG is: "); // tc.transformTree(tree2).pennPrint(); // System.out.println("True Best Parse:"); // tree.pennPrint(); // tc.transformTree(tree).pennPrint(); pcfgPE.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw); pcfgCB.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw); if (op.doDep) { comboDE.evaluate((bothPassed ? tree4 : tree3), binaryTree, pw); tree4b = tree4; tree4 = debinarizer.transformTree(tree4); if (op.nodePrune) { NodePruner np = new NodePruner(parser, debinarizer); tree4 = np.prune(tree4); } // tree4.pennPrint(); comboPE.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw); } // pcfgTE.evaluate(tree2, tree); pcfgTE.evaluate(tcEvalb.transformTree(tree2), tcEvalb.transformTree(tree), pw); pcfgTEnoPunct.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw); if (op.doDep) { comboTE.evaluate(tcEvalb.transformTree(tree4), tcEvalb.transformTree(tree), pw); comboTEnoPunct.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw); } System.out.println("PCFG only: " + parser.scoreBinarizedTree(tree2b, 0)); // tc.transformTree(tree2).pennPrint(); tree2.pennPrint(pw); if (op.doDep) { System.out.println("Combo: " + parser.scoreBinarizedTree(tree4b, 0)); // tc.transformTree(tree4).pennPrint(pw); tree4.pennPrint(pw); } System.out.println("Correct:" + parser.scoreBinarizedTree(binaryTree, 0)); /* if (parser.scoreBinarizedTree(tree2b,true) < parser.scoreBinarizedTree(binaryTree,true)) { System.out.println("SCORE INVERSION"); parser.validateBinarizedTree(binaryTree,0); } */ tree.pennPrint(pw); } // end if doPCFG if (Test.evalb) { if (op.doPCFG && op.doDep) { EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree4)); } else if (op.doPCFG) { EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree2)); } else if (op.doDep) { EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree3db)); } } } // end for each tree in test treebank if (Test.evalb) { EvalB.closeEVALBfiles(); } // Test.display(); if (op.doPCFG) { pcfgPE.display(false, pw); System.out.println("Grammar size: " + Numberer.getGlobalNumberer("states").total()); pcfgCB.display(false, pw); if (op.doDep) { comboPE.display(false, pw); } pcfgTE.display(false, pw); pcfgTEnoPunct.display(false, pw); if (op.doDep) { comboTE.display(false, pw); comboTEnoPunct.display(false, pw); } } if (op.doDep) { depTE.display(false, pw); depDE.display(false, pw); } if (op.doPCFG && op.doDep) { comboDE.display(false, pw); } // pcfgPE.printGoodBad(); }