private static void extractSubtrees(List<String> codeStrings, String treeFile) { List<Pair<Integer, Integer>> codes = new ArrayList<Pair<Integer, Integer>>(); for (String s : codeStrings) { Matcher m = codePattern.matcher(s); if (m.matches()) codes.add( new Pair<Integer, Integer>(Integer.parseInt(m.group(1)), Integer.parseInt(m.group(2)))); else throw new RuntimeException("Error: illegal node code " + s); } TreeReaderFactory trf = new TRegexTreeReaderFactory(); MemoryTreebank treebank = new MemoryTreebank(trf); treebank.loadPath(treeFile, null, true); for (Pair<Integer, Integer> code : codes) { Tree t = treebank.get(code.first() - 1); t.getNodeNumber(code.second()).pennPrint(); } }
public static void main(String[] args) { Options op = new Options(new EnglishTreebankParserParams()); // op.tlpParams may be changed to something else later, so don't use it till // after options are parsed. System.out.println("Currently " + new Date()); System.out.print("Invoked with arguments:"); for (String arg : args) { System.out.print(" " + arg); } System.out.println(); String path = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj"; int trainLow = 200, trainHigh = 2199, testLow = 2200, testHigh = 2219; String serializeFile = null; int i = 0; while (i < args.length && args[i].startsWith("-")) { if (args[i].equalsIgnoreCase("-path") && (i + 1 < args.length)) { path = args[i + 1]; i += 2; } else if (args[i].equalsIgnoreCase("-train") && (i + 2 < args.length)) { trainLow = Integer.parseInt(args[i + 1]); trainHigh = Integer.parseInt(args[i + 2]); i += 3; } else if (args[i].equalsIgnoreCase("-test") && (i + 2 < args.length)) { testLow = Integer.parseInt(args[i + 1]); testHigh = Integer.parseInt(args[i + 2]); i += 3; } else if (args[i].equalsIgnoreCase("-serialize") && (i + 1 < args.length)) { serializeFile = args[i + 1]; i += 2; } else if (args[i].equalsIgnoreCase("-tLPP") && (i + 1 < args.length)) { try { op.tlpParams = (TreebankLangParserParams) Class.forName(args[i + 1]).newInstance(); } catch (ClassNotFoundException e) { System.err.println("Class not found: " + args[i + 1]); } catch (InstantiationException e) { System.err.println("Couldn't instantiate: " + args[i + 1] + ": " + e.toString()); } catch (IllegalAccessException e) { System.err.println("illegal access" + e); } i += 2; } else if (args[i].equals("-encoding")) { // sets encoding for TreebankLangParserParams op.tlpParams.setInputEncoding(args[i + 1]); op.tlpParams.setOutputEncoding(args[i + 1]); i += 2; } else { i = op.setOptionOrWarn(args, i); } } // System.out.println(tlpParams.getClass()); TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack(); Train.sisterSplitters = new HashSet(Arrays.asList(op.tlpParams.sisterSplitters())); // BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams); PrintWriter pw = op.tlpParams.pw(); Test.display(); Train.display(); op.display(); op.tlpParams.display(); // setup tree transforms Treebank trainTreebank = op.tlpParams.memoryTreebank(); MemoryTreebank testTreebank = op.tlpParams.testMemoryTreebank(); // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank(); // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/"; // blippTreebank.loadPath(blippPath, "", true); Timing.startTime(); System.err.print("Reading trees..."); testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true)); if (Test.increasingLength) { Collections.sort(testTreebank, new TreeLengthComparator()); } trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true)); Timing.tick("done."); System.err.print("Binarizing trees..."); TreeAnnotatorAndBinarizer binarizer = null; if (!Train.leftToRight) { binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !Train.outsideFactor(), true); } else { binarizer = new TreeAnnotatorAndBinarizer( op.tlpParams.headFinder(), new LeftHeadFinder(), op.tlpParams, op.forceCNF, !Train.outsideFactor(), true); } CollinsPuncTransformer collinsPuncTransformer = null; if (Train.collinsPunc) { collinsPuncTransformer = new CollinsPuncTransformer(tlp); } TreeTransformer debinarizer = new Debinarizer(op.forceCNF); List<Tree> binaryTrainTrees = new ArrayList<Tree>(); if (Train.selectiveSplit) { Train.splitters = ParentAnnotationStats.getSplitCategories( trainTreebank, Train.tagSelectiveSplit, 0, Train.selectiveSplitCutOff, Train.tagSelectiveSplitCutOff, op.tlpParams.treebankLanguagePack()); if (Train.deleteSplitters != null) { List<String> deleted = new ArrayList<String>(); for (String del : Train.deleteSplitters) { String baseDel = tlp.basicCategory(del); boolean checkBasic = del.equals(baseDel); for (Iterator<String> it = Train.splitters.iterator(); it.hasNext(); ) { String elem = it.next(); String baseElem = tlp.basicCategory(elem); boolean delStr = checkBasic && baseElem.equals(baseDel) || elem.equals(del); if (delStr) { it.remove(); deleted.add(elem); } } } System.err.println("Removed from vertical splitters: " + deleted); } } if (Train.selectivePostSplit) { TreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams); Treebank annotatedTB = trainTreebank.transform(myTransformer); Train.postSplitters = ParentAnnotationStats.getSplitCategories( annotatedTB, true, 0, Train.selectivePostSplitCutOff, Train.tagSelectivePostSplitCutOff, op.tlpParams.treebankLanguagePack()); } if (Train.hSelSplit) { binarizer.setDoSelectiveSplit(false); for (Tree tree : trainTreebank) { if (Train.collinsPunc) { tree = collinsPuncTransformer.transformTree(tree); } // tree.pennPrint(tlpParams.pw()); tree = binarizer.transformTree(tree); // binaryTrainTrees.add(tree); } binarizer.setDoSelectiveSplit(true); } for (Tree tree : trainTreebank) { if (Train.collinsPunc) { tree = collinsPuncTransformer.transformTree(tree); } tree = binarizer.transformTree(tree); binaryTrainTrees.add(tree); } if (Test.verbose) { binarizer.dumpStats(); } List<Tree> binaryTestTrees = new ArrayList<Tree>(); for (Tree tree : testTreebank) { if (Train.collinsPunc) { tree = collinsPuncTransformer.transformTree(tree); } tree = binarizer.transformTree(tree); binaryTestTrees.add(tree); } Timing.tick("done."); // binarization BinaryGrammar bg = null; UnaryGrammar ug = null; DependencyGrammar dg = null; // DependencyGrammar dgBLIPP = null; Lexicon lex = null; // extract grammars Extractor bgExtractor = new BinaryGrammarExtractor(); // Extractor bgExtractor = new SmoothedBinaryGrammarExtractor();//new BinaryGrammarExtractor(); // Extractor lexExtractor = new LexiconExtractor(); // Extractor dgExtractor = new DependencyMemGrammarExtractor(); Extractor dgExtractor = new MLEDependencyGrammarExtractor(op); if (op.doPCFG) { System.err.print("Extracting PCFG..."); Pair bgug = null; if (Train.cheatPCFG) { List allTrees = new ArrayList(binaryTrainTrees); allTrees.addAll(binaryTestTrees); bgug = (Pair) bgExtractor.extract(allTrees); } else { bgug = (Pair) bgExtractor.extract(binaryTrainTrees); } bg = (BinaryGrammar) bgug.second; bg.splitRules(); ug = (UnaryGrammar) bgug.first; ug.purgeRules(); Timing.tick("done."); } System.err.print("Extracting Lexicon..."); lex = op.tlpParams.lex(op.lexOptions); lex.train(binaryTrainTrees); Timing.tick("done."); if (op.doDep) { System.err.print("Extracting Dependencies..."); binaryTrainTrees.clear(); // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new // TransformTreeDependency(tlpParams,true)); DependencyGrammar dg1 = (DependencyGrammar) dgExtractor.extract( trainTreebank.iterator(), new TransformTreeDependency(op.tlpParams, true)); // dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new // TransformTreeDependency(tlpParams)); // dg = (DependencyGrammar) dgExtractor.extract(new // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new // TransformTreeDependency(tlpParams)); // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2); // dg = (DependencyGrammar) dgExtractor.extract(binaryTrainTrees); //uses information whether // the words are known or not, discards unknown words Timing.tick("done."); // System.out.print("Extracting Unknown Word Model..."); // UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees); // Timing.tick("done."); System.out.print("Tuning Dependency Model..."); dg.tune(binaryTestTrees); // System.out.println("TUNE DEPS: "+tuneDeps); Timing.tick("done."); } BinaryGrammar boundBG = bg; UnaryGrammar boundUG = ug; GrammarProjection gp = new NullGrammarProjection(bg, ug); // serialization if (serializeFile != null) { System.err.print("Serializing parser..."); LexicalizedParser.saveParserDataToSerialized( new ParserData(lex, bg, ug, dg, Numberer.getNumberers(), op), serializeFile); Timing.tick("done."); } // test: pcfg-parse and output ExhaustivePCFGParser parser = null; if (op.doPCFG) { parser = new ExhaustivePCFGParser(boundBG, boundUG, lex, op); } ExhaustiveDependencyParser dparser = ((op.doDep && !Test.useFastFactored) ? new ExhaustiveDependencyParser(dg, lex, op) : null); Scorer scorer = (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser, gp), dparser) : null); // Scorer scorer = parser; BiLexPCFGParser bparser = null; if (op.doPCFG && op.doDep) { bparser = (Test.useN5) ? new BiLexPCFGParser.N5BiLexPCFGParser( scorer, parser, dparser, bg, ug, dg, lex, op, gp) : new BiLexPCFGParser(scorer, parser, dparser, bg, ug, dg, lex, op, gp); } LabeledConstituentEval pcfgPE = new LabeledConstituentEval("pcfg PE", true, tlp); LabeledConstituentEval comboPE = new LabeledConstituentEval("combo PE", true, tlp); AbstractEval pcfgCB = new LabeledConstituentEval.CBEval("pcfg CB", true, tlp); AbstractEval pcfgTE = new AbstractEval.TaggingEval("pcfg TE"); AbstractEval comboTE = new AbstractEval.TaggingEval("combo TE"); AbstractEval pcfgTEnoPunct = new AbstractEval.TaggingEval("pcfg nopunct TE"); AbstractEval comboTEnoPunct = new AbstractEval.TaggingEval("combo nopunct TE"); AbstractEval depTE = new AbstractEval.TaggingEval("depnd TE"); AbstractEval depDE = new AbstractEval.DependencyEval("depnd DE", true, tlp.punctuationWordAcceptFilter()); AbstractEval comboDE = new AbstractEval.DependencyEval("combo DE", true, tlp.punctuationWordAcceptFilter()); if (Test.evalb) { EvalB.initEVALBfiles(op.tlpParams); } // int[] countByLength = new int[Test.maxLength+1]; // use a reflection ruse, so one can run this without needing the tagger // edu.stanford.nlp.process.SentenceTagger tagger = (Test.preTag ? new // edu.stanford.nlp.process.SentenceTagger("/u/nlp/data/tagger.params/wsj0-21.holder") : null); SentenceProcessor tagger = null; if (Test.preTag) { try { Class[] argsClass = new Class[] {String.class}; Object[] arguments = new Object[] {"/u/nlp/data/pos-tagger/wsj3t0-18-bidirectional/train-wsj-0-18.holder"}; tagger = (SentenceProcessor) Class.forName("edu.stanford.nlp.tagger.maxent.MaxentTagger") .getConstructor(argsClass) .newInstance(arguments); } catch (Exception e) { System.err.println(e); System.err.println("Warning: No pretagging of sentences will be done."); } } for (int tNum = 0, ttSize = testTreebank.size(); tNum < ttSize; tNum++) { Tree tree = testTreebank.get(tNum); int testTreeLen = tree.yield().size(); if (testTreeLen > Test.maxLength) { continue; } Tree binaryTree = binaryTestTrees.get(tNum); // countByLength[testTreeLen]++; System.out.println("-------------------------------------"); System.out.println("Number: " + (tNum + 1)); System.out.println("Length: " + testTreeLen); // tree.pennPrint(pw); // System.out.println("XXXX The binary tree is"); // binaryTree.pennPrint(pw); // System.out.println("Here are the tags in the lexicon:"); // System.out.println(lex.showTags()); // System.out.println("Here's the tagnumberer:"); // System.out.println(Numberer.getGlobalNumberer("tags").toString()); long timeMil1 = System.currentTimeMillis(); Timing.tick("Starting parse."); if (op.doPCFG) { // System.err.println(Test.forceTags); if (Test.forceTags) { if (tagger != null) { // System.out.println("Using a tagger to set tags"); // System.out.println("Tagged sentence as: " + // tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false)); parser.parse(addLast(tagger.processSentence(cutLast(wordify(binaryTree.yield()))))); } else { // System.out.println("Forcing tags to match input."); parser.parse(cleanTags(binaryTree.taggedYield(), tlp)); } } else { // System.out.println("XXXX Parsing " + binaryTree.yield()); parser.parse(binaryTree.yield()); } // Timing.tick("Done with pcfg phase."); } if (op.doDep) { dparser.parse(binaryTree.yield()); // Timing.tick("Done with dependency phase."); } boolean bothPassed = false; if (op.doPCFG && op.doDep) { bothPassed = bparser.parse(binaryTree.yield()); // Timing.tick("Done with combination phase."); } long timeMil2 = System.currentTimeMillis(); long elapsed = timeMil2 - timeMil1; System.err.println("Time: " + ((int) (elapsed / 100)) / 10.00 + " sec."); // System.out.println("PCFG Best Parse:"); Tree tree2b = null; Tree tree2 = null; // System.out.println("Got full best parse..."); if (op.doPCFG) { tree2b = parser.getBestParse(); tree2 = debinarizer.transformTree(tree2b); } // System.out.println("Debinarized parse..."); // tree2.pennPrint(); // System.out.println("DepG Best Parse:"); Tree tree3 = null; Tree tree3db = null; if (op.doDep) { tree3 = dparser.getBestParse(); // was: but wrong Tree tree3db = debinarizer.transformTree(tree2); tree3db = debinarizer.transformTree(tree3); tree3.pennPrint(pw); } // tree.pennPrint(); // ((Tree)binaryTrainTrees.get(tNum)).pennPrint(); // System.out.println("Combo Best Parse:"); Tree tree4 = null; if (op.doPCFG && op.doDep) { try { tree4 = bparser.getBestParse(); if (tree4 == null) { tree4 = tree2b; } } catch (NullPointerException e) { System.err.println("Blocked, using PCFG parse!"); tree4 = tree2b; } } if (op.doPCFG && !bothPassed) { tree4 = tree2b; } // tree4.pennPrint(); if (op.doDep) { depDE.evaluate(tree3, binaryTree, pw); depTE.evaluate(tree3db, tree, pw); } TreeTransformer tc = op.tlpParams.collinizer(); TreeTransformer tcEvalb = op.tlpParams.collinizerEvalb(); Tree tree4b = null; if (op.doPCFG) { // System.out.println("XXXX Best PCFG was: "); // tree2.pennPrint(); // System.out.println("XXXX Transformed best PCFG is: "); // tc.transformTree(tree2).pennPrint(); // System.out.println("True Best Parse:"); // tree.pennPrint(); // tc.transformTree(tree).pennPrint(); pcfgPE.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw); pcfgCB.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw); if (op.doDep) { comboDE.evaluate((bothPassed ? tree4 : tree3), binaryTree, pw); tree4b = tree4; tree4 = debinarizer.transformTree(tree4); if (op.nodePrune) { NodePruner np = new NodePruner(parser, debinarizer); tree4 = np.prune(tree4); } // tree4.pennPrint(); comboPE.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw); } // pcfgTE.evaluate(tree2, tree); pcfgTE.evaluate(tcEvalb.transformTree(tree2), tcEvalb.transformTree(tree), pw); pcfgTEnoPunct.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw); if (op.doDep) { comboTE.evaluate(tcEvalb.transformTree(tree4), tcEvalb.transformTree(tree), pw); comboTEnoPunct.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw); } System.out.println("PCFG only: " + parser.scoreBinarizedTree(tree2b, 0)); // tc.transformTree(tree2).pennPrint(); tree2.pennPrint(pw); if (op.doDep) { System.out.println("Combo: " + parser.scoreBinarizedTree(tree4b, 0)); // tc.transformTree(tree4).pennPrint(pw); tree4.pennPrint(pw); } System.out.println("Correct:" + parser.scoreBinarizedTree(binaryTree, 0)); /* if (parser.scoreBinarizedTree(tree2b,true) < parser.scoreBinarizedTree(binaryTree,true)) { System.out.println("SCORE INVERSION"); parser.validateBinarizedTree(binaryTree,0); } */ tree.pennPrint(pw); } // end if doPCFG if (Test.evalb) { if (op.doPCFG && op.doDep) { EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree4)); } else if (op.doPCFG) { EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree2)); } else if (op.doDep) { EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree3db)); } } } // end for each tree in test treebank if (Test.evalb) { EvalB.closeEVALBfiles(); } // Test.display(); if (op.doPCFG) { pcfgPE.display(false, pw); System.out.println("Grammar size: " + Numberer.getGlobalNumberer("states").total()); pcfgCB.display(false, pw); if (op.doDep) { comboPE.display(false, pw); } pcfgTE.display(false, pw); pcfgTEnoPunct.display(false, pw); if (op.doDep) { comboTE.display(false, pw); comboTEnoPunct.display(false, pw); } } if (op.doDep) { depTE.display(false, pw); depDE.display(false, pw); } if (op.doPCFG && op.doDep) { comboDE.display(false, pw); } // pcfgPE.printGoodBad(); }
/** * Prints out all matches of a tree pattern on each tree in the path. Usage: <br> * <br> * <code> * java edu.stanford.nlp.trees.tregex.TregexPattern [[-TCwfosnu] [-filter] [-h <node-name>]]* pattern * filepath </code> * * <p>Arguments:<br> * * <ul> * <li><code>pattern</code>: the tree pattern which optionally names some set of nodes (i.e., * gives it the "handle") <code>=name</code> (for some arbitrary string "name") * <li><code>filepath</code>: the path to files with trees. If this is a directory, there will * be recursive descent and the pattern will be run on all files beneath the specified * directory. * </ul> * * <p>Options:<br> * <li><code>-C</code> suppresses printing of matches, so only the number of matches is printed. * <li><code>-w</code> causes the whole of a tree that matches to be printed. * <li><code>-f</code> causes the filename to be printed. * <li><code>-i <filename></code> causes the pattern to be matched to be read from <code> * <filename></code> rather than the command line. Don't specify a pattern when this * option is used. * <li><code>-o</code> Specifies that each tree node can be reported only once as the root of a * match (by default a node will be printed once for every <em>way</em> the pattern matches). * <li><code>-s</code> causes trees to be printed all on one line (by default they are pretty * printed). * <li><code>-n</code> causes the number of the tree in which the match was found to be printed * before every match. * <li><code>-u</code> causes only the label of each matching node to be printed, not complete * subtrees. * <li><code>-t</code> causes only the yield (terminal words) of the selected node to be printed * (or the yield of the whole tree, if the <code>-w</code> option is used). * <li><code>-encoding <charset_encoding></code> option allows specification of character * encoding of trees.. * <li><code>-h <node-handle></code> If a <code>-h</code> option is given, the root tree * node will not be printed. Instead, for each <code>node-handle</code> specified, the node * matched and given that handle will be printed. Multiple nodes can be printed by using the * <code>-h</code> option multiple times on a single command line. * <li><code>-hf <headfinder-class-name></code> use the specified {@link HeadFinder} class * to determine headship relations. * <li><code>-hfArg <string></code> pass a string argument in to the {@link HeadFinder} * class's constructor. <code>-hfArg</code> can be used multiple times to pass in multiple * arguments. * <li><code>-trf <TreeReaderFactory-class-name></code> use the specified {@link * TreeReaderFactory} class to read trees from files. * <li><code>-v</code> print every tree that contains no matches of the specified pattern, but * print no matches to the pattern. * <li><code>-x</code> Instead of the matched subtree, print the matched subtree's identifying * number as defined in <tt>tgrep2</tt>:a unique identifier for the subtree and is in the form * s:n, where s is an integer specifying the sentence number in the corpus (starting with 1), * and n is an integer giving the order in which the node is encountered in a depth-first * search starting with 1 at top node in the sentence tree. * <li><code>-extract <code> <tree-file></code> extracts the subtree s:n specified by * <tt>code</tt> from the specified <tt>tree-file</tt>. Overrides all other behavior of * tregex. Can't specify multiple encodings etc. yet. * <li><code>-extractFile <code-file> <tree-file></code> extracts every subtree * specified by the subtree codes in <tt>code-file</tt>, which must appear exactly one per * line, from the specified <tt>tree-file</tt>. Overrides all other behavior of tregex. Can't * specify multiple encodings etc. yet. * <li><code>-filter</code> causes this to act as a filter, reading tree input from stdin * <li><code>-T</code> causes all trees to be printed as processed (for debugging purposes). * Otherwise only matching nodes are printed. * <li><code>-macros <filename></code> filename with macro substitutions to use. file with * tab separated lines original-tab-replacement * </ul> */ public static void main(String[] args) throws IOException { Timing.startTime(); StringBuilder treePrintFormats = new StringBuilder(); String printNonMatchingTreesOption = "-v"; String subtreeCodeOption = "-x"; String extractSubtreesOption = "-extract"; String extractSubtreesFileOption = "-extractFile"; String inputFileOption = "-i"; String headFinderOption = "-hf"; String headFinderArgOption = "-hfArg"; String trfOption = "-trf"; String headFinderClassName = null; String[] headFinderArgs = StringUtils.EMPTY_STRING_ARRAY; String treeReaderFactoryClassName = null; String printHandleOption = "-h"; String markHandleOption = "-k"; String encodingOption = "-encoding"; String encoding = "UTF-8"; String macroOption = "-macros"; String macroFilename = ""; String yieldOnly = "-t"; String printAllTrees = "-T"; String quietMode = "-C"; String wholeTreeMode = "-w"; String filenameOption = "-f"; String oneMatchPerRootNodeMode = "-o"; String reportTreeNumbers = "-n"; String rootLabelOnly = "-u"; String oneLine = "-s"; Map<String, Integer> flagMap = Generics.newHashMap(); flagMap.put(extractSubtreesOption, 2); flagMap.put(extractSubtreesFileOption, 2); flagMap.put(subtreeCodeOption, 0); flagMap.put(printNonMatchingTreesOption, 0); flagMap.put(encodingOption, 1); flagMap.put(inputFileOption, 1); flagMap.put(printHandleOption, 1); flagMap.put(markHandleOption, 2); flagMap.put(headFinderOption, 1); flagMap.put(headFinderArgOption, 1); flagMap.put(trfOption, 1); flagMap.put(macroOption, 1); flagMap.put(yieldOnly, 0); flagMap.put(quietMode, 0); flagMap.put(wholeTreeMode, 0); flagMap.put(printAllTrees, 0); flagMap.put(filenameOption, 0); flagMap.put(oneMatchPerRootNodeMode, 0); flagMap.put(reportTreeNumbers, 0); flagMap.put(rootLabelOnly, 0); flagMap.put(oneLine, 0); Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap); args = argsMap.get(null); if (argsMap.containsKey(encodingOption)) { encoding = argsMap.get(encodingOption)[0]; System.err.println("Encoding set to " + encoding); } PrintWriter errPW = new PrintWriter(new OutputStreamWriter(System.err, encoding), true); if (argsMap.containsKey(extractSubtreesOption)) { List<String> subTreeStrings = Collections.singletonList(argsMap.get(extractSubtreesOption)[0]); extractSubtrees(subTreeStrings, argsMap.get(extractSubtreesOption)[1]); return; } if (argsMap.containsKey(extractSubtreesFileOption)) { List<String> subTreeStrings = Arrays.asList( IOUtils.slurpFile(argsMap.get(extractSubtreesFileOption)[0]).split("\n|\r|\n\r")); extractSubtrees(subTreeStrings, argsMap.get(extractSubtreesFileOption)[0]); return; } if (args.length < 1) { errPW.println( "Usage: java edu.stanford.nlp.trees.tregex.TregexPattern [-T] [-C] [-w] [-f] [-o] [-n] [-s] [-filter] [-hf class] [-trf class] [-h handle]* pattern [filepath]"); return; } String matchString = args[0]; if (argsMap.containsKey(macroOption)) { macroFilename = argsMap.get(macroOption)[0]; } if (argsMap.containsKey(headFinderOption)) { headFinderClassName = argsMap.get(headFinderOption)[0]; errPW.println("Using head finder " + headFinderClassName + "..."); } if (argsMap.containsKey(headFinderArgOption)) { headFinderArgs = argsMap.get(headFinderArgOption); } if (argsMap.containsKey(trfOption)) { treeReaderFactoryClassName = argsMap.get(trfOption)[0]; errPW.println("Using tree reader factory " + treeReaderFactoryClassName + "..."); } if (argsMap.containsKey(printAllTrees)) { TRegexTreeVisitor.printTree = true; } if (argsMap.containsKey(inputFileOption)) { String inputFile = argsMap.get(inputFileOption)[0]; matchString = IOUtils.slurpFile(inputFile, encoding); String[] newArgs = new String[args.length + 1]; System.arraycopy(args, 0, newArgs, 1, args.length); args = newArgs; } if (argsMap.containsKey(quietMode)) { TRegexTreeVisitor.printMatches = false; TRegexTreeVisitor.printNumMatchesToStdOut = true; } if (argsMap.containsKey(printNonMatchingTreesOption)) { TRegexTreeVisitor.printNonMatchingTrees = true; } if (argsMap.containsKey(subtreeCodeOption)) { TRegexTreeVisitor.printSubtreeCode = true; TRegexTreeVisitor.printMatches = false; } if (argsMap.containsKey(wholeTreeMode)) { TRegexTreeVisitor.printWholeTree = true; } if (argsMap.containsKey(filenameOption)) { TRegexTreeVisitor.printFilename = true; } if (argsMap.containsKey(oneMatchPerRootNodeMode)) TRegexTreeVisitor.oneMatchPerRootNode = true; if (argsMap.containsKey(reportTreeNumbers)) TRegexTreeVisitor.reportTreeNumbers = true; if (argsMap.containsKey(rootLabelOnly)) { treePrintFormats.append(TreePrint.rootLabelOnlyFormat).append(','); } else if (argsMap.containsKey(oneLine)) { // display short form treePrintFormats.append("oneline,"); } else if (argsMap.containsKey(yieldOnly)) { treePrintFormats.append("words,"); } else { treePrintFormats.append("penn,"); } HeadFinder hf = new CollinsHeadFinder(); if (headFinderClassName != null) { Class[] hfArgClasses = new Class[headFinderArgs.length]; for (int i = 0; i < hfArgClasses.length; i++) hfArgClasses[i] = String.class; try { hf = (HeadFinder) Class.forName(headFinderClassName) .getConstructor(hfArgClasses) .newInstance( (Object[]) headFinderArgs); // cast to Object[] necessary to avoid varargs-related // warning. } catch (Exception e) { throw new RuntimeException("Error occurred while constructing HeadFinder: " + e); } } TRegexTreeVisitor.tp = new TreePrint(treePrintFormats.toString(), new PennTreebankLanguagePack()); try { // TreePattern p = TreePattern.compile("/^S/ > S=dt $++ '' $-- ``"); TregexPatternCompiler tpc = new TregexPatternCompiler(hf); Macros.addAllMacros(tpc, macroFilename, encoding); TregexPattern p = tpc.compile(matchString); errPW.println("Pattern string:\n" + p.pattern()); errPW.println("Parsed representation:"); p.prettyPrint(errPW); String[] handles = argsMap.get(printHandleOption); if (argsMap.containsKey("-filter")) { TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName); treebank = new MemoryTreebank( trf, encoding); // has to be in memory since we're not storing it on disk // read from stdin Reader reader = new BufferedReader(new InputStreamReader(System.in, encoding)); ((MemoryTreebank) treebank).load(reader); reader.close(); } else if (args.length == 1) { errPW.println("using default tree"); TreeReader r = new PennTreeReader( new StringReader( "(VP (VP (VBZ Try) (NP (NP (DT this) (NN wine)) (CC and) (NP (DT these) (NNS snails)))) (PUNCT .))"), new LabeledScoredTreeFactory(new StringLabelFactory())); Tree t = r.readTree(); treebank = new MemoryTreebank(); treebank.add(t); } else { int last = args.length - 1; errPW.println("Reading trees from file(s) " + args[last]); TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName); treebank = new DiskTreebank(trf, encoding); treebank.loadPath(args[last], null, true); } TRegexTreeVisitor vis = new TRegexTreeVisitor(p, handles, encoding); treebank.apply(vis); Timing.endTime(); if (TRegexTreeVisitor.printMatches) { errPW.println("There were " + vis.numMatches() + " matches in total."); } if (TRegexTreeVisitor.printNumMatchesToStdOut) { System.out.println(vis.numMatches()); } } catch (IOException e) { e.printStackTrace(); } catch (TregexParseException e) { errPW.println("Error parsing expression: " + args[0]); errPW.println("Parse exception: " + e.toString()); } }