/**
 * Small demo driver: reads trees from a file and prints each one before and after it is passed
 * through {@code QPTreeTransformer.transformTree}.
 *
 * <p>The arguments are converted with {@code StringUtils.argsToProperties}; only the {@code
 * treeFile} property is consulted. If it is absent, no trees are read and nothing is printed.
 * The file is decoded with the platform default charset, because no charset is passed to the
 * {@code InputStreamReader}.
 *
 * @param args command-line arguments naming the tree file
 * @throws RuntimeException if the tree file cannot be opened or read; the underlying {@code
 *     IOException} is attached as the cause
 */
public static void main(String[] args) {
  QPTreeTransformer transformer = new QPTreeTransformer();
  Treebank tb = new MemoryTreebank();
  Properties props = StringUtils.argsToProperties(args);
  String treeFileName = props.getProperty("treeFile");

  if (treeFileName != null) {
    TreeReader tr = null;
    try {
      tr =
          new PennTreeReader(
              new BufferedReader(new InputStreamReader(new FileInputStream(treeFileName))),
              new LabeledScoredTreeFactory());
      Tree t;
      while ((t = tr.readTree()) != null) {
        tb.add(t);
      }
    } catch (IOException e) {
      // Same message as before, but chain the cause so the original stack trace survives.
      throw new RuntimeException("File problem: " + e, e);
    } finally {
      // Release the file handle whether or not reading succeeded.
      if (tr != null) {
        try {
          tr.close();
        } catch (IOException ignored) {
          // Nothing useful can be done about a failed close: either all trees were already
          // read, or a more informative exception is already propagating.
        }
      }
    }
  }

  for (Tree t : tb) {
    System.out.println("Original tree");
    t.pennPrint();
    System.out.println();
    System.out.println("Tree transformed");
    Tree tree = transformer.transformTree(t);
    tree.pennPrint();
    System.out.println();
    System.out.println("----------------------------");
  }
}
/** * Load a collection of parse trees from a Reader. Each tree may optionally be encased in parens * to allow for Penn Treebank style trees. * * @param r The reader to read trees from. (If you want it buffered, you should already have * buffered it!) * @param id An ID for where these files come from (arbitrary, but something like a filename. Can * be <code>null</code> for none. */ public void load(Reader r, String id) { try { // could throw an IO exception? TreeReader tr = treeReaderFactory().newTreeReader(r); int sentIndex = 0; for (Tree pt; (pt = tr.readTree()) != null; ) { if (pt.label() instanceof HasIndex) { // so we can trace where this tree came from HasIndex hi = (HasIndex) pt.label(); if (id != null) { hi.setDocID(id); } hi.setSentIndex(sentIndex); } parseTrees.add(pt); sentIndex++; } } catch (IOException e) { System.err.println("load IO Exception: " + e); } }
/** * Load a collection of parse trees from the file of given name. Each tree may optionally be * encased in parens to allow for Penn Treebank style trees. This methods implements the <code> * FileProcessor</code> interface. * * @param file file to load a tree from */ public void processFile(File file) { TreeReader tr = null; // SRL stuff CollectionValuedMap<Integer, String> srlMap = null; if (this.srlMap != null) { // there must be a better way ... String filename = file.getAbsolutePath(); for (String suffix : this.srlMap.keySet()) { if (filename.endsWith(suffix)) { srlMap = this.srlMap.get(suffix); break; } } if (srlMap == null) { System.err.println("could not find SRL entries for file: " + file); } } try { // maybe print file name to stdout to get some feedback if (PRINT_FILENAMES) { System.err.println(file); } // could throw an IO exception if can't open for reading tr = treeReaderFactory() .newTreeReader( new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding()))); int sentIndex = 0; Tree pt; while ((pt = tr.readTree()) != null) { if (pt.label() instanceof HasIndex) { // so we can trace where this tree came from HasIndex hi = (HasIndex) pt.label(); hi.setDocID(file.getName()); hi.setSentIndex(sentIndex); } if (srlMap == null) { parseTrees.add(pt); } else { Collection<String> srls = srlMap.get(sentIndex); // pt.pennPrint(); // System.err.println(srls); parseTrees.add(pt); if (srls.isEmpty()) { // parseTrees.add(pt); } else { for (String srl : srls) { // Tree t = pt.deepCopy(); String[] bits = srl.split("\\s+"); int verbIndex = Integer.parseInt(bits[0]); String lemma = bits[2].split("\\.")[0]; // Tree verb = Trees.getTerminal(t, verbIndex); Tree verb = Trees.getTerminal(pt, verbIndex); // ((CoreLabel)verb.label()).set(SRLIDAnnotation.class, SRL_ID.REL); ((CoreLabel) verb.label()).set(CoreAnnotations.CoNLLPredicateAnnotation.class, true); for (int i = 4; i < bits.length; i++) { String arg = bits[i]; String[] bits1; if (arg.indexOf("ARGM") >= 
0) { bits1 = arg.split("-"); } else { bits1 = arg.split("-"); } String locs = bits1[0]; String argType = bits1[1]; if (argType.equals("rel")) { continue; } for (String loc : locs.split("[*,]")) { bits1 = loc.split(":"); int term = Integer.parseInt(bits1[0]); int height = Integer.parseInt(bits1[1]); // Tree t1 = Trees.getPreTerminal(t, term); Tree t1 = Trees.getPreTerminal(pt, term); for (int j = 0; j < height; j++) { // t1 = t1.parent(t); t1 = t1.parent(pt); } Map<Integer, String> roleMap = ((CoreLabel) t1.label()).get(CoreAnnotations.CoNLLSRLAnnotation.class); if (roleMap == null) { roleMap = new HashMap<Integer, String>(); ((CoreLabel) t1.label()).set(CoreAnnotations.CoNLLSRLAnnotation.class, roleMap); } roleMap.put(verbIndex, argType); // ((CoreLabel)t1.label()).set(SRLIDAnnotation.class, // SRL_ID.ARG); } } // for (Tree t1 : t) { // if (t1.isLeaf()) { continue; } // CoreLabel fl = (CoreLabel)t1.label(); // if (fl.value() == null) { continue; } // if (!fl.has(SRLIDAnnotation.class)) { // boolean allNone = true; // for (Tree t2 : t1) { // SRL_ID s = ((CoreLabel)t2.label()).get(SRLIDAnnotation.class); // if (s == SRL_ID.ARG || s == SRL_ID.REL) { // allNone = false; // break; // } // } // if (allNone) { // fl.set(SRLIDAnnotation.class, SRL_ID.ALL_NO); // } else { // fl.set(SRLIDAnnotation.class, SRL_ID.NO); // } // } // } // parseTrees.add(t); } } } sentIndex++; } } catch (IOException e) { System.err.println("loadTree IO Exception: " + e + " in file " + file); } finally { try { if (tr != null) { tr.close(); // important: closes file even if error! } } catch (IOException e) { // do nothin' } } }
/** * Prints out all matches of a tree pattern on each tree in the path. Usage: <br> * <br> * <code> * java edu.stanford.nlp.trees.tregex.TregexPattern [[-TCwfosnu] [-filter] [-h <node-name>]]* pattern * filepath </code> * * <p>Arguments:<br> * * <ul> * <li><code>pattern</code>: the tree pattern which optionally names some set of nodes (i.e., * gives it the "handle") <code>=name</code> (for some arbitrary string "name") * <li><code>filepath</code>: the path to files with trees. If this is a directory, there will * be recursive descent and the pattern will be run on all files beneath the specified * directory. * </ul> * * <p>Options:<br> * <li><code>-C</code> suppresses printing of matches, so only the number of matches is printed. * <li><code>-w</code> causes the whole of a tree that matches to be printed. * <li><code>-f</code> causes the filename to be printed. * <li><code>-i <filename></code> causes the pattern to be matched to be read from <code> * <filename></code> rather than the command line. Don't specify a pattern when this * option is used. * <li><code>-o</code> Specifies that each tree node can be reported only once as the root of a * match (by default a node will be printed once for every <em>way</em> the pattern matches). * <li><code>-s</code> causes trees to be printed all on one line (by default they are pretty * printed). * <li><code>-n</code> causes the number of the tree in which the match was found to be printed * before every match. * <li><code>-u</code> causes only the label of each matching node to be printed, not complete * subtrees. * <li><code>-t</code> causes only the yield (terminal words) of the selected node to be printed * (or the yield of the whole tree, if the <code>-w</code> option is used). * <li><code>-encoding <charset_encoding></code> option allows specification of character * encoding of trees.. * <li><code>-h <node-handle></code> If a <code>-h</code> option is given, the root tree * node will not be printed. 
Instead, for each <code>node-handle</code> specified, the node * matched and given that handle will be printed. Multiple nodes can be printed by using the * <code>-h</code> option multiple times on a single command line. * <li><code>-hf <headfinder-class-name></code> use the specified {@link HeadFinder} class * to determine headship relations. * <li><code>-hfArg <string></code> pass a string argument in to the {@link HeadFinder} * class's constructor. <code>-hfArg</code> can be used multiple times to pass in multiple * arguments. * <li><code>-trf <TreeReaderFactory-class-name></code> use the specified {@link * TreeReaderFactory} class to read trees from files. * <li><code>-v</code> print every tree that contains no matches of the specified pattern, but * print no matches to the pattern. * <li><code>-x</code> Instead of the matched subtree, print the matched subtree's identifying * number as defined in <tt>tgrep2</tt>:a unique identifier for the subtree and is in the form * s:n, where s is an integer specifying the sentence number in the corpus (starting with 1), * and n is an integer giving the order in which the node is encountered in a depth-first * search starting with 1 at top node in the sentence tree. * <li><code>-extract <code> <tree-file></code> extracts the subtree s:n specified by * <tt>code</tt> from the specified <tt>tree-file</tt>. Overrides all other behavior of * tregex. Can't specify multiple encodings etc. yet. * <li><code>-extractFile <code-file> <tree-file></code> extracts every subtree * specified by the subtree codes in <tt>code-file</tt>, which must appear exactly one per * line, from the specified <tt>tree-file</tt>. Overrides all other behavior of tregex. Can't * specify multiple encodings etc. yet. * <li><code>-filter</code> causes this to act as a filter, reading tree input from stdin * <li><code>-T</code> causes all trees to be printed as processed (for debugging purposes). * Otherwise only matching nodes are printed. 
* <li><code>-macros <filename></code> filename with macro substitutions to use. file with * tab separated lines original-tab-replacement * </ul> */ public static void main(String[] args) throws IOException { Timing.startTime(); StringBuilder treePrintFormats = new StringBuilder(); String printNonMatchingTreesOption = "-v"; String subtreeCodeOption = "-x"; String extractSubtreesOption = "-extract"; String extractSubtreesFileOption = "-extractFile"; String inputFileOption = "-i"; String headFinderOption = "-hf"; String headFinderArgOption = "-hfArg"; String trfOption = "-trf"; String headFinderClassName = null; String[] headFinderArgs = StringUtils.EMPTY_STRING_ARRAY; String treeReaderFactoryClassName = null; String printHandleOption = "-h"; String markHandleOption = "-k"; String encodingOption = "-encoding"; String encoding = "UTF-8"; String macroOption = "-macros"; String macroFilename = ""; String yieldOnly = "-t"; String printAllTrees = "-T"; String quietMode = "-C"; String wholeTreeMode = "-w"; String filenameOption = "-f"; String oneMatchPerRootNodeMode = "-o"; String reportTreeNumbers = "-n"; String rootLabelOnly = "-u"; String oneLine = "-s"; Map<String, Integer> flagMap = Generics.newHashMap(); flagMap.put(extractSubtreesOption, 2); flagMap.put(extractSubtreesFileOption, 2); flagMap.put(subtreeCodeOption, 0); flagMap.put(printNonMatchingTreesOption, 0); flagMap.put(encodingOption, 1); flagMap.put(inputFileOption, 1); flagMap.put(printHandleOption, 1); flagMap.put(markHandleOption, 2); flagMap.put(headFinderOption, 1); flagMap.put(headFinderArgOption, 1); flagMap.put(trfOption, 1); flagMap.put(macroOption, 1); flagMap.put(yieldOnly, 0); flagMap.put(quietMode, 0); flagMap.put(wholeTreeMode, 0); flagMap.put(printAllTrees, 0); flagMap.put(filenameOption, 0); flagMap.put(oneMatchPerRootNodeMode, 0); flagMap.put(reportTreeNumbers, 0); flagMap.put(rootLabelOnly, 0); flagMap.put(oneLine, 0); Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap); args 
= argsMap.get(null); if (argsMap.containsKey(encodingOption)) { encoding = argsMap.get(encodingOption)[0]; System.err.println("Encoding set to " + encoding); } PrintWriter errPW = new PrintWriter(new OutputStreamWriter(System.err, encoding), true); if (argsMap.containsKey(extractSubtreesOption)) { List<String> subTreeStrings = Collections.singletonList(argsMap.get(extractSubtreesOption)[0]); extractSubtrees(subTreeStrings, argsMap.get(extractSubtreesOption)[1]); return; } if (argsMap.containsKey(extractSubtreesFileOption)) { List<String> subTreeStrings = Arrays.asList( IOUtils.slurpFile(argsMap.get(extractSubtreesFileOption)[0]).split("\n|\r|\n\r")); extractSubtrees(subTreeStrings, argsMap.get(extractSubtreesFileOption)[0]); return; } if (args.length < 1) { errPW.println( "Usage: java edu.stanford.nlp.trees.tregex.TregexPattern [-T] [-C] [-w] [-f] [-o] [-n] [-s] [-filter] [-hf class] [-trf class] [-h handle]* pattern [filepath]"); return; } String matchString = args[0]; if (argsMap.containsKey(macroOption)) { macroFilename = argsMap.get(macroOption)[0]; } if (argsMap.containsKey(headFinderOption)) { headFinderClassName = argsMap.get(headFinderOption)[0]; errPW.println("Using head finder " + headFinderClassName + "..."); } if (argsMap.containsKey(headFinderArgOption)) { headFinderArgs = argsMap.get(headFinderArgOption); } if (argsMap.containsKey(trfOption)) { treeReaderFactoryClassName = argsMap.get(trfOption)[0]; errPW.println("Using tree reader factory " + treeReaderFactoryClassName + "..."); } if (argsMap.containsKey(printAllTrees)) { TRegexTreeVisitor.printTree = true; } if (argsMap.containsKey(inputFileOption)) { String inputFile = argsMap.get(inputFileOption)[0]; matchString = IOUtils.slurpFile(inputFile, encoding); String[] newArgs = new String[args.length + 1]; System.arraycopy(args, 0, newArgs, 1, args.length); args = newArgs; } if (argsMap.containsKey(quietMode)) { TRegexTreeVisitor.printMatches = false; TRegexTreeVisitor.printNumMatchesToStdOut = true; } 
if (argsMap.containsKey(printNonMatchingTreesOption)) { TRegexTreeVisitor.printNonMatchingTrees = true; } if (argsMap.containsKey(subtreeCodeOption)) { TRegexTreeVisitor.printSubtreeCode = true; TRegexTreeVisitor.printMatches = false; } if (argsMap.containsKey(wholeTreeMode)) { TRegexTreeVisitor.printWholeTree = true; } if (argsMap.containsKey(filenameOption)) { TRegexTreeVisitor.printFilename = true; } if (argsMap.containsKey(oneMatchPerRootNodeMode)) TRegexTreeVisitor.oneMatchPerRootNode = true; if (argsMap.containsKey(reportTreeNumbers)) TRegexTreeVisitor.reportTreeNumbers = true; if (argsMap.containsKey(rootLabelOnly)) { treePrintFormats.append(TreePrint.rootLabelOnlyFormat).append(','); } else if (argsMap.containsKey(oneLine)) { // display short form treePrintFormats.append("oneline,"); } else if (argsMap.containsKey(yieldOnly)) { treePrintFormats.append("words,"); } else { treePrintFormats.append("penn,"); } HeadFinder hf = new CollinsHeadFinder(); if (headFinderClassName != null) { Class[] hfArgClasses = new Class[headFinderArgs.length]; for (int i = 0; i < hfArgClasses.length; i++) hfArgClasses[i] = String.class; try { hf = (HeadFinder) Class.forName(headFinderClassName) .getConstructor(hfArgClasses) .newInstance( (Object[]) headFinderArgs); // cast to Object[] necessary to avoid varargs-related // warning. 
} catch (Exception e) { throw new RuntimeException("Error occurred while constructing HeadFinder: " + e); } } TRegexTreeVisitor.tp = new TreePrint(treePrintFormats.toString(), new PennTreebankLanguagePack()); try { // TreePattern p = TreePattern.compile("/^S/ > S=dt $++ '' $-- ``"); TregexPatternCompiler tpc = new TregexPatternCompiler(hf); Macros.addAllMacros(tpc, macroFilename, encoding); TregexPattern p = tpc.compile(matchString); errPW.println("Pattern string:\n" + p.pattern()); errPW.println("Parsed representation:"); p.prettyPrint(errPW); String[] handles = argsMap.get(printHandleOption); if (argsMap.containsKey("-filter")) { TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName); treebank = new MemoryTreebank( trf, encoding); // has to be in memory since we're not storing it on disk // read from stdin Reader reader = new BufferedReader(new InputStreamReader(System.in, encoding)); ((MemoryTreebank) treebank).load(reader); reader.close(); } else if (args.length == 1) { errPW.println("using default tree"); TreeReader r = new PennTreeReader( new StringReader( "(VP (VP (VBZ Try) (NP (NP (DT this) (NN wine)) (CC and) (NP (DT these) (NNS snails)))) (PUNCT .))"), new LabeledScoredTreeFactory(new StringLabelFactory())); Tree t = r.readTree(); treebank = new MemoryTreebank(); treebank.add(t); } else { int last = args.length - 1; errPW.println("Reading trees from file(s) " + args[last]); TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName); treebank = new DiskTreebank(trf, encoding); treebank.loadPath(args[last], null, true); } TRegexTreeVisitor vis = new TRegexTreeVisitor(p, handles, encoding); treebank.apply(vis); Timing.endTime(); if (TRegexTreeVisitor.printMatches) { errPW.println("There were " + vis.numMatches() + " matches in total."); } if (TRegexTreeVisitor.printNumMatchesToStdOut) { System.out.println(vis.numMatches()); } } catch (IOException e) { e.printStackTrace(); } catch (TregexParseException e) { errPW.println("Error 
parsing expression: " + args[0]); errPW.println("Parse exception: " + e.toString()); } }