예제 #1
0
  public ParseEssay() {
    System.setProperty("wordnet.database.dir", "../war/dict");
    synonyms = new ArrayList<String>();
    database = WordNetDatabase.getFileInstance();
    baos = new ByteArrayOutputStream();
    lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");

    // ??
  }
예제 #2
0
  /**
   * Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile file-with-trees [-po
   * matching-pattern operation] operation-file-1 operation-file-2 ... operation-file-n
   *
   * <h4>Arguments:</h4>
   *
   * Each argument should be the name of a transformation file that contains a list of pattern and
   * transformation operation list pairs. That is, it is a sequence of pairs of a {@link
   * TregexPattern} pattern on one or more lines, then a blank line (empty or whitespace), then a
   * list of transformation operations one per line (as specified by <b>Legal operation syntax</b>
   * below) to apply when the pattern is matched, and then another blank line (empty or whitespace).
   * Note the need for blank lines: The code crashes if they are not present as separators (although
   * the blank line at the end of the file can be omitted). The script file can include comment
   * lines, either whole comment lines or trailing comments introduced by %, which extend to the end
   * of line. A needed percent mark can be escaped by a preceding backslash.
   *
   * <p>For example, if you want to excise an SBARQ node whenever it is the parent of an SQ node,
   * and relabel the SQ node to S, your transformation file would look like this:
   *
   * <blockquote>
   *
   * <code>
   *    SBARQ=n1 &lt; SQ=n2<br>
   *    <br>
   *    excise n1 n1<br>
   *    relabel n2 S
   * </code>
   *
   * </blockquote>
   *
   * <p>
   *
   * <h4>Options:</h4>
   *
   * <ul>
   *   <li><code>-treeFile &#60;filename&#62;</code> specify the name of the file that has the trees
   *       you want to transform.
   *   <li><code>-po &#60;matchPattern&#62; &#60;operation&#62;</code> Apply a single operation to
   *       every tree using the specified match pattern and the specified operation. Use this option
   *       when you want to quickly try the effect of one pattern/surgery combination, and are too
   *       lazy to write a transformation file.
   *   <li><code>-s</code> Print each output tree on one line (default is pretty-printing).
   *   <li><code>-m</code> For every tree that had a matching pattern, print "before" (prepended as
   *       "Operated on:") and "after" (prepended as "Result:"). Unoperated trees just pass through
   *       the transducer as usual.
   *   <li><code>-encoding X</code> Uses character set X for input and output of trees.
   *   <li><code>-macros &#60;filename&#62;</code> A file of macros to use on the tregex pattern.
   *       Macros should be one per line, with original and replacement separated by tabs.
   *   <li><code>-hf &lt;headfinder-class-name&gt;</code> use the specified {@link HeadFinder} class
   *       to determine headship relations.
   *   <li><code>-hfArg &lt;string&gt;</code> pass a string argument in to the {@link HeadFinder}
   *       class's constructor. <code>-hfArg</code> can be used multiple times to pass in multiple
   *       arguments.
   *   <li><code>-trf &lt;TreeReaderFactory-class-name&gt;</code> use the specified {@link
   *       TreeReaderFactory} class to read trees from files.
   * </ul>
   *
   * <h4>Legal operation syntax:</h4>
   *
   * <ul>
   *   <li><code>delete &#60;name&#62;</code> deletes the node and everything below it.
   *   <li><code>prune &#60;name&#62;</code> Like delete, but if, after the pruning, the parent has
   *       no children anymore, the parent is pruned too. Pruning continues to affect all ancestors
   *       until one is found with remaining children. This may result in a null tree.
   *   <li><code>excise &#60;name1&#62; &#60;name2&#62;</code> The name1 node should either dominate
   *       or be the same as the name2 node. This excises out everything from name1 to name2. All
   *       the children of name2 go into the parent of name1, where name1 was.
   *   <li><code>relabel &#60;name&#62; &#60;new-label&#62;</code> Relabels the node to have the new
   *       label. <br>
   *       There are three possible forms: <br>
   *       <code>relabel nodeX VP</code> - for changing a node label to an alphanumeric string <br>
   *       <code>relabel nodeX /''/</code> - for relabeling a node to something that isn't a valid
   *       identifier without quoting <br>
   *       <code>relabel nodeX /^VB(.*)$/verb\\/$1/</code> - for regular expression based
   *       relabeling. In this case, all matches of the regular expression against the node label
   *       are replaced with the replacement String. This has the semantics of Java/Perl's
   *       replaceAll: you may use capturing groups and put them in replacements with $n. For
   *       example, if the pattern is /foo/bar/ and the node matched is "foo", the replaceAll
   *       semantics result in "barbar". If the pattern is /^foo(.*)$/bar$1/ and node matched is
   *       "foofoo", relabel will result in "barfoo". <br>
   *       When using the regex replacement method, you can also use the sequences ={node} and
   *       %{var} in the replacement string to use captured nodes or variable strings in the
   *       replacement string. For example, if the Tregex pattern was "duck=bar" and the relabel is
   *       /foo/={bar}/, "foofoo" will be replaced with "duckduck". <br>
   *       To concatenate two nodes named in the tregex pattern, for example, you can use the
   *       pattern /^.*$/={foo}={bar}/. Note that the ^.*$ is necessary to make sure the regex
   *       pattern only matches and replaces once on the entire node name. <br>
   *       To get an "=" or a "%" in the replacement, using \ escaping. Also, as in the example you
   *       can escape a slash in the middle of the second and third forms with \\/ and \\\\. <br>
   *   <li><code>insert &#60;name&#62; &#60;position&#62;</code> or <code>
   *       insert &lt;tree&gt; &#60;position&#62;</code> inserts the named node or tree into the
   *       position specified.
   *   <li><code>move &#60;name&#62; &#60;position&#62;</code> moves the named node into the
   *       specified position.
   *       <p>Right now the only ways to specify position are:
   *       <p><code>$+ &#60;name&#62;</code> the left sister of the named node<br>
   *       <code>$- &#60;name&#62;</code> the right sister of the named node<br>
   *       <code>&gt;i &#60;name&#62;</code> the i_th daughter of the named node<br>
   *       <code>&gt;-i &#60;name&#62;</code> the i_th daughter, counting from the right, of the
   *       named node.
   *   <li><code>replace &#60;name1&#62; &#60;name2&#62;</code> deletes name1 and inserts a copy of
   *       name2 in its place.
   *   <li><code>replace &#60;name&#62; &#60;tree&#62; &#60;tree2&#62;...</code> deletes name and
   *       inserts the new tree(s) in its place. If more than one replacement tree is given, each of
   *       the new subtrees will be added in order where the old tree was. Multiple subtrees at the
   *       root is an illegal operation and will throw an exception.
   *   <li>{@code createSubtree <new-label> <name1> [<name2>]} Create a subtree out of all the nodes
   *       from {@code <name1>} through {@code <name2>} and puts the new subtree where that span
   *       used to be. To limit the operation to just one node, elide {@code <name2>}.
   *   <li><code>adjoin &#60;auxiliary_tree&#62; &lt;name&gt;</code> Adjoins the specified auxiliary
   *       tree into the named node. The daughters of the target node will become the daughters of
   *       the foot of the auxiliary tree.
   *   <li><code>adjoinH &#60;auxiliary_tree&#62; &lt;name&gt;</code> Similar to adjoin, but
   *       preserves the target node and makes it the root of &lt;tree&gt;. (It is still accessible
   *       as <code>name</code>. The root of the auxiliary tree is ignored.)
   *   <li><code>adjoinF &#60;auxiliary_tree&#62; &lt;name&gt;</code> Similar to adjoin, but
   *       preserves the target node and makes it the foot of &lt;tree&gt;. (It is still accessible
   *       as <code>name</code>, and retains its status as parent of its children. The root of the
   *       auxiliary tree is ignored.)
   *   <li>
   *   <dt><code>coindex &#60;name1&#62; &#60;name2&#62; ... &#60;nameM&#62; </code> Puts a (Penn
   *       Treebank style) coindexation suffix of the form "-N" on each of nodes name_1 through
   *       name_m. The value of N will be automatically generated in reference to the existing
   *       coindexations in the tree, so that there is never an accidental clash of indices across
   *       things that are not meant to be coindexed.
   * </ul>
   *
   * <p>In the context of <code>adjoin</code>, <code>adjoinH</code>, and <code>adjoinF</code>, an
   * auxiliary tree is a tree in Penn Treebank format with <code>@</code> on exactly one of the
   * leaves denoting the foot of the tree. The operations which use the foot use the labeled node.
   * For example: <br>
   * Tsurgeon: <code>adjoin (FOO (BAR@)) foo</code> <br>
   * Tregex: <code>B=foo</code> <br>
   * Input: <code>(A (B 1 2))</code> Output: <code>(A (FOO (BAR 1 2)))</code>
   *
   * <p>Tsurgeon applies the same operation to the same tree for as long as the given tregex
   * operation matches. This means that infinite loops are very easy to cause. One common situation
   * where this comes up is with an insert operation will repeats infinitely many times unless you
   * add an expression to the tregex that matches against the inserted pattern. For example, this
   * pattern will infinite loop:
   *
   * <blockquote>
   *
   * <code>
   *   TregexPattern tregex = TregexPattern.compile("S=node &lt;&lt; NP"); <br>
   *   TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) &gt;-1 node");
   * </code>
   *
   * </blockquote>
   *
   * This pattern, though, will terminate:
   *
   * <blockquote>
   *
   * <code>
   *   TregexPattern tregex = TregexPattern.compile("S=node &lt;&lt; NP !&lt;&lt; foo"); <br>
   *   TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) &gt;-1 node");
   * </code>
   *
   * </blockquote>
   *
   * <p>Tsurgeon has (very) limited support for conditional statements. If a pattern is prefaced
   * with <code>if exists &lt;name&gt;</code>, the rest of the pattern will only execute if the
   * named node was found in the corresponding TregexMatcher.
   *
   * @param args a list of names of files each of which contains a single tregex matching pattern
   *     plus a list, one per line, of transformation operations to apply to the matched pattern.
   * @throws Exception If an I/O or pattern syntax error
   */
  public static void main(String[] args) throws Exception {
    String headFinderClassName = null;
    String headFinderOption = "-hf";
    String[] headFinderArgs = null;
    String headFinderArgOption = "-hfArg";
    String encoding = "UTF-8";
    String encodingOption = "-encoding";
    if (args.length == 0) {
      System.err.println(
          "Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile <file-with-trees> [-po <matching-pattern> <operation>] <operation-file-1> <operation-file-2> ... <operation-file-n>");
      System.exit(0);
    }
    String treePrintFormats;
    String singleLineOption = "-s";
    String verboseOption = "-v";
    String matchedOption =
        "-m"; // if set, then print original form of trees that are matched & thus operated on
    String patternOperationOption = "-po";
    String treeFileOption = "-treeFile";
    String trfOption = "-trf";
    String macroOption = "-macros";
    String macroFilename = "";
    Map<String, Integer> flagMap = Generics.newHashMap();
    flagMap.put(patternOperationOption, 2);
    flagMap.put(treeFileOption, 1);
    flagMap.put(trfOption, 1);
    flagMap.put(singleLineOption, 0);
    flagMap.put(encodingOption, 1);
    flagMap.put(headFinderOption, 1);
    flagMap.put(macroOption, 1);
    Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap);
    args = argsMap.get(null);

    if (argsMap.containsKey(headFinderOption))
      headFinderClassName = argsMap.get(headFinderOption)[0];
    if (argsMap.containsKey(headFinderArgOption)) headFinderArgs = argsMap.get(headFinderArgOption);
    if (argsMap.containsKey(verboseOption)) verbose = true;
    if (argsMap.containsKey(singleLineOption)) treePrintFormats = "oneline,";
    else treePrintFormats = "penn,";
    if (argsMap.containsKey(encodingOption)) encoding = argsMap.get(encodingOption)[0];
    if (argsMap.containsKey(macroOption)) macroFilename = argsMap.get(macroOption)[0];

    TreePrint tp = new TreePrint(treePrintFormats, new PennTreebankLanguagePack());
    PrintWriter pwOut = new PrintWriter(new OutputStreamWriter(System.out, encoding), true);

    TreeReaderFactory trf;
    if (argsMap.containsKey(trfOption)) {
      String trfClass = argsMap.get(trfOption)[0];
      trf = ReflectionLoading.loadByReflection(trfClass);
    } else {
      trf = new TregexPattern.TRegexTreeReaderFactory();
    }

    Treebank trees = new DiskTreebank(trf, encoding);
    if (argsMap.containsKey(treeFileOption)) {
      trees.loadPath(argsMap.get(treeFileOption)[0]);
    }
    List<Pair<TregexPattern, TsurgeonPattern>> ops =
        new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();

    TregexPatternCompiler compiler;
    if (headFinderClassName == null) {
      compiler = new TregexPatternCompiler();
    } else {
      HeadFinder hf;
      if (headFinderArgs == null) {
        hf = ReflectionLoading.loadByReflection(headFinderClassName);
      } else {
        hf = ReflectionLoading.loadByReflection(headFinderClassName, (Object[]) headFinderArgs);
      }
      compiler = new TregexPatternCompiler(hf);
    }
    Macros.addAllMacros(compiler, macroFilename, encoding);
    if (argsMap.containsKey(patternOperationOption)) {
      TregexPattern matchPattern = compiler.compile(argsMap.get(patternOperationOption)[0]);
      TsurgeonPattern p = parseOperation(argsMap.get(patternOperationOption)[1]);
      ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p));
    } else {
      for (String arg : args) {
        List<Pair<TregexPattern, TsurgeonPattern>> pairs =
            getOperationsFromFile(arg, encoding, compiler);
        for (Pair<TregexPattern, TsurgeonPattern> pair : pairs) {
          if (verbose) {
            System.err.println(pair.second());
          }
          ops.add(pair);
        }
      }
    }

    for (Tree t : trees) {
      Tree original = t.deepCopy();
      Tree result = processPatternsOnTree(ops, t);
      if (argsMap.containsKey(matchedOption) && matchedOnTree) {
        pwOut.println("Operated on: ");
        displayTree(original, tp, pwOut);
        pwOut.println("Result: ");
      }
      displayTree(result, tp, pwOut);
    }
  }
예제 #3
0
  public static void main(String[] args) {
    Options op = new Options(new EnglishTreebankParserParams());
    // op.tlpParams may be changed to something else later, so don't use it till
    // after options are parsed.

    System.out.println("Currently " + new Date());
    System.out.print("Invoked with arguments:");
    for (String arg : args) {
      System.out.print(" " + arg);
    }
    System.out.println();

    String path = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj";
    int trainLow = 200, trainHigh = 2199, testLow = 2200, testHigh = 2219;
    String serializeFile = null;

    int i = 0;
    while (i < args.length && args[i].startsWith("-")) {
      if (args[i].equalsIgnoreCase("-path") && (i + 1 < args.length)) {
        path = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-train") && (i + 2 < args.length)) {
        trainLow = Integer.parseInt(args[i + 1]);
        trainHigh = Integer.parseInt(args[i + 2]);
        i += 3;
      } else if (args[i].equalsIgnoreCase("-test") && (i + 2 < args.length)) {
        testLow = Integer.parseInt(args[i + 1]);
        testHigh = Integer.parseInt(args[i + 2]);
        i += 3;
      } else if (args[i].equalsIgnoreCase("-serialize") && (i + 1 < args.length)) {
        serializeFile = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-tLPP") && (i + 1 < args.length)) {
        try {
          op.tlpParams = (TreebankLangParserParams) Class.forName(args[i + 1]).newInstance();
        } catch (ClassNotFoundException e) {
          System.err.println("Class not found: " + args[i + 1]);
        } catch (InstantiationException e) {
          System.err.println("Couldn't instantiate: " + args[i + 1] + ": " + e.toString());
        } catch (IllegalAccessException e) {
          System.err.println("illegal access" + e);
        }
        i += 2;
      } else if (args[i].equals("-encoding")) {
        // sets encoding for TreebankLangParserParams
        op.tlpParams.setInputEncoding(args[i + 1]);
        op.tlpParams.setOutputEncoding(args[i + 1]);
        i += 2;
      } else {
        i = op.setOptionOrWarn(args, i);
      }
    }
    // System.out.println(tlpParams.getClass());
    TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();

    Train.sisterSplitters = new HashSet(Arrays.asList(op.tlpParams.sisterSplitters()));
    //    BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams);
    PrintWriter pw = op.tlpParams.pw();

    Test.display();
    Train.display();
    op.display();
    op.tlpParams.display();

    // setup tree transforms
    Treebank trainTreebank = op.tlpParams.memoryTreebank();
    MemoryTreebank testTreebank = op.tlpParams.testMemoryTreebank();
    // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank();
    // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/";
    // blippTreebank.loadPath(blippPath, "", true);

    Timing.startTime();
    System.err.print("Reading trees...");
    testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
    if (Test.increasingLength) {
      Collections.sort(testTreebank, new TreeLengthComparator());
    }

    trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true));
    Timing.tick("done.");
    System.err.print("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer = null;
    if (!Train.leftToRight) {
      binarizer =
          new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !Train.outsideFactor(), true);
    } else {
      binarizer =
          new TreeAnnotatorAndBinarizer(
              op.tlpParams.headFinder(),
              new LeftHeadFinder(),
              op.tlpParams,
              op.forceCNF,
              !Train.outsideFactor(),
              true);
    }
    CollinsPuncTransformer collinsPuncTransformer = null;
    if (Train.collinsPunc) {
      collinsPuncTransformer = new CollinsPuncTransformer(tlp);
    }
    TreeTransformer debinarizer = new Debinarizer(op.forceCNF);
    List<Tree> binaryTrainTrees = new ArrayList<Tree>();

    if (Train.selectiveSplit) {
      Train.splitters =
          ParentAnnotationStats.getSplitCategories(
              trainTreebank,
              Train.tagSelectiveSplit,
              0,
              Train.selectiveSplitCutOff,
              Train.tagSelectiveSplitCutOff,
              op.tlpParams.treebankLanguagePack());
      if (Train.deleteSplitters != null) {
        List<String> deleted = new ArrayList<String>();
        for (String del : Train.deleteSplitters) {
          String baseDel = tlp.basicCategory(del);
          boolean checkBasic = del.equals(baseDel);
          for (Iterator<String> it = Train.splitters.iterator(); it.hasNext(); ) {
            String elem = it.next();
            String baseElem = tlp.basicCategory(elem);
            boolean delStr = checkBasic && baseElem.equals(baseDel) || elem.equals(del);
            if (delStr) {
              it.remove();
              deleted.add(elem);
            }
          }
        }
        System.err.println("Removed from vertical splitters: " + deleted);
      }
    }
    if (Train.selectivePostSplit) {
      TreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams);
      Treebank annotatedTB = trainTreebank.transform(myTransformer);
      Train.postSplitters =
          ParentAnnotationStats.getSplitCategories(
              annotatedTB,
              true,
              0,
              Train.selectivePostSplitCutOff,
              Train.tagSelectivePostSplitCutOff,
              op.tlpParams.treebankLanguagePack());
    }

    if (Train.hSelSplit) {
      binarizer.setDoSelectiveSplit(false);
      for (Tree tree : trainTreebank) {
        if (Train.collinsPunc) {
          tree = collinsPuncTransformer.transformTree(tree);
        }
        // tree.pennPrint(tlpParams.pw());
        tree = binarizer.transformTree(tree);
        // binaryTrainTrees.add(tree);
      }
      binarizer.setDoSelectiveSplit(true);
    }
    for (Tree tree : trainTreebank) {
      if (Train.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTrainTrees.add(tree);
    }
    if (Test.verbose) {
      binarizer.dumpStats();
    }

    List<Tree> binaryTestTrees = new ArrayList<Tree>();
    for (Tree tree : testTreebank) {
      if (Train.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTestTrees.add(tree);
    }
    Timing.tick("done."); // binarization
    BinaryGrammar bg = null;
    UnaryGrammar ug = null;
    DependencyGrammar dg = null;
    // DependencyGrammar dgBLIPP = null;
    Lexicon lex = null;
    // extract grammars
    Extractor bgExtractor = new BinaryGrammarExtractor();
    // Extractor bgExtractor = new SmoothedBinaryGrammarExtractor();//new BinaryGrammarExtractor();
    // Extractor lexExtractor = new LexiconExtractor();

    // Extractor dgExtractor = new DependencyMemGrammarExtractor();

    Extractor dgExtractor = new MLEDependencyGrammarExtractor(op);
    if (op.doPCFG) {
      System.err.print("Extracting PCFG...");
      Pair bgug = null;
      if (Train.cheatPCFG) {
        List allTrees = new ArrayList(binaryTrainTrees);
        allTrees.addAll(binaryTestTrees);
        bgug = (Pair) bgExtractor.extract(allTrees);
      } else {
        bgug = (Pair) bgExtractor.extract(binaryTrainTrees);
      }
      bg = (BinaryGrammar) bgug.second;
      bg.splitRules();
      ug = (UnaryGrammar) bgug.first;
      ug.purgeRules();
      Timing.tick("done.");
    }
    System.err.print("Extracting Lexicon...");
    lex = op.tlpParams.lex(op.lexOptions);
    lex.train(binaryTrainTrees);
    Timing.tick("done.");

    if (op.doDep) {
      System.err.print("Extracting Dependencies...");
      binaryTrainTrees.clear();
      // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new
      // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new
      // TransformTreeDependency(tlpParams,true));

      DependencyGrammar dg1 =
          (DependencyGrammar)
              dgExtractor.extract(
                  trainTreebank.iterator(), new TransformTreeDependency(op.tlpParams, true));
      // dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new
      // TransformTreeDependency(tlpParams));

      // dg = (DependencyGrammar) dgExtractor.extract(new
      // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new
      // TransformTreeDependency(tlpParams));
      // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2);
      // dg = (DependencyGrammar) dgExtractor.extract(binaryTrainTrees); //uses information whether
      // the words are known or not, discards unknown words
      Timing.tick("done.");
      // System.out.print("Extracting Unknown Word Model...");
      // UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees);
      // Timing.tick("done.");
      System.out.print("Tuning Dependency Model...");
      dg.tune(binaryTestTrees);
      // System.out.println("TUNE DEPS: "+tuneDeps);
      Timing.tick("done.");
    }

    BinaryGrammar boundBG = bg;
    UnaryGrammar boundUG = ug;

    GrammarProjection gp = new NullGrammarProjection(bg, ug);

    // serialization
    if (serializeFile != null) {
      System.err.print("Serializing parser...");
      LexicalizedParser.saveParserDataToSerialized(
          new ParserData(lex, bg, ug, dg, Numberer.getNumberers(), op), serializeFile);
      Timing.tick("done.");
    }

    // test: pcfg-parse and output

    ExhaustivePCFGParser parser = null;
    if (op.doPCFG) {
      parser = new ExhaustivePCFGParser(boundBG, boundUG, lex, op);
    }

    ExhaustiveDependencyParser dparser =
        ((op.doDep && !Test.useFastFactored) ? new ExhaustiveDependencyParser(dg, lex, op) : null);

    Scorer scorer = (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser, gp), dparser) : null);
    // Scorer scorer = parser;
    BiLexPCFGParser bparser = null;
    if (op.doPCFG && op.doDep) {
      bparser =
          (Test.useN5)
              ? new BiLexPCFGParser.N5BiLexPCFGParser(
                  scorer, parser, dparser, bg, ug, dg, lex, op, gp)
              : new BiLexPCFGParser(scorer, parser, dparser, bg, ug, dg, lex, op, gp);
    }

    LabeledConstituentEval pcfgPE = new LabeledConstituentEval("pcfg  PE", true, tlp);
    LabeledConstituentEval comboPE = new LabeledConstituentEval("combo PE", true, tlp);
    AbstractEval pcfgCB = new LabeledConstituentEval.CBEval("pcfg  CB", true, tlp);

    AbstractEval pcfgTE = new AbstractEval.TaggingEval("pcfg  TE");
    AbstractEval comboTE = new AbstractEval.TaggingEval("combo TE");
    AbstractEval pcfgTEnoPunct = new AbstractEval.TaggingEval("pcfg nopunct TE");
    AbstractEval comboTEnoPunct = new AbstractEval.TaggingEval("combo nopunct TE");
    AbstractEval depTE = new AbstractEval.TaggingEval("depnd TE");

    AbstractEval depDE =
        new AbstractEval.DependencyEval("depnd DE", true, tlp.punctuationWordAcceptFilter());
    AbstractEval comboDE =
        new AbstractEval.DependencyEval("combo DE", true, tlp.punctuationWordAcceptFilter());

    if (Test.evalb) {
      EvalB.initEVALBfiles(op.tlpParams);
    }

    // int[] countByLength = new int[Test.maxLength+1];

    // use a reflection ruse, so one can run this without needing the tagger
    // edu.stanford.nlp.process.SentenceTagger tagger = (Test.preTag ? new
    // edu.stanford.nlp.process.SentenceTagger("/u/nlp/data/tagger.params/wsj0-21.holder") : null);
    SentenceProcessor tagger = null;
    if (Test.preTag) {
      try {
        Class[] argsClass = new Class[] {String.class};
        Object[] arguments =
            new Object[] {"/u/nlp/data/pos-tagger/wsj3t0-18-bidirectional/train-wsj-0-18.holder"};
        tagger =
            (SentenceProcessor)
                Class.forName("edu.stanford.nlp.tagger.maxent.MaxentTagger")
                    .getConstructor(argsClass)
                    .newInstance(arguments);
      } catch (Exception e) {
        System.err.println(e);
        System.err.println("Warning: No pretagging of sentences will be done.");
      }
    }

    for (int tNum = 0, ttSize = testTreebank.size(); tNum < ttSize; tNum++) {
      Tree tree = testTreebank.get(tNum);
      int testTreeLen = tree.yield().size();
      if (testTreeLen > Test.maxLength) {
        continue;
      }
      Tree binaryTree = binaryTestTrees.get(tNum);
      // countByLength[testTreeLen]++;
      System.out.println("-------------------------------------");
      System.out.println("Number: " + (tNum + 1));
      System.out.println("Length: " + testTreeLen);

      // tree.pennPrint(pw);
      // System.out.println("XXXX The binary tree is");
      // binaryTree.pennPrint(pw);
      // System.out.println("Here are the tags in the lexicon:");
      // System.out.println(lex.showTags());
      // System.out.println("Here's the tagnumberer:");
      // System.out.println(Numberer.getGlobalNumberer("tags").toString());

      long timeMil1 = System.currentTimeMillis();
      Timing.tick("Starting parse.");
      if (op.doPCFG) {
        // System.err.println(Test.forceTags);
        if (Test.forceTags) {
          if (tagger != null) {
            // System.out.println("Using a tagger to set tags");
            // System.out.println("Tagged sentence as: " +
            // tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false));
            parser.parse(addLast(tagger.processSentence(cutLast(wordify(binaryTree.yield())))));
          } else {
            // System.out.println("Forcing tags to match input.");
            parser.parse(cleanTags(binaryTree.taggedYield(), tlp));
          }
        } else {
          // System.out.println("XXXX Parsing " + binaryTree.yield());
          parser.parse(binaryTree.yield());
        }
        // Timing.tick("Done with pcfg phase.");
      }
      if (op.doDep) {
        dparser.parse(binaryTree.yield());
        // Timing.tick("Done with dependency phase.");
      }
      boolean bothPassed = false;
      if (op.doPCFG && op.doDep) {
        bothPassed = bparser.parse(binaryTree.yield());
        // Timing.tick("Done with combination phase.");
      }
      long timeMil2 = System.currentTimeMillis();
      long elapsed = timeMil2 - timeMil1;
      System.err.println("Time: " + ((int) (elapsed / 100)) / 10.00 + " sec.");
      // System.out.println("PCFG Best Parse:");
      Tree tree2b = null;
      Tree tree2 = null;
      // System.out.println("Got full best parse...");
      if (op.doPCFG) {
        tree2b = parser.getBestParse();
        tree2 = debinarizer.transformTree(tree2b);
      }
      // System.out.println("Debinarized parse...");
      // tree2.pennPrint();
      // System.out.println("DepG Best Parse:");
      Tree tree3 = null;
      Tree tree3db = null;
      if (op.doDep) {
        tree3 = dparser.getBestParse();
        // was: but wrong Tree tree3db = debinarizer.transformTree(tree2);
        tree3db = debinarizer.transformTree(tree3);
        tree3.pennPrint(pw);
      }
      // tree.pennPrint();
      // ((Tree)binaryTrainTrees.get(tNum)).pennPrint();
      // System.out.println("Combo Best Parse:");
      Tree tree4 = null;
      if (op.doPCFG && op.doDep) {
        try {
          tree4 = bparser.getBestParse();
          if (tree4 == null) {
            tree4 = tree2b;
          }
        } catch (NullPointerException e) {
          System.err.println("Blocked, using PCFG parse!");
          tree4 = tree2b;
        }
      }
      if (op.doPCFG && !bothPassed) {
        tree4 = tree2b;
      }
      // tree4.pennPrint();
      if (op.doDep) {
        depDE.evaluate(tree3, binaryTree, pw);
        depTE.evaluate(tree3db, tree, pw);
      }
      TreeTransformer tc = op.tlpParams.collinizer();
      TreeTransformer tcEvalb = op.tlpParams.collinizerEvalb();
      Tree tree4b = null;
      if (op.doPCFG) {
        // System.out.println("XXXX Best PCFG was: ");
        // tree2.pennPrint();
        // System.out.println("XXXX Transformed best PCFG is: ");
        // tc.transformTree(tree2).pennPrint();
        // System.out.println("True Best Parse:");
        // tree.pennPrint();
        // tc.transformTree(tree).pennPrint();
        pcfgPE.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
        pcfgCB.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
        if (op.doDep) {
          comboDE.evaluate((bothPassed ? tree4 : tree3), binaryTree, pw);
          tree4b = tree4;
          tree4 = debinarizer.transformTree(tree4);
          if (op.nodePrune) {
            NodePruner np = new NodePruner(parser, debinarizer);
            tree4 = np.prune(tree4);
          }
          // tree4.pennPrint();
          comboPE.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
        }
        // pcfgTE.evaluate(tree2, tree);
        pcfgTE.evaluate(tcEvalb.transformTree(tree2), tcEvalb.transformTree(tree), pw);
        pcfgTEnoPunct.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);

        if (op.doDep) {
          comboTE.evaluate(tcEvalb.transformTree(tree4), tcEvalb.transformTree(tree), pw);
          comboTEnoPunct.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
        }
        System.out.println("PCFG only: " + parser.scoreBinarizedTree(tree2b, 0));

        // tc.transformTree(tree2).pennPrint();
        tree2.pennPrint(pw);

        if (op.doDep) {
          System.out.println("Combo: " + parser.scoreBinarizedTree(tree4b, 0));
          // tc.transformTree(tree4).pennPrint(pw);
          tree4.pennPrint(pw);
        }
        System.out.println("Correct:" + parser.scoreBinarizedTree(binaryTree, 0));
        /*
        if (parser.scoreBinarizedTree(tree2b,true) < parser.scoreBinarizedTree(binaryTree,true)) {
          System.out.println("SCORE INVERSION");
          parser.validateBinarizedTree(binaryTree,0);
        }
        */
        tree.pennPrint(pw);
      } // end if doPCFG

      if (Test.evalb) {
        if (op.doPCFG && op.doDep) {
          EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree4));
        } else if (op.doPCFG) {
          EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree2));
        } else if (op.doDep) {
          EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree3db));
        }
      }
    } // end for each tree in test treebank

    if (Test.evalb) {
      EvalB.closeEVALBfiles();
    }

    // Test.display();
    if (op.doPCFG) {
      pcfgPE.display(false, pw);
      System.out.println("Grammar size: " + Numberer.getGlobalNumberer("states").total());
      pcfgCB.display(false, pw);
      if (op.doDep) {
        comboPE.display(false, pw);
      }
      pcfgTE.display(false, pw);
      pcfgTEnoPunct.display(false, pw);
      if (op.doDep) {
        comboTE.display(false, pw);
        comboTEnoPunct.display(false, pw);
      }
    }
    if (op.doDep) {
      depTE.display(false, pw);
      depDE.display(false, pw);
    }
    if (op.doPCFG && op.doDep) {
      comboDE.display(false, pw);
    }
    // pcfgPE.printGoodBad();
  }
예제 #4
0
  /**
   * Prints out all matches of a tree pattern on each tree in the path. Usage: <br>
   * <br>
   * <code>
   * java edu.stanford.nlp.trees.tregex.TregexPattern [[-TCwfosnu] [-filter] [-h &lt;node-name&gt;]]* pattern
   *  filepath   </code>
   *
   * <p>Arguments:<br>
   *
   * <ul>
   *   <li><code>pattern</code>: the tree pattern which optionally names some set of nodes (i.e.,
   *       gives it the "handle") <code>=name</code> (for some arbitrary string "name")
   *   <li><code>filepath</code>: the path to files with trees. If this is a directory, there will
   *       be recursive descent and the pattern will be run on all files beneath the specified
   *       directory.
   * </ul>
   *
   * <p>Options:<br>
   * <li><code>-C</code> suppresses printing of matches, so only the number of matches is printed.
   * <li><code>-w</code> causes the whole of a tree that matches to be printed.
   * <li><code>-f</code> causes the filename to be printed.
   * <li><code>-i &lt;filename&gt;</code> causes the pattern to be matched to be read from <code>
   *     &lt;filename&gt;</code> rather than the command line. Don't specify a pattern when this
   *     option is used.
   * <li><code>-o</code> Specifies that each tree node can be reported only once as the root of a
   *     match (by default a node will be printed once for every <em>way</em> the pattern matches).
   * <li><code>-s</code> causes trees to be printed all on one line (by default they are pretty
   *     printed).
   * <li><code>-n</code> causes the number of the tree in which the match was found to be printed
   *     before every match.
   * <li><code>-u</code> causes only the label of each matching node to be printed, not complete
   *     subtrees.
   * <li><code>-t</code> causes only the yield (terminal words) of the selected node to be printed
   *     (or the yield of the whole tree, if the <code>-w</code> option is used).
   * <li><code>-encoding &lt;charset_encoding&gt;</code> option allows specification of character
   *     encoding of trees..
   * <li><code>-h &lt;node-handle&gt;</code> If a <code>-h</code> option is given, the root tree
   *     node will not be printed. Instead, for each <code>node-handle</code> specified, the node
   *     matched and given that handle will be printed. Multiple nodes can be printed by using the
   *     <code>-h</code> option multiple times on a single command line.
   * <li><code>-hf &lt;headfinder-class-name&gt;</code> use the specified {@link HeadFinder} class
   *     to determine headship relations.
   * <li><code>-hfArg &lt;string&gt;</code> pass a string argument in to the {@link HeadFinder}
   *     class's constructor. <code>-hfArg</code> can be used multiple times to pass in multiple
   *     arguments.
   * <li><code>-trf &lt;TreeReaderFactory-class-name&gt;</code> use the specified {@link
   *     TreeReaderFactory} class to read trees from files.
   * <li><code>-v</code> print every tree that contains no matches of the specified pattern, but
   *     print no matches to the pattern.
   * <li><code>-x</code> Instead of the matched subtree, print the matched subtree's identifying
   *     number as defined in <tt>tgrep2</tt>:a unique identifier for the subtree and is in the form
   *     s:n, where s is an integer specifying the sentence number in the corpus (starting with 1),
   *     and n is an integer giving the order in which the node is encountered in a depth-first
   *     search starting with 1 at top node in the sentence tree.
   * <li><code>-extract &lt;code&gt; &lt;tree-file&gt;</code> extracts the subtree s:n specified by
   *     <tt>code</tt> from the specified <tt>tree-file</tt>. Overrides all other behavior of
   *     tregex. Can't specify multiple encodings etc. yet.
   * <li><code>-extractFile &lt;code-file&gt; &lt;tree-file&gt;</code> extracts every subtree
   *     specified by the subtree codes in <tt>code-file</tt>, which must appear exactly one per
   *     line, from the specified <tt>tree-file</tt>. Overrides all other behavior of tregex. Can't
   *     specify multiple encodings etc. yet.
   * <li><code>-filter</code> causes this to act as a filter, reading tree input from stdin
   * <li><code>-T</code> causes all trees to be printed as processed (for debugging purposes).
   *     Otherwise only matching nodes are printed.
   * <li><code>-macros &lt;filename&gt;</code> filename with macro substitutions to use. file with
   *     tab separated lines original-tab-replacement
   * </ul>
   */
  public static void main(String[] args) throws IOException {
    Timing.startTime();

    StringBuilder treePrintFormats = new StringBuilder();
    String printNonMatchingTreesOption = "-v";
    String subtreeCodeOption = "-x";
    String extractSubtreesOption = "-extract";
    String extractSubtreesFileOption = "-extractFile";
    String inputFileOption = "-i";
    String headFinderOption = "-hf";
    String headFinderArgOption = "-hfArg";
    String trfOption = "-trf";
    String headFinderClassName = null;
    String[] headFinderArgs = StringUtils.EMPTY_STRING_ARRAY;
    String treeReaderFactoryClassName = null;
    String printHandleOption = "-h";
    String markHandleOption = "-k";
    String encodingOption = "-encoding";
    String encoding = "UTF-8";
    String macroOption = "-macros";
    String macroFilename = "";
    String yieldOnly = "-t";
    String printAllTrees = "-T";
    String quietMode = "-C";
    String wholeTreeMode = "-w";
    String filenameOption = "-f";
    String oneMatchPerRootNodeMode = "-o";
    String reportTreeNumbers = "-n";
    String rootLabelOnly = "-u";
    String oneLine = "-s";
    Map<String, Integer> flagMap = Generics.newHashMap();
    flagMap.put(extractSubtreesOption, 2);
    flagMap.put(extractSubtreesFileOption, 2);
    flagMap.put(subtreeCodeOption, 0);
    flagMap.put(printNonMatchingTreesOption, 0);
    flagMap.put(encodingOption, 1);
    flagMap.put(inputFileOption, 1);
    flagMap.put(printHandleOption, 1);
    flagMap.put(markHandleOption, 2);
    flagMap.put(headFinderOption, 1);
    flagMap.put(headFinderArgOption, 1);
    flagMap.put(trfOption, 1);
    flagMap.put(macroOption, 1);
    flagMap.put(yieldOnly, 0);
    flagMap.put(quietMode, 0);
    flagMap.put(wholeTreeMode, 0);
    flagMap.put(printAllTrees, 0);
    flagMap.put(filenameOption, 0);
    flagMap.put(oneMatchPerRootNodeMode, 0);
    flagMap.put(reportTreeNumbers, 0);
    flagMap.put(rootLabelOnly, 0);
    flagMap.put(oneLine, 0);
    Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap);
    args = argsMap.get(null);

    if (argsMap.containsKey(encodingOption)) {
      encoding = argsMap.get(encodingOption)[0];
      System.err.println("Encoding set to " + encoding);
    }
    PrintWriter errPW = new PrintWriter(new OutputStreamWriter(System.err, encoding), true);

    if (argsMap.containsKey(extractSubtreesOption)) {
      List<String> subTreeStrings =
          Collections.singletonList(argsMap.get(extractSubtreesOption)[0]);
      extractSubtrees(subTreeStrings, argsMap.get(extractSubtreesOption)[1]);
      return;
    }
    if (argsMap.containsKey(extractSubtreesFileOption)) {
      List<String> subTreeStrings =
          Arrays.asList(
              IOUtils.slurpFile(argsMap.get(extractSubtreesFileOption)[0]).split("\n|\r|\n\r"));
      extractSubtrees(subTreeStrings, argsMap.get(extractSubtreesFileOption)[0]);
      return;
    }

    if (args.length < 1) {
      errPW.println(
          "Usage: java edu.stanford.nlp.trees.tregex.TregexPattern [-T] [-C] [-w] [-f] [-o] [-n] [-s] [-filter]  [-hf class] [-trf class] [-h handle]* pattern [filepath]");
      return;
    }
    String matchString = args[0];

    if (argsMap.containsKey(macroOption)) {
      macroFilename = argsMap.get(macroOption)[0];
    }
    if (argsMap.containsKey(headFinderOption)) {
      headFinderClassName = argsMap.get(headFinderOption)[0];
      errPW.println("Using head finder " + headFinderClassName + "...");
    }
    if (argsMap.containsKey(headFinderArgOption)) {
      headFinderArgs = argsMap.get(headFinderArgOption);
    }
    if (argsMap.containsKey(trfOption)) {
      treeReaderFactoryClassName = argsMap.get(trfOption)[0];
      errPW.println("Using tree reader factory " + treeReaderFactoryClassName + "...");
    }
    if (argsMap.containsKey(printAllTrees)) {
      TRegexTreeVisitor.printTree = true;
    }
    if (argsMap.containsKey(inputFileOption)) {
      String inputFile = argsMap.get(inputFileOption)[0];
      matchString = IOUtils.slurpFile(inputFile, encoding);
      String[] newArgs = new String[args.length + 1];
      System.arraycopy(args, 0, newArgs, 1, args.length);
      args = newArgs;
    }
    if (argsMap.containsKey(quietMode)) {
      TRegexTreeVisitor.printMatches = false;
      TRegexTreeVisitor.printNumMatchesToStdOut = true;
    }
    if (argsMap.containsKey(printNonMatchingTreesOption)) {
      TRegexTreeVisitor.printNonMatchingTrees = true;
    }
    if (argsMap.containsKey(subtreeCodeOption)) {
      TRegexTreeVisitor.printSubtreeCode = true;
      TRegexTreeVisitor.printMatches = false;
    }
    if (argsMap.containsKey(wholeTreeMode)) {
      TRegexTreeVisitor.printWholeTree = true;
    }
    if (argsMap.containsKey(filenameOption)) {
      TRegexTreeVisitor.printFilename = true;
    }
    if (argsMap.containsKey(oneMatchPerRootNodeMode)) TRegexTreeVisitor.oneMatchPerRootNode = true;
    if (argsMap.containsKey(reportTreeNumbers)) TRegexTreeVisitor.reportTreeNumbers = true;
    if (argsMap.containsKey(rootLabelOnly)) {
      treePrintFormats.append(TreePrint.rootLabelOnlyFormat).append(',');
    } else if (argsMap.containsKey(oneLine)) { // display short form
      treePrintFormats.append("oneline,");
    } else if (argsMap.containsKey(yieldOnly)) {
      treePrintFormats.append("words,");
    } else {
      treePrintFormats.append("penn,");
    }

    HeadFinder hf = new CollinsHeadFinder();
    if (headFinderClassName != null) {
      Class[] hfArgClasses = new Class[headFinderArgs.length];
      for (int i = 0; i < hfArgClasses.length; i++) hfArgClasses[i] = String.class;
      try {
        hf =
            (HeadFinder)
                Class.forName(headFinderClassName)
                    .getConstructor(hfArgClasses)
                    .newInstance(
                        (Object[])
                            headFinderArgs); // cast to Object[] necessary to avoid varargs-related
        // warning.
      } catch (Exception e) {
        throw new RuntimeException("Error occurred while constructing HeadFinder: " + e);
      }
    }

    TRegexTreeVisitor.tp =
        new TreePrint(treePrintFormats.toString(), new PennTreebankLanguagePack());

    try {
      // TreePattern p = TreePattern.compile("/^S/ > S=dt $++ '' $-- ``");
      TregexPatternCompiler tpc = new TregexPatternCompiler(hf);
      Macros.addAllMacros(tpc, macroFilename, encoding);
      TregexPattern p = tpc.compile(matchString);
      errPW.println("Pattern string:\n" + p.pattern());
      errPW.println("Parsed representation:");
      p.prettyPrint(errPW);

      String[] handles = argsMap.get(printHandleOption);
      if (argsMap.containsKey("-filter")) {
        TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName);
        treebank =
            new MemoryTreebank(
                trf, encoding); // has to be in memory since we're not storing it on disk
        // read from stdin
        Reader reader = new BufferedReader(new InputStreamReader(System.in, encoding));
        ((MemoryTreebank) treebank).load(reader);
        reader.close();
      } else if (args.length == 1) {
        errPW.println("using default tree");
        TreeReader r =
            new PennTreeReader(
                new StringReader(
                    "(VP (VP (VBZ Try) (NP (NP (DT this) (NN wine)) (CC and) (NP (DT these) (NNS snails)))) (PUNCT .))"),
                new LabeledScoredTreeFactory(new StringLabelFactory()));
        Tree t = r.readTree();
        treebank = new MemoryTreebank();
        treebank.add(t);
      } else {
        int last = args.length - 1;
        errPW.println("Reading trees from file(s) " + args[last]);
        TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName);
        treebank = new DiskTreebank(trf, encoding);
        treebank.loadPath(args[last], null, true);
      }
      TRegexTreeVisitor vis = new TRegexTreeVisitor(p, handles, encoding);

      treebank.apply(vis);
      Timing.endTime();
      if (TRegexTreeVisitor.printMatches) {
        errPW.println("There were " + vis.numMatches() + " matches in total.");
      }
      if (TRegexTreeVisitor.printNumMatchesToStdOut) {
        System.out.println(vis.numMatches());
      }
    } catch (IOException e) {
      e.printStackTrace();
    } catch (TregexParseException e) {
      errPW.println("Error parsing expression: " + args[0]);
      errPW.println("Parse exception: " + e.toString());
    }
  }