Пример #1
0
 /**
  * Loads treebank grammar from first argument and prints it. Just a demonstration of
  * functionality. <br>
  * <code>usage: java MemoryTreebank treebankFilesPath</code>
  *
  * @param args array of command-line arguments
  */
 public static void main(String[] args) {
   Timing.startTime();
   Treebank treebank =
       new MemoryTreebank(
           new TreeReaderFactory() {
             public TreeReader newTreeReader(Reader in) {
               return new PennTreeReader(in);
             }
           });
   treebank.loadPath(args[0]);
   Timing.endTime();
   System.out.println(treebank);
 }
  /** For testing: loads a treebank and prints the trees. */
  public static void main(String[] args) {
    TreebankLangParserParams tlpp = new ChineseTreebankParserParams();
    System.out.println("Default encoding is: " + tlpp.diskTreebank().encoding());

    if (args.length < 2) {
      printlnErr(
          "Usage: edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams treesPath fileRange");
    } else {
      Treebank m = tlpp.diskTreebank();
      m.loadPath(args[0], new NumberRangesFileFilter(args[1], false));

      for (Tree t : m) {
        t.pennPrint(tlpp.pw());
      }
      System.out.println("There were " + m.size() + " trees.");
    }
  }
Пример #3
0
  /**
   * Loads treebank grammar from first argument and prints it. Just a demonstration of
   * functionality. <br>
   * <code>usage: java MemoryTreebank treebankFilesPath</code>
   *
   * @param args array of command-line arguments
   */
  public static void main(String[] args) {
    Timing.startTime();
    Treebank treebank =
        new DiskTreebank(
            new TreeReaderFactory() {
              public TreeReader newTreeReader(Reader in) {
                return new PennTreeReader(in);
              }
            });
    Treebank treebank2 =
        new MemoryTreebank(
            new TreeReaderFactory() {
              public TreeReader newTreeReader(Reader in) {
                return new PennTreeReader(in);
              }
            });
    treebank.loadPath(args[0]);
    treebank2.loadPath(args[0]);
    CompositeTreebank c = new CompositeTreebank(treebank, treebank2);
    Timing.endTime();
    TreeTransformer myTransformer = new MyTreeTransformer();
    TreeTransformer myTransformer2 = new MyTreeTransformer2();
    TreeTransformer myTransformer3 = new MyTreeTransformer3();
    Treebank tf1 = c.transform(myTransformer).transform(myTransformer2).transform(myTransformer3);
    Treebank tf2 =
        new TransformingTreebank(
            new TransformingTreebank(new TransformingTreebank(c, myTransformer), myTransformer2),
            myTransformer3);
    TreeTransformer[] tta = {myTransformer, myTransformer2, myTransformer3};
    TreeTransformer tt3 = new CompositeTreeTransformer(Arrays.asList(tta));
    Treebank tf3 = c.transform(tt3);

    System.out.println("-------------------------");
    System.out.println("COMPOSITE (DISK THEN MEMORY REPEATED VERSION OF) INPUT TREEBANK");
    System.out.println(c);
    System.out.println("-------------------------");
    System.out.println("SLOWLY TRANSFORMED TREEBANK, USING TransformingTreebank() CONSTRUCTOR");
    Treebank tx1 = new TransformingTreebank(c, myTransformer);
    System.out.println(tx1);
    System.out.println("-----");
    Treebank tx2 = new TransformingTreebank(tx1, myTransformer2);
    System.out.println(tx2);
    System.out.println("-----");
    Treebank tx3 = new TransformingTreebank(tx2, myTransformer3);
    System.out.println(tx3);
    System.out.println("-------------------------");
    System.out.println("TRANSFORMED TREEBANK, USING Treebank.transform()");
    System.out.println(tf1);
    System.out.println("-------------------------");
    System.out.println("PRINTING AGAIN TRANSFORMED TREEBANK, USING Treebank.transform()");
    System.out.println(tf1);
    System.out.println("-------------------------");
    System.out.println("TRANSFORMED TREEBANK, USING TransformingTreebank() CONSTRUCTOR");
    System.out.println(tf2);
    System.out.println("-------------------------");
    System.out.println("TRANSFORMED TREEBANK, USING CompositeTreeTransformer");
    System.out.println(tf3);
    System.out.println("-------------------------");
    System.out.println("COMPOSITE (DISK THEN MEMORY REPEATED VERSION OF) INPUT TREEBANK");
    System.out.println(c);
    System.out.println("-------------------------");
  } // end main
Пример #4
0
  /**
   * Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile file-with-trees [-po
   * matching-pattern operation] operation-file-1 operation-file-2 ... operation-file-n
   *
   * <h4>Arguments:</h4>
   *
   * Each argument should be the name of a transformation file that contains a list of pattern and
   * transformation operation list pairs. That is, it is a sequence of pairs of a {@link
   * TregexPattern} pattern on one or more lines, then a blank line (empty or whitespace), then a
   * list of transformation operations one per line (as specified by <b>Legal operation syntax</b>
   * below) to apply when the pattern is matched, and then another blank line (empty or whitespace).
   * Note the need for blank lines: The code crashes if they are not present as separators (although
   * the blank line at the end of the file can be omitted). The script file can include comment
   * lines, either whole comment lines or trailing comments introduced by %, which extend to the end
   * of line. A needed percent mark can be escaped by a preceding backslash.
   *
   * <p>For example, if you want to excise an SBARQ node whenever it is the parent of an SQ node,
   * and relabel the SQ node to S, your transformation file would look like this:
   *
   * <blockquote>
   *
   * <code>
   *    SBARQ=n1 &lt; SQ=n2<br>
   *    <br>
   *    excise n1 n1<br>
   *    relabel n2 S
   * </code>
   *
   * </blockquote>
   *
   * <p>
   *
   * <h4>Options:</h4>
   *
   * <ul>
   *   <li><code>-treeFile &#60;filename&#62;</code> specify the name of the file that has the trees
   *       you want to transform.
   *   <li><code>-po &#60;matchPattern&#62; &#60;operation&#62;</code> Apply a single operation to
   *       every tree using the specified match pattern and the specified operation. Use this option
   *       when you want to quickly try the effect of one pattern/surgery combination, and are too
   *       lazy to write a transformation file.
   *   <li><code>-s</code> Print each output tree on one line (default is pretty-printing).
   *   <li><code>-m</code> For every tree that had a matching pattern, print "before" (prepended as
   *       "Operated on:") and "after" (prepended as "Result:"). Unoperated trees just pass through
   *       the transducer as usual.
   *   <li><code>-encoding X</code> Uses character set X for input and output of trees.
   *   <li><code>-macros &#60;filename&#62;</code> A file of macros to use on the tregex pattern.
   *       Macros should be one per line, with original and replacement separated by tabs.
   *   <li><code>-hf &lt;headfinder-class-name&gt;</code> use the specified {@link HeadFinder} class
   *       to determine headship relations.
   *   <li><code>-hfArg &lt;string&gt;</code> pass a string argument in to the {@link HeadFinder}
   *       class's constructor. <code>-hfArg</code> can be used multiple times to pass in multiple
   *       arguments.
   *   <li><code>-trf &lt;TreeReaderFactory-class-name&gt;</code> use the specified {@link
   *       TreeReaderFactory} class to read trees from files.
   * </ul>
   *
   * <h4>Legal operation syntax:</h4>
   *
   * <ul>
   *   <li><code>delete &#60;name&#62;</code> deletes the node and everything below it.
   *   <li><code>prune &#60;name&#62;</code> Like delete, but if, after the pruning, the parent has
   *       no children anymore, the parent is pruned too. Pruning continues to affect all ancestors
   *       until one is found with remaining children. This may result in a null tree.
   *   <li><code>excise &#60;name1&#62; &#60;name2&#62;</code> The name1 node should either dominate
   *       or be the same as the name2 node. This excises out everything from name1 to name2. All
   *       the children of name2 go into the parent of name1, where name1 was.
   *   <li><code>relabel &#60;name&#62; &#60;new-label&#62;</code> Relabels the node to have the new
   *       label. <br>
   *       There are three possible forms: <br>
   *       <code>relabel nodeX VP</code> - for changing a node label to an alphanumeric string <br>
   *       <code>relabel nodeX /''/</code> - for relabeling a node to something that isn't a valid
   *       identifier without quoting <br>
   *       <code>relabel nodeX /^VB(.*)$/verb\\/$1/</code> - for regular expression based
   *       relabeling. In this case, all matches of the regular expression against the node label
   *       are replaced with the replacement String. This has the semantics of Java/Perl's
   *       replaceAll: you may use capturing groups and put them in replacements with $n. For
   *       example, if the pattern is /foo/bar/ and the node matched is "foo", the replaceAll
   *       semantics result in "barbar". If the pattern is /^foo(.*)$/bar$1/ and node matched is
   *       "foofoo", relabel will result in "barfoo". <br>
   *       When using the regex replacement method, you can also use the sequences ={node} and
   *       %{var} in the replacement string to use captured nodes or variable strings in the
   *       replacement string. For example, if the Tregex pattern was "duck=bar" and the relabel is
   *       /foo/={bar}/, "foofoo" will be replaced with "duckduck". <br>
   *       To concatenate two nodes named in the tregex pattern, for example, you can use the
   *       pattern /^.*$/={foo}={bar}/. Note that the ^.*$ is necessary to make sure the regex
   *       pattern only matches and replaces once on the entire node name. <br>
   *       To get an "=" or a "%" in the replacement, using \ escaping. Also, as in the example you
   *       can escape a slash in the middle of the second and third forms with \\/ and \\\\. <br>
   *   <li><code>insert &#60;name&#62; &#60;position&#62;</code> or <code>
   *       insert &lt;tree&gt; &#60;position&#62;</code> inserts the named node or tree into the
   *       position specified.
   *   <li><code>move &#60;name&#62; &#60;position&#62;</code> moves the named node into the
   *       specified position.
   *       <p>Right now the only ways to specify position are:
   *       <p><code>$+ &#60;name&#62;</code> the left sister of the named node<br>
   *       <code>$- &#60;name&#62;</code> the right sister of the named node<br>
   *       <code>&gt;i &#60;name&#62;</code> the i_th daughter of the named node<br>
   *       <code>&gt;-i &#60;name&#62;</code> the i_th daughter, counting from the right, of the
   *       named node.
   *   <li><code>replace &#60;name1&#62; &#60;name2&#62;</code> deletes name1 and inserts a copy of
   *       name2 in its place.
   *   <li><code>replace &#60;name&#62; &#60;tree&#62; &#60;tree2&#62;...</code> deletes name and
   *       inserts the new tree(s) in its place. If more than one replacement tree is given, each of
   *       the new subtrees will be added in order where the old tree was. Multiple subtrees at the
   *       root is an illegal operation and will throw an exception.
   *   <li>{@code createSubtree <new-label> <name1> [<name2>]} Create a subtree out of all the nodes
   *       from {@code <name1>} through {@code <name2>} and puts the new subtree where that span
   *       used to be. To limit the operation to just one node, elide {@code <name2>}.
   *   <li><code>adjoin &#60;auxiliary_tree&#62; &lt;name&gt;</code> Adjoins the specified auxiliary
   *       tree into the named node. The daughters of the target node will become the daughters of
   *       the foot of the auxiliary tree.
   *   <li><code>adjoinH &#60;auxiliary_tree&#62; &lt;name&gt;</code> Similar to adjoin, but
   *       preserves the target node and makes it the root of &lt;tree&gt;. (It is still accessible
   *       as <code>name</code>. The root of the auxiliary tree is ignored.)
   *   <li><code>adjoinF &#60;auxiliary_tree&#62; &lt;name&gt;</code> Similar to adjoin, but
   *       preserves the target node and makes it the foot of &lt;tree&gt;. (It is still accessible
   *       as <code>name</code>, and retains its status as parent of its children. The root of the
   *       auxiliary tree is ignored.)
   *   <li>
   *   <dt><code>coindex &#60;name1&#62; &#60;name2&#62; ... &#60;nameM&#62; </code> Puts a (Penn
   *       Treebank style) coindexation suffix of the form "-N" on each of nodes name_1 through
   *       name_m. The value of N will be automatically generated in reference to the existing
   *       coindexations in the tree, so that there is never an accidental clash of indices across
   *       things that are not meant to be coindexed.
   * </ul>
   *
   * <p>In the context of <code>adjoin</code>, <code>adjoinH</code>, and <code>adjoinF</code>, an
   * auxiliary tree is a tree in Penn Treebank format with <code>@</code> on exactly one of the
   * leaves denoting the foot of the tree. The operations which use the foot use the labeled node.
   * For example: <br>
   * Tsurgeon: <code>adjoin (FOO (BAR@)) foo</code> <br>
   * Tregex: <code>B=foo</code> <br>
   * Input: <code>(A (B 1 2))</code> Output: <code>(A (FOO (BAR 1 2)))</code>
   *
   * <p>Tsurgeon applies the same operation to the same tree for as long as the given tregex
   * operation matches. This means that infinite loops are very easy to cause. One common situation
   * where this comes up is with an insert operation will repeats infinitely many times unless you
   * add an expression to the tregex that matches against the inserted pattern. For example, this
   * pattern will infinite loop:
   *
   * <blockquote>
   *
   * <code>
   *   TregexPattern tregex = TregexPattern.compile("S=node &lt;&lt; NP"); <br>
   *   TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) &gt;-1 node");
   * </code>
   *
   * </blockquote>
   *
   * This pattern, though, will terminate:
   *
   * <blockquote>
   *
   * <code>
   *   TregexPattern tregex = TregexPattern.compile("S=node &lt;&lt; NP !&lt;&lt; foo"); <br>
   *   TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) &gt;-1 node");
   * </code>
   *
   * </blockquote>
   *
   * <p>Tsurgeon has (very) limited support for conditional statements. If a pattern is prefaced
   * with <code>if exists &lt;name&gt;</code>, the rest of the pattern will only execute if the
   * named node was found in the corresponding TregexMatcher.
   *
   * @param args a list of names of files each of which contains a single tregex matching pattern
   *     plus a list, one per line, of transformation operations to apply to the matched pattern.
   * @throws Exception If an I/O or pattern syntax error
   */
  public static void main(String[] args) throws Exception {
    String headFinderClassName = null;
    String headFinderOption = "-hf";
    String[] headFinderArgs = null;
    String headFinderArgOption = "-hfArg";
    String encoding = "UTF-8";
    String encodingOption = "-encoding";
    if (args.length == 0) {
      System.err.println(
          "Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile <file-with-trees> [-po <matching-pattern> <operation>] <operation-file-1> <operation-file-2> ... <operation-file-n>");
      System.exit(0);
    }
    String treePrintFormats;
    String singleLineOption = "-s";
    String verboseOption = "-v";
    String matchedOption =
        "-m"; // if set, then print original form of trees that are matched & thus operated on
    String patternOperationOption = "-po";
    String treeFileOption = "-treeFile";
    String trfOption = "-trf";
    String macroOption = "-macros";
    String macroFilename = "";
    Map<String, Integer> flagMap = Generics.newHashMap();
    flagMap.put(patternOperationOption, 2);
    flagMap.put(treeFileOption, 1);
    flagMap.put(trfOption, 1);
    flagMap.put(singleLineOption, 0);
    flagMap.put(encodingOption, 1);
    flagMap.put(headFinderOption, 1);
    flagMap.put(macroOption, 1);
    Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap);
    args = argsMap.get(null);

    if (argsMap.containsKey(headFinderOption))
      headFinderClassName = argsMap.get(headFinderOption)[0];
    if (argsMap.containsKey(headFinderArgOption)) headFinderArgs = argsMap.get(headFinderArgOption);
    if (argsMap.containsKey(verboseOption)) verbose = true;
    if (argsMap.containsKey(singleLineOption)) treePrintFormats = "oneline,";
    else treePrintFormats = "penn,";
    if (argsMap.containsKey(encodingOption)) encoding = argsMap.get(encodingOption)[0];
    if (argsMap.containsKey(macroOption)) macroFilename = argsMap.get(macroOption)[0];

    TreePrint tp = new TreePrint(treePrintFormats, new PennTreebankLanguagePack());
    PrintWriter pwOut = new PrintWriter(new OutputStreamWriter(System.out, encoding), true);

    TreeReaderFactory trf;
    if (argsMap.containsKey(trfOption)) {
      String trfClass = argsMap.get(trfOption)[0];
      trf = ReflectionLoading.loadByReflection(trfClass);
    } else {
      trf = new TregexPattern.TRegexTreeReaderFactory();
    }

    Treebank trees = new DiskTreebank(trf, encoding);
    if (argsMap.containsKey(treeFileOption)) {
      trees.loadPath(argsMap.get(treeFileOption)[0]);
    }
    List<Pair<TregexPattern, TsurgeonPattern>> ops =
        new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();

    TregexPatternCompiler compiler;
    if (headFinderClassName == null) {
      compiler = new TregexPatternCompiler();
    } else {
      HeadFinder hf;
      if (headFinderArgs == null) {
        hf = ReflectionLoading.loadByReflection(headFinderClassName);
      } else {
        hf = ReflectionLoading.loadByReflection(headFinderClassName, (Object[]) headFinderArgs);
      }
      compiler = new TregexPatternCompiler(hf);
    }
    Macros.addAllMacros(compiler, macroFilename, encoding);
    if (argsMap.containsKey(patternOperationOption)) {
      TregexPattern matchPattern = compiler.compile(argsMap.get(patternOperationOption)[0]);
      TsurgeonPattern p = parseOperation(argsMap.get(patternOperationOption)[1]);
      ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p));
    } else {
      for (String arg : args) {
        List<Pair<TregexPattern, TsurgeonPattern>> pairs =
            getOperationsFromFile(arg, encoding, compiler);
        for (Pair<TregexPattern, TsurgeonPattern> pair : pairs) {
          if (verbose) {
            System.err.println(pair.second());
          }
          ops.add(pair);
        }
      }
    }

    for (Tree t : trees) {
      Tree original = t.deepCopy();
      Tree result = processPatternsOnTree(ops, t);
      if (argsMap.containsKey(matchedOption) && matchedOnTree) {
        pwOut.println("Operated on: ");
        displayTree(original, tp, pwOut);
        pwOut.println("Result: ");
      }
      displayTree(result, tp, pwOut);
    }
  }
Пример #5
0
  public static void main(String[] args) {
    Options op = new Options(new EnglishTreebankParserParams());
    // op.tlpParams may be changed to something else later, so don't use it till
    // after options are parsed.

    System.out.println("Currently " + new Date());
    System.out.print("Invoked with arguments:");
    for (String arg : args) {
      System.out.print(" " + arg);
    }
    System.out.println();

    String path = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj";
    int trainLow = 200, trainHigh = 2199, testLow = 2200, testHigh = 2219;
    String serializeFile = null;

    int i = 0;
    while (i < args.length && args[i].startsWith("-")) {
      if (args[i].equalsIgnoreCase("-path") && (i + 1 < args.length)) {
        path = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-train") && (i + 2 < args.length)) {
        trainLow = Integer.parseInt(args[i + 1]);
        trainHigh = Integer.parseInt(args[i + 2]);
        i += 3;
      } else if (args[i].equalsIgnoreCase("-test") && (i + 2 < args.length)) {
        testLow = Integer.parseInt(args[i + 1]);
        testHigh = Integer.parseInt(args[i + 2]);
        i += 3;
      } else if (args[i].equalsIgnoreCase("-serialize") && (i + 1 < args.length)) {
        serializeFile = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-tLPP") && (i + 1 < args.length)) {
        try {
          op.tlpParams = (TreebankLangParserParams) Class.forName(args[i + 1]).newInstance();
        } catch (ClassNotFoundException e) {
          System.err.println("Class not found: " + args[i + 1]);
        } catch (InstantiationException e) {
          System.err.println("Couldn't instantiate: " + args[i + 1] + ": " + e.toString());
        } catch (IllegalAccessException e) {
          System.err.println("illegal access" + e);
        }
        i += 2;
      } else if (args[i].equals("-encoding")) {
        // sets encoding for TreebankLangParserParams
        op.tlpParams.setInputEncoding(args[i + 1]);
        op.tlpParams.setOutputEncoding(args[i + 1]);
        i += 2;
      } else {
        i = op.setOptionOrWarn(args, i);
      }
    }
    // System.out.println(tlpParams.getClass());
    TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();

    Train.sisterSplitters = new HashSet(Arrays.asList(op.tlpParams.sisterSplitters()));
    //    BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams);
    PrintWriter pw = op.tlpParams.pw();

    Test.display();
    Train.display();
    op.display();
    op.tlpParams.display();

    // setup tree transforms
    Treebank trainTreebank = op.tlpParams.memoryTreebank();
    MemoryTreebank testTreebank = op.tlpParams.testMemoryTreebank();
    // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank();
    // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/";
    // blippTreebank.loadPath(blippPath, "", true);

    Timing.startTime();
    System.err.print("Reading trees...");
    testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
    if (Test.increasingLength) {
      Collections.sort(testTreebank, new TreeLengthComparator());
    }

    trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true));
    Timing.tick("done.");
    System.err.print("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer = null;
    if (!Train.leftToRight) {
      binarizer =
          new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !Train.outsideFactor(), true);
    } else {
      binarizer =
          new TreeAnnotatorAndBinarizer(
              op.tlpParams.headFinder(),
              new LeftHeadFinder(),
              op.tlpParams,
              op.forceCNF,
              !Train.outsideFactor(),
              true);
    }
    CollinsPuncTransformer collinsPuncTransformer = null;
    if (Train.collinsPunc) {
      collinsPuncTransformer = new CollinsPuncTransformer(tlp);
    }
    TreeTransformer debinarizer = new Debinarizer(op.forceCNF);
    List<Tree> binaryTrainTrees = new ArrayList<Tree>();

    if (Train.selectiveSplit) {
      Train.splitters =
          ParentAnnotationStats.getSplitCategories(
              trainTreebank,
              Train.tagSelectiveSplit,
              0,
              Train.selectiveSplitCutOff,
              Train.tagSelectiveSplitCutOff,
              op.tlpParams.treebankLanguagePack());
      if (Train.deleteSplitters != null) {
        List<String> deleted = new ArrayList<String>();
        for (String del : Train.deleteSplitters) {
          String baseDel = tlp.basicCategory(del);
          boolean checkBasic = del.equals(baseDel);
          for (Iterator<String> it = Train.splitters.iterator(); it.hasNext(); ) {
            String elem = it.next();
            String baseElem = tlp.basicCategory(elem);
            boolean delStr = checkBasic && baseElem.equals(baseDel) || elem.equals(del);
            if (delStr) {
              it.remove();
              deleted.add(elem);
            }
          }
        }
        System.err.println("Removed from vertical splitters: " + deleted);
      }
    }
    if (Train.selectivePostSplit) {
      TreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams);
      Treebank annotatedTB = trainTreebank.transform(myTransformer);
      Train.postSplitters =
          ParentAnnotationStats.getSplitCategories(
              annotatedTB,
              true,
              0,
              Train.selectivePostSplitCutOff,
              Train.tagSelectivePostSplitCutOff,
              op.tlpParams.treebankLanguagePack());
    }

    if (Train.hSelSplit) {
      binarizer.setDoSelectiveSplit(false);
      for (Tree tree : trainTreebank) {
        if (Train.collinsPunc) {
          tree = collinsPuncTransformer.transformTree(tree);
        }
        // tree.pennPrint(tlpParams.pw());
        tree = binarizer.transformTree(tree);
        // binaryTrainTrees.add(tree);
      }
      binarizer.setDoSelectiveSplit(true);
    }
    for (Tree tree : trainTreebank) {
      if (Train.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTrainTrees.add(tree);
    }
    if (Test.verbose) {
      binarizer.dumpStats();
    }

    List<Tree> binaryTestTrees = new ArrayList<Tree>();
    for (Tree tree : testTreebank) {
      if (Train.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTestTrees.add(tree);
    }
    Timing.tick("done."); // binarization
    BinaryGrammar bg = null;
    UnaryGrammar ug = null;
    DependencyGrammar dg = null;
    // DependencyGrammar dgBLIPP = null;
    Lexicon lex = null;
    // extract grammars
    Extractor bgExtractor = new BinaryGrammarExtractor();
    // Extractor bgExtractor = new SmoothedBinaryGrammarExtractor();//new BinaryGrammarExtractor();
    // Extractor lexExtractor = new LexiconExtractor();

    // Extractor dgExtractor = new DependencyMemGrammarExtractor();

    Extractor dgExtractor = new MLEDependencyGrammarExtractor(op);
    if (op.doPCFG) {
      System.err.print("Extracting PCFG...");
      Pair bgug = null;
      if (Train.cheatPCFG) {
        List allTrees = new ArrayList(binaryTrainTrees);
        allTrees.addAll(binaryTestTrees);
        bgug = (Pair) bgExtractor.extract(allTrees);
      } else {
        bgug = (Pair) bgExtractor.extract(binaryTrainTrees);
      }
      bg = (BinaryGrammar) bgug.second;
      bg.splitRules();
      ug = (UnaryGrammar) bgug.first;
      ug.purgeRules();
      Timing.tick("done.");
    }
    System.err.print("Extracting Lexicon...");
    lex = op.tlpParams.lex(op.lexOptions);
    lex.train(binaryTrainTrees);
    Timing.tick("done.");

    if (op.doDep) {
      System.err.print("Extracting Dependencies...");
      binaryTrainTrees.clear();
      // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new
      // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new
      // TransformTreeDependency(tlpParams,true));

      DependencyGrammar dg1 =
          (DependencyGrammar)
              dgExtractor.extract(
                  trainTreebank.iterator(), new TransformTreeDependency(op.tlpParams, true));
      // dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new
      // TransformTreeDependency(tlpParams));

      // dg = (DependencyGrammar) dgExtractor.extract(new
      // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new
      // TransformTreeDependency(tlpParams));
      // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2);
      // dg = (DependencyGrammar) dgExtractor.extract(binaryTrainTrees); //uses information whether
      // the words are known or not, discards unknown words
      Timing.tick("done.");
      // System.out.print("Extracting Unknown Word Model...");
      // UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees);
      // Timing.tick("done.");
      System.out.print("Tuning Dependency Model...");
      dg.tune(binaryTestTrees);
      // System.out.println("TUNE DEPS: "+tuneDeps);
      Timing.tick("done.");
    }

    BinaryGrammar boundBG = bg;
    UnaryGrammar boundUG = ug;

    GrammarProjection gp = new NullGrammarProjection(bg, ug);

    // serialization
    if (serializeFile != null) {
      System.err.print("Serializing parser...");
      LexicalizedParser.saveParserDataToSerialized(
          new ParserData(lex, bg, ug, dg, Numberer.getNumberers(), op), serializeFile);
      Timing.tick("done.");
    }

    // test: pcfg-parse and output

    ExhaustivePCFGParser parser = null;
    if (op.doPCFG) {
      parser = new ExhaustivePCFGParser(boundBG, boundUG, lex, op);
    }

    ExhaustiveDependencyParser dparser =
        ((op.doDep && !Test.useFastFactored) ? new ExhaustiveDependencyParser(dg, lex, op) : null);

    Scorer scorer = (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser, gp), dparser) : null);
    // Scorer scorer = parser;
    BiLexPCFGParser bparser = null;
    if (op.doPCFG && op.doDep) {
      bparser =
          (Test.useN5)
              ? new BiLexPCFGParser.N5BiLexPCFGParser(
                  scorer, parser, dparser, bg, ug, dg, lex, op, gp)
              : new BiLexPCFGParser(scorer, parser, dparser, bg, ug, dg, lex, op, gp);
    }

    LabeledConstituentEval pcfgPE = new LabeledConstituentEval("pcfg  PE", true, tlp);
    LabeledConstituentEval comboPE = new LabeledConstituentEval("combo PE", true, tlp);
    AbstractEval pcfgCB = new LabeledConstituentEval.CBEval("pcfg  CB", true, tlp);

    AbstractEval pcfgTE = new AbstractEval.TaggingEval("pcfg  TE");
    AbstractEval comboTE = new AbstractEval.TaggingEval("combo TE");
    AbstractEval pcfgTEnoPunct = new AbstractEval.TaggingEval("pcfg nopunct TE");
    AbstractEval comboTEnoPunct = new AbstractEval.TaggingEval("combo nopunct TE");
    AbstractEval depTE = new AbstractEval.TaggingEval("depnd TE");

    AbstractEval depDE =
        new AbstractEval.DependencyEval("depnd DE", true, tlp.punctuationWordAcceptFilter());
    AbstractEval comboDE =
        new AbstractEval.DependencyEval("combo DE", true, tlp.punctuationWordAcceptFilter());

    if (Test.evalb) {
      EvalB.initEVALBfiles(op.tlpParams);
    }

    // int[] countByLength = new int[Test.maxLength+1];

    // use a reflection ruse, so one can run this without needing the tagger
    // edu.stanford.nlp.process.SentenceTagger tagger = (Test.preTag ? new
    // edu.stanford.nlp.process.SentenceTagger("/u/nlp/data/tagger.params/wsj0-21.holder") : null);
    SentenceProcessor tagger = null;
    if (Test.preTag) {
      try {
        Class[] argsClass = new Class[] {String.class};
        Object[] arguments =
            new Object[] {"/u/nlp/data/pos-tagger/wsj3t0-18-bidirectional/train-wsj-0-18.holder"};
        tagger =
            (SentenceProcessor)
                Class.forName("edu.stanford.nlp.tagger.maxent.MaxentTagger")
                    .getConstructor(argsClass)
                    .newInstance(arguments);
      } catch (Exception e) {
        System.err.println(e);
        System.err.println("Warning: No pretagging of sentences will be done.");
      }
    }

    for (int tNum = 0, ttSize = testTreebank.size(); tNum < ttSize; tNum++) {
      Tree tree = testTreebank.get(tNum);
      int testTreeLen = tree.yield().size();
      if (testTreeLen > Test.maxLength) {
        continue;
      }
      Tree binaryTree = binaryTestTrees.get(tNum);
      // countByLength[testTreeLen]++;
      System.out.println("-------------------------------------");
      System.out.println("Number: " + (tNum + 1));
      System.out.println("Length: " + testTreeLen);

      // tree.pennPrint(pw);
      // System.out.println("XXXX The binary tree is");
      // binaryTree.pennPrint(pw);
      // System.out.println("Here are the tags in the lexicon:");
      // System.out.println(lex.showTags());
      // System.out.println("Here's the tagnumberer:");
      // System.out.println(Numberer.getGlobalNumberer("tags").toString());

      long timeMil1 = System.currentTimeMillis();
      Timing.tick("Starting parse.");
      if (op.doPCFG) {
        // System.err.println(Test.forceTags);
        if (Test.forceTags) {
          if (tagger != null) {
            // System.out.println("Using a tagger to set tags");
            // System.out.println("Tagged sentence as: " +
            // tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false));
            parser.parse(addLast(tagger.processSentence(cutLast(wordify(binaryTree.yield())))));
          } else {
            // System.out.println("Forcing tags to match input.");
            parser.parse(cleanTags(binaryTree.taggedYield(), tlp));
          }
        } else {
          // System.out.println("XXXX Parsing " + binaryTree.yield());
          parser.parse(binaryTree.yield());
        }
        // Timing.tick("Done with pcfg phase.");
      }
      if (op.doDep) {
        dparser.parse(binaryTree.yield());
        // Timing.tick("Done with dependency phase.");
      }
      boolean bothPassed = false;
      if (op.doPCFG && op.doDep) {
        bothPassed = bparser.parse(binaryTree.yield());
        // Timing.tick("Done with combination phase.");
      }
      long timeMil2 = System.currentTimeMillis();
      long elapsed = timeMil2 - timeMil1;
      System.err.println("Time: " + ((int) (elapsed / 100)) / 10.00 + " sec.");
      // System.out.println("PCFG Best Parse:");
      Tree tree2b = null;
      Tree tree2 = null;
      // System.out.println("Got full best parse...");
      if (op.doPCFG) {
        tree2b = parser.getBestParse();
        tree2 = debinarizer.transformTree(tree2b);
      }
      // System.out.println("Debinarized parse...");
      // tree2.pennPrint();
      // System.out.println("DepG Best Parse:");
      Tree tree3 = null;
      Tree tree3db = null;
      if (op.doDep) {
        tree3 = dparser.getBestParse();
        // was: but wrong Tree tree3db = debinarizer.transformTree(tree2);
        tree3db = debinarizer.transformTree(tree3);
        tree3.pennPrint(pw);
      }
      // tree.pennPrint();
      // ((Tree)binaryTrainTrees.get(tNum)).pennPrint();
      // System.out.println("Combo Best Parse:");
      Tree tree4 = null;
      if (op.doPCFG && op.doDep) {
        try {
          tree4 = bparser.getBestParse();
          if (tree4 == null) {
            tree4 = tree2b;
          }
        } catch (NullPointerException e) {
          System.err.println("Blocked, using PCFG parse!");
          tree4 = tree2b;
        }
      }
      if (op.doPCFG && !bothPassed) {
        tree4 = tree2b;
      }
      // tree4.pennPrint();
      if (op.doDep) {
        depDE.evaluate(tree3, binaryTree, pw);
        depTE.evaluate(tree3db, tree, pw);
      }
      TreeTransformer tc = op.tlpParams.collinizer();
      TreeTransformer tcEvalb = op.tlpParams.collinizerEvalb();
      Tree tree4b = null;
      if (op.doPCFG) {
        // System.out.println("XXXX Best PCFG was: ");
        // tree2.pennPrint();
        // System.out.println("XXXX Transformed best PCFG is: ");
        // tc.transformTree(tree2).pennPrint();
        // System.out.println("True Best Parse:");
        // tree.pennPrint();
        // tc.transformTree(tree).pennPrint();
        pcfgPE.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
        pcfgCB.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
        if (op.doDep) {
          comboDE.evaluate((bothPassed ? tree4 : tree3), binaryTree, pw);
          tree4b = tree4;
          tree4 = debinarizer.transformTree(tree4);
          if (op.nodePrune) {
            NodePruner np = new NodePruner(parser, debinarizer);
            tree4 = np.prune(tree4);
          }
          // tree4.pennPrint();
          comboPE.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
        }
        // pcfgTE.evaluate(tree2, tree);
        pcfgTE.evaluate(tcEvalb.transformTree(tree2), tcEvalb.transformTree(tree), pw);
        pcfgTEnoPunct.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);

        if (op.doDep) {
          comboTE.evaluate(tcEvalb.transformTree(tree4), tcEvalb.transformTree(tree), pw);
          comboTEnoPunct.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
        }
        System.out.println("PCFG only: " + parser.scoreBinarizedTree(tree2b, 0));

        // tc.transformTree(tree2).pennPrint();
        tree2.pennPrint(pw);

        if (op.doDep) {
          System.out.println("Combo: " + parser.scoreBinarizedTree(tree4b, 0));
          // tc.transformTree(tree4).pennPrint(pw);
          tree4.pennPrint(pw);
        }
        System.out.println("Correct:" + parser.scoreBinarizedTree(binaryTree, 0));
        /*
        if (parser.scoreBinarizedTree(tree2b,true) < parser.scoreBinarizedTree(binaryTree,true)) {
          System.out.println("SCORE INVERSION");
          parser.validateBinarizedTree(binaryTree,0);
        }
        */
        tree.pennPrint(pw);
      } // end if doPCFG

      if (Test.evalb) {
        if (op.doPCFG && op.doDep) {
          EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree4));
        } else if (op.doPCFG) {
          EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree2));
        } else if (op.doDep) {
          EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree3db));
        }
      }
    } // end for each tree in test treebank

    if (Test.evalb) {
      EvalB.closeEVALBfiles();
    }

    // Test.display();
    if (op.doPCFG) {
      pcfgPE.display(false, pw);
      System.out.println("Grammar size: " + Numberer.getGlobalNumberer("states").total());
      pcfgCB.display(false, pw);
      if (op.doDep) {
        comboPE.display(false, pw);
      }
      pcfgTE.display(false, pw);
      pcfgTEnoPunct.display(false, pw);
      if (op.doDep) {
        comboTE.display(false, pw);
        comboTEnoPunct.display(false, pw);
      }
    }
    if (op.doDep) {
      depTE.display(false, pw);
      depDE.display(false, pw);
    }
    if (op.doPCFG && op.doDep) {
      comboDE.display(false, pw);
    }
    // pcfgPE.printGoodBad();
  }