Java MemoryTreebank Examples

Programming Language: Java

Namespace/Package Name: java.util

Class/Type: MemoryTreebank

Examples at hotexamples.com: 3

Java MemoryTreebank - 3 examples found. These are the top rated real world Java examples of java.util.MemoryTreebank extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

get(2)

loadPath(2)

load(1)

size(1)

Example #1

Show file

File: TregexPattern.java Project: hans/CoreNLP

 private static void extractSubtrees(List<String> codeStrings, String treeFile) {
   List<Pair<Integer, Integer>> codes = new ArrayList<Pair<Integer, Integer>>();
   for (String s : codeStrings) {
     Matcher m = codePattern.matcher(s);
     if (m.matches())
       codes.add(
           new Pair<Integer, Integer>(Integer.parseInt(m.group(1)), Integer.parseInt(m.group(2))));
     else throw new RuntimeException("Error: illegal node code " + s);
   }
   TreeReaderFactory trf = new TRegexTreeReaderFactory();
   MemoryTreebank treebank = new MemoryTreebank(trf);
   treebank.loadPath(treeFile, null, true);
   for (Pair<Integer, Integer> code : codes) {
     Tree t = treebank.get(code.first() - 1);
     t.getNodeNumber(code.second()).pennPrint();
   }
 }

Example #2

Show file

File: FactoredParser.java Project: renaud/maven_repo

  public static void main(String[] args) {
    Options op = new Options(new EnglishTreebankParserParams());
    // op.tlpParams may be changed to something else later, so don't use it till
    // after options are parsed.

    System.out.println("Currently " + new Date());
    System.out.print("Invoked with arguments:");
    for (String arg : args) {
      System.out.print(" " + arg);
    }
    System.out.println();

    String path = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj";
    int trainLow = 200, trainHigh = 2199, testLow = 2200, testHigh = 2219;
    String serializeFile = null;

    int i = 0;
    while (i < args.length && args[i].startsWith("-")) {
      if (args[i].equalsIgnoreCase("-path") && (i + 1 < args.length)) {
        path = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-train") && (i + 2 < args.length)) {
        trainLow = Integer.parseInt(args[i + 1]);
        trainHigh = Integer.parseInt(args[i + 2]);
        i += 3;
      } else if (args[i].equalsIgnoreCase("-test") && (i + 2 < args.length)) {
        testLow = Integer.parseInt(args[i + 1]);
        testHigh = Integer.parseInt(args[i + 2]);
        i += 3;
      } else if (args[i].equalsIgnoreCase("-serialize") && (i + 1 < args.length)) {
        serializeFile = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-tLPP") && (i + 1 < args.length)) {
        try {
          op.tlpParams = (TreebankLangParserParams) Class.forName(args[i + 1]).newInstance();
        } catch (ClassNotFoundException e) {
          System.err.println("Class not found: " + args[i + 1]);
        } catch (InstantiationException e) {
          System.err.println("Couldn't instantiate: " + args[i + 1] + ": " + e.toString());
        } catch (IllegalAccessException e) {
          System.err.println("illegal access" + e);
        }
        i += 2;
      } else if (args[i].equals("-encoding")) {
        // sets encoding for TreebankLangParserParams
        op.tlpParams.setInputEncoding(args[i + 1]);
        op.tlpParams.setOutputEncoding(args[i + 1]);
        i += 2;
      } else {
        i = op.setOptionOrWarn(args, i);
      }
    }
    // System.out.println(tlpParams.getClass());
    TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();

    Train.sisterSplitters = new HashSet(Arrays.asList(op.tlpParams.sisterSplitters()));
    //    BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams);
    PrintWriter pw = op.tlpParams.pw();

    Test.display();
    Train.display();
    op.display();
    op.tlpParams.display();

    // setup tree transforms
    Treebank trainTreebank = op.tlpParams.memoryTreebank();
    MemoryTreebank testTreebank = op.tlpParams.testMemoryTreebank();
    // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank();
    // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/";
    // blippTreebank.loadPath(blippPath, "", true);

    Timing.startTime();
    System.err.print("Reading trees...");
    testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
    if (Test.increasingLength) {
      Collections.sort(testTreebank, new TreeLengthComparator());
    }

    trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true));
    Timing.tick("done.");
    System.err.print("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer = null;
    if (!Train.leftToRight) {
      binarizer =
          new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !Train.outsideFactor(), true);
    } else {
      binarizer =
          new TreeAnnotatorAndBinarizer(
              op.tlpParams.headFinder(),
              new LeftHeadFinder(),
              op.tlpParams,
              op.forceCNF,
              !Train.outsideFactor(),
              true);
    }
    CollinsPuncTransformer collinsPuncTransformer = null;
    if (Train.collinsPunc) {
      collinsPuncTransformer = new CollinsPuncTransformer(tlp);
    }
    TreeTransformer debinarizer = new Debinarizer(op.forceCNF);
    List<Tree> binaryTrainTrees = new ArrayList<Tree>();

    if (Train.selectiveSplit) {
      Train.splitters =
          ParentAnnotationStats.getSplitCategories(
              trainTreebank,
              Train.tagSelectiveSplit,
              0,
              Train.selectiveSplitCutOff,
              Train.tagSelectiveSplitCutOff,
              op.tlpParams.treebankLanguagePack());
      if (Train.deleteSplitters != null) {
        List<String> deleted = new ArrayList<String>();
        for (String del : Train.deleteSplitters) {
          String baseDel = tlp.basicCategory(del);
          boolean checkBasic = del.equals(baseDel);
          for (Iterator<String> it = Train.splitters.iterator(); it.hasNext(); ) {
            String elem = it.next();
            String baseElem = tlp.basicCategory(elem);
            boolean delStr = checkBasic && baseElem.equals(baseDel) || elem.equals(del);
            if (delStr) {
              it.remove();
              deleted.add(elem);
            }
          }
        }
        System.err.println("Removed from vertical splitters: " + deleted);
      }
    }
    if (Train.selectivePostSplit) {
      TreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams);
      Treebank annotatedTB = trainTreebank.transform(myTransformer);
      Train.postSplitters =
          ParentAnnotationStats.getSplitCategories(
              annotatedTB,
              true,
              0,
              Train.selectivePostSplitCutOff,
              Train.tagSelectivePostSplitCutOff,
              op.tlpParams.treebankLanguagePack());
    }

    if (Train.hSelSplit) {
      binarizer.setDoSelectiveSplit(false);
      for (Tree tree : trainTreebank) {
        if (Train.collinsPunc) {
          tree = collinsPuncTransformer.transformTree(tree);
        }
        // tree.pennPrint(tlpParams.pw());
        tree = binarizer.transformTree(tree);
        // binaryTrainTrees.add(tree);
      }
      binarizer.setDoSelectiveSplit(true);
    }
    for (Tree tree : trainTreebank) {
      if (Train.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTrainTrees.add(tree);
    }
    if (Test.verbose) {
      binarizer.dumpStats();
    }

    List<Tree> binaryTestTrees = new ArrayList<Tree>();
    for (Tree tree : testTreebank) {
      if (Train.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTestTrees.add(tree);
    }
    Timing.tick("done."); // binarization
    BinaryGrammar bg = null;
    UnaryGrammar ug = null;
    DependencyGrammar dg = null;
    // DependencyGrammar dgBLIPP = null;
    Lexicon lex = null;
    // extract grammars
    Extractor bgExtractor = new BinaryGrammarExtractor();
    // Extractor bgExtractor = new SmoothedBinaryGrammarExtractor();//new BinaryGrammarExtractor();
    // Extractor lexExtractor = new LexiconExtractor();

    // Extractor dgExtractor = new DependencyMemGrammarExtractor();

    Extractor dgExtractor = new MLEDependencyGrammarExtractor(op);
    if (op.doPCFG) {
      System.err.print("Extracting PCFG...");
      Pair bgug = null;
      if (Train.cheatPCFG) {
        List allTrees = new ArrayList(binaryTrainTrees);
        allTrees.addAll(binaryTestTrees);
        bgug = (Pair) bgExtractor.extract(allTrees);
      } else {
        bgug = (Pair) bgExtractor.extract(binaryTrainTrees);
      }
      bg = (BinaryGrammar) bgug.second;
      bg.splitRules();
      ug = (UnaryGrammar) bgug.first;
      ug.purgeRules();
      Timing.tick("done.");
    }
    System.err.print("Extracting Lexicon...");
    lex = op.tlpParams.lex(op.lexOptions);
    lex.train(binaryTrainTrees);
    Timing.tick("done.");

    if (op.doDep) {
      System.err.print("Extracting Dependencies...");
      binaryTrainTrees.clear();
      // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new
      // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new
      // TransformTreeDependency(tlpParams,true));

      DependencyGrammar dg1 =
          (DependencyGrammar)
              dgExtractor.extract(
                  trainTreebank.iterator(), new TransformTreeDependency(op.tlpParams, true));
      // dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new
      // TransformTreeDependency(tlpParams));

      // dg = (DependencyGrammar) dgExtractor.extract(new
      // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new
      // TransformTreeDependency(tlpParams));
      // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2);
      // dg = (DependencyGrammar) dgExtractor.extract(binaryTrainTrees); //uses information whether
      // the words are known or not, discards unknown words
      Timing.tick("done.");
      // System.out.print("Extracting Unknown Word Model...");
      // UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees);
      // Timing.tick("done.");
      System.out.print("Tuning Dependency Model...");
      dg.tune(binaryTestTrees);
      // System.out.println("TUNE DEPS: "+tuneDeps);
      Timing.tick("done.");
    }

    BinaryGrammar boundBG = bg;
    UnaryGrammar boundUG = ug;

    GrammarProjection gp = new NullGrammarProjection(bg, ug);

    // serialization
    if (serializeFile != null) {
      System.err.print("Serializing parser...");
      LexicalizedParser.saveParserDataToSerialized(
          new ParserData(lex, bg, ug, dg, Numberer.getNumberers(), op), serializeFile);
      Timing.tick("done.");
    }

    // test: pcfg-parse and output

    ExhaustivePCFGParser parser = null;
    if (op.doPCFG) {
      parser = new ExhaustivePCFGParser(boundBG, boundUG, lex, op);
    }

    ExhaustiveDependencyParser dparser =
        ((op.doDep && !Test.useFastFactored) ? new ExhaustiveDependencyParser(dg, lex, op) : null);

    Scorer scorer = (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser, gp), dparser) : null);
    // Scorer scorer = parser;
    BiLexPCFGParser bparser = null;
    if (op.doPCFG && op.doDep) {
      bparser =
          (Test.useN5)
              ? new BiLexPCFGParser.N5BiLexPCFGParser(
                  scorer, parser, dparser, bg, ug, dg, lex, op, gp)
              : new BiLexPCFGParser(scorer, parser, dparser, bg, ug, dg, lex, op, gp);
    }

    LabeledConstituentEval pcfgPE = new LabeledConstituentEval("pcfg  PE", true, tlp);
    LabeledConstituentEval comboPE = new LabeledConstituentEval("combo PE", true, tlp);
    AbstractEval pcfgCB = new LabeledConstituentEval.CBEval("pcfg  CB", true, tlp);

    AbstractEval pcfgTE = new AbstractEval.TaggingEval("pcfg  TE");
    AbstractEval comboTE = new AbstractEval.TaggingEval("combo TE");
    AbstractEval pcfgTEnoPunct = new AbstractEval.TaggingEval("pcfg nopunct TE");
    AbstractEval comboTEnoPunct = new AbstractEval.TaggingEval("combo nopunct TE");
    AbstractEval depTE = new AbstractEval.TaggingEval("depnd TE");

    AbstractEval depDE =
        new AbstractEval.DependencyEval("depnd DE", true, tlp.punctuationWordAcceptFilter());
    AbstractEval comboDE =
        new AbstractEval.DependencyEval("combo DE", true, tlp.punctuationWordAcceptFilter());

    if (Test.evalb) {
      EvalB.initEVALBfiles(op.tlpParams);
    }

    // int[] countByLength = new int[Test.maxLength+1];

    // use a reflection ruse, so one can run this without needing the tagger
    // edu.stanford.nlp.process.SentenceTagger tagger = (Test.preTag ? new
    // edu.stanford.nlp.process.SentenceTagger("/u/nlp/data/tagger.params/wsj0-21.holder") : null);
    SentenceProcessor tagger = null;
    if (Test.preTag) {
      try {
        Class[] argsClass = new Class[] {String.class};
        Object[] arguments =
            new Object[] {"/u/nlp/data/pos-tagger/wsj3t0-18-bidirectional/train-wsj-0-18.holder"};
        tagger =
            (SentenceProcessor)
                Class.forName("edu.stanford.nlp.tagger.maxent.MaxentTagger")
                    .getConstructor(argsClass)
                    .newInstance(arguments);
      } catch (Exception e) {
        System.err.println(e);
        System.err.println("Warning: No pretagging of sentences will be done.");
      }
    }

    for (int tNum = 0, ttSize = testTreebank.size(); tNum < ttSize; tNum++) {
      Tree tree = testTreebank.get(tNum);
      int testTreeLen = tree.yield().size();
      if (testTreeLen > Test.maxLength) {
        continue;
      }
      Tree binaryTree = binaryTestTrees.get(tNum);
      // countByLength[testTreeLen]++;
      System.out.println("-------------------------------------");
      System.out.println("Number: " + (tNum + 1));
      System.out.println("Length: " + testTreeLen);

      // tree.pennPrint(pw);
      // System.out.println("XXXX The binary tree is");
      // binaryTree.pennPrint(pw);
      // System.out.println("Here are the tags in the lexicon:");
      // System.out.println(lex.showTags());
      // System.out.println("Here's the tagnumberer:");
      // System.out.println(Numberer.getGlobalNumberer("tags").toString());

      long timeMil1 = System.currentTimeMillis();
      Timing.tick("Starting parse.");
      if (op.doPCFG) {
        // System.err.println(Test.forceTags);
        if (Test.forceTags) {
          if (tagger != null) {
            // System.out.println("Using a tagger to set tags");
            // System.out.println("Tagged sentence as: " +
            // tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false));
            parser.parse(addLast(tagger.processSentence(cutLast(wordify(binaryTree.yield())))));
          } else {
            // System.out.println("Forcing tags to match input.");
            parser.parse(cleanTags(binaryTree.taggedYield(), tlp));
          }
        } else {
          // System.out.println("XXXX Parsing " + binaryTree.yield());
          parser.parse(binaryTree.yield());
        }
        // Timing.tick("Done with pcfg phase.");
      }
      if (op.doDep) {
        dparser.parse(binaryTree.yield());
        // Timing.tick("Done with dependency phase.");
      }
      boolean bothPassed = false;
      if (op.doPCFG && op.doDep) {
        bothPassed = bparser.parse(binaryTree.yield());
        // Timing.tick("Done with combination phase.");
      }
      long timeMil2 = System.currentTimeMillis();
      long elapsed = timeMil2 - timeMil1;
      System.err.println("Time: " + ((int) (elapsed / 100)) / 10.00 + " sec.");
      // System.out.println("PCFG Best Parse:");
      Tree tree2b = null;
      Tree tree2 = null;
      // System.out.println("Got full best parse...");
      if (op.doPCFG) {
        tree2b = parser.getBestParse();
        tree2 = debinarizer.transformTree(tree2b);
      }
      // System.out.println("Debinarized parse...");
      // tree2.pennPrint();
      // System.out.println("DepG Best Parse:");
      Tree tree3 = null;
      Tree tree3db = null;
      if (op.doDep) {
        tree3 = dparser.getBestParse();
        // was: but wrong Tree tree3db = debinarizer.transformTree(tree2);
        tree3db = debinarizer.transformTree(tree3);
        tree3.pennPrint(pw);
      }
      // tree.pennPrint();
      // ((Tree)binaryTrainTrees.get(tNum)).pennPrint();
      // System.out.println("Combo Best Parse:");
      Tree tree4 = null;
      if (op.doPCFG && op.doDep) {
        try {
          tree4 = bparser.getBestParse();
          if (tree4 == null) {
            tree4 = tree2b;
          }
        } catch (NullPointerException e) {
          System.err.println("Blocked, using PCFG parse!");
          tree4 = tree2b;
        }
      }
      if (op.doPCFG && !bothPassed) {
        tree4 = tree2b;
      }
      // tree4.pennPrint();
      if (op.doDep) {
        depDE.evaluate(tree3, binaryTree, pw);
        depTE.evaluate(tree3db, tree, pw);
      }
      TreeTransformer tc = op.tlpParams.collinizer();
      TreeTransformer tcEvalb = op.tlpParams.collinizerEvalb();
      Tree tree4b = null;
      if (op.doPCFG) {
        // System.out.println("XXXX Best PCFG was: ");
        // tree2.pennPrint();
        // System.out.println("XXXX Transformed best PCFG is: ");
        // tc.transformTree(tree2).pennPrint();
        // System.out.println("True Best Parse:");
        // tree.pennPrint();
        // tc.transformTree(tree).pennPrint();
        pcfgPE.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
        pcfgCB.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
        if (op.doDep) {
          comboDE.evaluate((bothPassed ? tree4 : tree3), binaryTree, pw);
          tree4b = tree4;
          tree4 = debinarizer.transformTree(tree4);
          if (op.nodePrune) {
            NodePruner np = new NodePruner(parser, debinarizer);
            tree4 = np.prune(tree4);
          }
          // tree4.pennPrint();
          comboPE.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
        }
        // pcfgTE.evaluate(tree2, tree);
        pcfgTE.evaluate(tcEvalb.transformTree(tree2), tcEvalb.transformTree(tree), pw);
        pcfgTEnoPunct.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);

        if (op.doDep) {
          comboTE.evaluate(tcEvalb.transformTree(tree4), tcEvalb.transformTree(tree), pw);
          comboTEnoPunct.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
        }
        System.out.println("PCFG only: " + parser.scoreBinarizedTree(tree2b, 0));

        // tc.transformTree(tree2).pennPrint();
        tree2.pennPrint(pw);

        if (op.doDep) {
          System.out.println("Combo: " + parser.scoreBinarizedTree(tree4b, 0));
          // tc.transformTree(tree4).pennPrint(pw);
          tree4.pennPrint(pw);
        }
        System.out.println("Correct:" + parser.scoreBinarizedTree(binaryTree, 0));
        /*
        if (parser.scoreBinarizedTree(tree2b,true) < parser.scoreBinarizedTree(binaryTree,true)) {
          System.out.println("SCORE INVERSION");
          parser.validateBinarizedTree(binaryTree,0);
        }
        */
        tree.pennPrint(pw);
      } // end if doPCFG

      if (Test.evalb) {
        if (op.doPCFG && op.doDep) {
          EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree4));
        } else if (op.doPCFG) {
          EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree2));
        } else if (op.doDep) {
          EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree3db));
        }
      }
    } // end for each tree in test treebank

    if (Test.evalb) {
      EvalB.closeEVALBfiles();
    }

    // Test.display();
    if (op.doPCFG) {
      pcfgPE.display(false, pw);
      System.out.println("Grammar size: " + Numberer.getGlobalNumberer("states").total());
      pcfgCB.display(false, pw);
      if (op.doDep) {
        comboPE.display(false, pw);
      }
      pcfgTE.display(false, pw);
      pcfgTEnoPunct.display(false, pw);
      if (op.doDep) {
        comboTE.display(false, pw);
        comboTEnoPunct.display(false, pw);
      }
    }
    if (op.doDep) {
      depTE.display(false, pw);
      depDE.display(false, pw);
    }
    if (op.doPCFG && op.doDep) {
      comboDE.display(false, pw);
    }
    // pcfgPE.printGoodBad();
  }

Example #3

Show file

File: TregexPattern.java Project: hans/CoreNLP

  /**
   * Prints out all matches of a tree pattern on each tree in the path. Usage: <br>
   * <br>
   * <code>
   * java edu.stanford.nlp.trees.tregex.TregexPattern [[-TCwfosnu] [-filter] [-h &lt;node-name&gt;]]* pattern
   *  filepath   </code>
   *
   * <p>Arguments:<br>
   *
   * <ul>
   *   <li><code>pattern</code>: the tree pattern which optionally names some set of nodes (i.e.,
   *       gives it the "handle") <code>=name</code> (for some arbitrary string "name")
   *   <li><code>filepath</code>: the path to files with trees. If this is a directory, there will
   *       be recursive descent and the pattern will be run on all files beneath the specified
   *       directory.
   * </ul>
   *
   * <p>Options:<br>
   * <li><code>-C</code> suppresses printing of matches, so only the number of matches is printed.
   * <li><code>-w</code> causes the whole of a tree that matches to be printed.
   * <li><code>-f</code> causes the filename to be printed.
   * <li><code>-i &lt;filename&gt;</code> causes the pattern to be matched to be read from <code>
   *     &lt;filename&gt;</code> rather than the command line. Don't specify a pattern when this
   *     option is used.
   * <li><code>-o</code> Specifies that each tree node can be reported only once as the root of a
   *     match (by default a node will be printed once for every <em>way</em> the pattern matches).
   * <li><code>-s</code> causes trees to be printed all on one line (by default they are pretty
   *     printed).
   * <li><code>-n</code> causes the number of the tree in which the match was found to be printed
   *     before every match.
   * <li><code>-u</code> causes only the label of each matching node to be printed, not complete
   *     subtrees.
   * <li><code>-t</code> causes only the yield (terminal words) of the selected node to be printed
   *     (or the yield of the whole tree, if the <code>-w</code> option is used).
   * <li><code>-encoding &lt;charset_encoding&gt;</code> option allows specification of character
   *     encoding of trees..
   * <li><code>-h &lt;node-handle&gt;</code> If a <code>-h</code> option is given, the root tree
   *     node will not be printed. Instead, for each <code>node-handle</code> specified, the node
   *     matched and given that handle will be printed. Multiple nodes can be printed by using the
   *     <code>-h</code> option multiple times on a single command line.
   * <li><code>-hf &lt;headfinder-class-name&gt;</code> use the specified {@link HeadFinder} class
   *     to determine headship relations.
   * <li><code>-hfArg &lt;string&gt;</code> pass a string argument in to the {@link HeadFinder}
   *     class's constructor. <code>-hfArg</code> can be used multiple times to pass in multiple
   *     arguments.
   * <li><code>-trf &lt;TreeReaderFactory-class-name&gt;</code> use the specified {@link
   *     TreeReaderFactory} class to read trees from files.
   * <li><code>-v</code> print every tree that contains no matches of the specified pattern, but
   *     print no matches to the pattern.
   * <li><code>-x</code> Instead of the matched subtree, print the matched subtree's identifying
   *     number as defined in <tt>tgrep2</tt>:a unique identifier for the subtree and is in the form
   *     s:n, where s is an integer specifying the sentence number in the corpus (starting with 1),
   *     and n is an integer giving the order in which the node is encountered in a depth-first
   *     search starting with 1 at top node in the sentence tree.
   * <li><code>-extract &lt;code&gt; &lt;tree-file&gt;</code> extracts the subtree s:n specified by
   *     <tt>code</tt> from the specified <tt>tree-file</tt>. Overrides all other behavior of
   *     tregex. Can't specify multiple encodings etc. yet.
   * <li><code>-extractFile &lt;code-file&gt; &lt;tree-file&gt;</code> extracts every subtree
   *     specified by the subtree codes in <tt>code-file</tt>, which must appear exactly one per
   *     line, from the specified <tt>tree-file</tt>. Overrides all other behavior of tregex. Can't
   *     specify multiple encodings etc. yet.
   * <li><code>-filter</code> causes this to act as a filter, reading tree input from stdin
   * <li><code>-T</code> causes all trees to be printed as processed (for debugging purposes).
   *     Otherwise only matching nodes are printed.
   * <li><code>-macros &lt;filename&gt;</code> filename with macro substitutions to use. file with
   *     tab separated lines original-tab-replacement
   * </ul>
   */
  public static void main(String[] args) throws IOException {
    Timing.startTime();

    StringBuilder treePrintFormats = new StringBuilder();
    String printNonMatchingTreesOption = "-v";
    String subtreeCodeOption = "-x";
    String extractSubtreesOption = "-extract";
    String extractSubtreesFileOption = "-extractFile";
    String inputFileOption = "-i";
    String headFinderOption = "-hf";
    String headFinderArgOption = "-hfArg";
    String trfOption = "-trf";
    String headFinderClassName = null;
    String[] headFinderArgs = StringUtils.EMPTY_STRING_ARRAY;
    String treeReaderFactoryClassName = null;
    String printHandleOption = "-h";
    String markHandleOption = "-k";
    String encodingOption = "-encoding";
    String encoding = "UTF-8";
    String macroOption = "-macros";
    String macroFilename = "";
    String yieldOnly = "-t";
    String printAllTrees = "-T";
    String quietMode = "-C";
    String wholeTreeMode = "-w";
    String filenameOption = "-f";
    String oneMatchPerRootNodeMode = "-o";
    String reportTreeNumbers = "-n";
    String rootLabelOnly = "-u";
    String oneLine = "-s";
    Map<String, Integer> flagMap = Generics.newHashMap();
    flagMap.put(extractSubtreesOption, 2);
    flagMap.put(extractSubtreesFileOption, 2);
    flagMap.put(subtreeCodeOption, 0);
    flagMap.put(printNonMatchingTreesOption, 0);
    flagMap.put(encodingOption, 1);
    flagMap.put(inputFileOption, 1);
    flagMap.put(printHandleOption, 1);
    flagMap.put(markHandleOption, 2);
    flagMap.put(headFinderOption, 1);
    flagMap.put(headFinderArgOption, 1);
    flagMap.put(trfOption, 1);
    flagMap.put(macroOption, 1);
    flagMap.put(yieldOnly, 0);
    flagMap.put(quietMode, 0);
    flagMap.put(wholeTreeMode, 0);
    flagMap.put(printAllTrees, 0);
    flagMap.put(filenameOption, 0);
    flagMap.put(oneMatchPerRootNodeMode, 0);
    flagMap.put(reportTreeNumbers, 0);
    flagMap.put(rootLabelOnly, 0);
    flagMap.put(oneLine, 0);
    Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap);
    args = argsMap.get(null);

    if (argsMap.containsKey(encodingOption)) {
      encoding = argsMap.get(encodingOption)[0];
      System.err.println("Encoding set to " + encoding);
    }
    PrintWriter errPW = new PrintWriter(new OutputStreamWriter(System.err, encoding), true);

    if (argsMap.containsKey(extractSubtreesOption)) {
      List<String> subTreeStrings =
          Collections.singletonList(argsMap.get(extractSubtreesOption)[0]);
      extractSubtrees(subTreeStrings, argsMap.get(extractSubtreesOption)[1]);
      return;
    }
    if (argsMap.containsKey(extractSubtreesFileOption)) {
      List<String> subTreeStrings =
          Arrays.asList(
              IOUtils.slurpFile(argsMap.get(extractSubtreesFileOption)[0]).split("\n|\r|\n\r"));
      extractSubtrees(subTreeStrings, argsMap.get(extractSubtreesFileOption)[0]);
      return;
    }

    if (args.length < 1) {
      errPW.println(
          "Usage: java edu.stanford.nlp.trees.tregex.TregexPattern [-T] [-C] [-w] [-f] [-o] [-n] [-s] [-filter]  [-hf class] [-trf class] [-h handle]* pattern [filepath]");
      return;
    }
    String matchString = args[0];

    if (argsMap.containsKey(macroOption)) {
      macroFilename = argsMap.get(macroOption)[0];
    }
    if (argsMap.containsKey(headFinderOption)) {
      headFinderClassName = argsMap.get(headFinderOption)[0];
      errPW.println("Using head finder " + headFinderClassName + "...");
    }
    if (argsMap.containsKey(headFinderArgOption)) {
      headFinderArgs = argsMap.get(headFinderArgOption);
    }
    if (argsMap.containsKey(trfOption)) {
      treeReaderFactoryClassName = argsMap.get(trfOption)[0];
      errPW.println("Using tree reader factory " + treeReaderFactoryClassName + "...");
    }
    if (argsMap.containsKey(printAllTrees)) {
      TRegexTreeVisitor.printTree = true;
    }
    if (argsMap.containsKey(inputFileOption)) {
      String inputFile = argsMap.get(inputFileOption)[0];
      matchString = IOUtils.slurpFile(inputFile, encoding);
      String[] newArgs = new String[args.length + 1];
      System.arraycopy(args, 0, newArgs, 1, args.length);
      args = newArgs;
    }
    if (argsMap.containsKey(quietMode)) {
      TRegexTreeVisitor.printMatches = false;
      TRegexTreeVisitor.printNumMatchesToStdOut = true;
    }
    if (argsMap.containsKey(printNonMatchingTreesOption)) {
      TRegexTreeVisitor.printNonMatchingTrees = true;
    }
    if (argsMap.containsKey(subtreeCodeOption)) {
      TRegexTreeVisitor.printSubtreeCode = true;
      TRegexTreeVisitor.printMatches = false;
    }
    if (argsMap.containsKey(wholeTreeMode)) {
      TRegexTreeVisitor.printWholeTree = true;
    }
    if (argsMap.containsKey(filenameOption)) {
      TRegexTreeVisitor.printFilename = true;
    }
    if (argsMap.containsKey(oneMatchPerRootNodeMode)) TRegexTreeVisitor.oneMatchPerRootNode = true;
    if (argsMap.containsKey(reportTreeNumbers)) TRegexTreeVisitor.reportTreeNumbers = true;
    if (argsMap.containsKey(rootLabelOnly)) {
      treePrintFormats.append(TreePrint.rootLabelOnlyFormat).append(',');
    } else if (argsMap.containsKey(oneLine)) { // display short form
      treePrintFormats.append("oneline,");
    } else if (argsMap.containsKey(yieldOnly)) {
      treePrintFormats.append("words,");
    } else {
      treePrintFormats.append("penn,");
    }

    HeadFinder hf = new CollinsHeadFinder();
    if (headFinderClassName != null) {
      Class[] hfArgClasses = new Class[headFinderArgs.length];
      for (int i = 0; i < hfArgClasses.length; i++) hfArgClasses[i] = String.class;
      try {
        hf =
            (HeadFinder)
                Class.forName(headFinderClassName)
                    .getConstructor(hfArgClasses)
                    .newInstance(
                        (Object[])
                            headFinderArgs); // cast to Object[] necessary to avoid varargs-related
        // warning.
      } catch (Exception e) {
        throw new RuntimeException("Error occurred while constructing HeadFinder: " + e);
      }
    }

    TRegexTreeVisitor.tp =
        new TreePrint(treePrintFormats.toString(), new PennTreebankLanguagePack());

    try {
      // TreePattern p = TreePattern.compile("/^S/ > S=dt $++ '' $-- ``");
      TregexPatternCompiler tpc = new TregexPatternCompiler(hf);
      Macros.addAllMacros(tpc, macroFilename, encoding);
      TregexPattern p = tpc.compile(matchString);
      errPW.println("Pattern string:\n" + p.pattern());
      errPW.println("Parsed representation:");
      p.prettyPrint(errPW);

      String[] handles = argsMap.get(printHandleOption);
      if (argsMap.containsKey("-filter")) {
        TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName);
        treebank =
            new MemoryTreebank(
                trf, encoding); // has to be in memory since we're not storing it on disk
        // read from stdin
        Reader reader = new BufferedReader(new InputStreamReader(System.in, encoding));
        ((MemoryTreebank) treebank).load(reader);
        reader.close();
      } else if (args.length == 1) {
        errPW.println("using default tree");
        TreeReader r =
            new PennTreeReader(
                new StringReader(
                    "(VP (VP (VBZ Try) (NP (NP (DT this) (NN wine)) (CC and) (NP (DT these) (NNS snails)))) (PUNCT .))"),
                new LabeledScoredTreeFactory(new StringLabelFactory()));
        Tree t = r.readTree();
        treebank = new MemoryTreebank();
        treebank.add(t);
      } else {
        int last = args.length - 1;
        errPW.println("Reading trees from file(s) " + args[last]);
        TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName);
        treebank = new DiskTreebank(trf, encoding);
        treebank.loadPath(args[last], null, true);
      }
      TRegexTreeVisitor vis = new TRegexTreeVisitor(p, handles, encoding);

      treebank.apply(vis);
      Timing.endTime();
      if (TRegexTreeVisitor.printMatches) {
        errPW.println("There were " + vis.numMatches() + " matches in total.");
      }
      if (TRegexTreeVisitor.printNumMatchesToStdOut) {
        System.out.println(vis.numMatches());
      }
    } catch (IOException e) {
      e.printStackTrace();
    } catch (TregexParseException e) {
      errPW.println("Error parsing expression: " + args[0]);
      errPW.println("Parse exception: " + e.toString());
    }
  }