コード例 #1
0
  public static void main(String[] args) {

    QPTreeTransformer transformer = new QPTreeTransformer();
    Treebank tb = new MemoryTreebank();
    Properties props = StringUtils.argsToProperties(args);
    String treeFileName = props.getProperty("treeFile");

    if (treeFileName != null) {
      try {
        TreeReader tr =
            new PennTreeReader(
                new BufferedReader(new InputStreamReader(new FileInputStream(treeFileName))),
                new LabeledScoredTreeFactory());
        Tree t;
        while ((t = tr.readTree()) != null) {
          tb.add(t);
        }
      } catch (IOException e) {
        throw new RuntimeException("File problem: " + e);
      }
    }

    for (Tree t : tb) {
      System.out.println("Original tree");
      t.pennPrint();
      System.out.println();
      System.out.println("Tree transformed");
      Tree tree = transformer.transformTree(t);
      tree.pennPrint();
      System.out.println();
      System.out.println("----------------------------");
    }
  }
コード例 #2
0
 /**
  * Load a collection of parse trees from a Reader. Each tree may optionally be encased in parens
  * to allow for Penn Treebank style trees.
  *
  * @param r The reader to read trees from. (If you want it buffered, you should already have
  *     buffered it!)
  * @param id An ID for where these files come from (arbitrary, but something like a filename. Can
  *     be <code>null</code> for none.
  */
 public void load(Reader r, String id) {
   try {
     // could throw an IO exception?
     TreeReader tr = treeReaderFactory().newTreeReader(r);
     int sentIndex = 0;
     for (Tree pt; (pt = tr.readTree()) != null; ) {
       if (pt.label() instanceof HasIndex) { // so we can trace where this tree came from
         HasIndex hi = (HasIndex) pt.label();
         if (id != null) {
           hi.setDocID(id);
         }
         hi.setSentIndex(sentIndex);
       }
       parseTrees.add(pt);
       sentIndex++;
     }
   } catch (IOException e) {
     System.err.println("load IO Exception: " + e);
   }
 }
コード例 #3
0
  /**
   * Load a collection of parse trees from the file of given name. Each tree may optionally be
   * encased in parens to allow for Penn Treebank style trees. This methods implements the <code>
   * FileProcessor</code> interface.
   *
   * @param file file to load a tree from
   */
  public void processFile(File file) {
    TreeReader tr = null;

    // SRL stuff
    CollectionValuedMap<Integer, String> srlMap = null;
    if (this.srlMap != null) {
      // there must be a better way ...
      String filename = file.getAbsolutePath();
      for (String suffix : this.srlMap.keySet()) {
        if (filename.endsWith(suffix)) {
          srlMap = this.srlMap.get(suffix);
          break;
        }
      }
      if (srlMap == null) {
        System.err.println("could not find SRL entries for file: " + file);
      }
    }

    try {
      // maybe print file name to stdout to get some feedback
      if (PRINT_FILENAMES) {
        System.err.println(file);
      }
      // could throw an IO exception if can't open for reading
      tr =
          treeReaderFactory()
              .newTreeReader(
                  new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding())));
      int sentIndex = 0;
      Tree pt;
      while ((pt = tr.readTree()) != null) {
        if (pt.label() instanceof HasIndex) { // so we can trace where this tree came from
          HasIndex hi = (HasIndex) pt.label();
          hi.setDocID(file.getName());
          hi.setSentIndex(sentIndex);
        }
        if (srlMap == null) {
          parseTrees.add(pt);
        } else {
          Collection<String> srls = srlMap.get(sentIndex);
          //           pt.pennPrint();
          //           System.err.println(srls);
          parseTrees.add(pt);
          if (srls.isEmpty()) {
            //            parseTrees.add(pt);
          } else {
            for (String srl : srls) {
              //              Tree t = pt.deepCopy();
              String[] bits = srl.split("\\s+");
              int verbIndex = Integer.parseInt(bits[0]);
              String lemma = bits[2].split("\\.")[0];
              //              Tree verb = Trees.getTerminal(t, verbIndex);
              Tree verb = Trees.getTerminal(pt, verbIndex);
              //              ((CoreLabel)verb.label()).set(SRLIDAnnotation.class, SRL_ID.REL);
              ((CoreLabel) verb.label()).set(CoreAnnotations.CoNLLPredicateAnnotation.class, true);
              for (int i = 4; i < bits.length; i++) {
                String arg = bits[i];
                String[] bits1;
                if (arg.indexOf("ARGM") >= 0) {
                  bits1 = arg.split("-");
                } else {
                  bits1 = arg.split("-");
                }
                String locs = bits1[0];
                String argType = bits1[1];
                if (argType.equals("rel")) {
                  continue;
                }
                for (String loc : locs.split("[*,]")) {
                  bits1 = loc.split(":");
                  int term = Integer.parseInt(bits1[0]);
                  int height = Integer.parseInt(bits1[1]);
                  //                  Tree t1 = Trees.getPreTerminal(t, term);
                  Tree t1 = Trees.getPreTerminal(pt, term);
                  for (int j = 0; j < height; j++) {
                    //                    t1 = t1.parent(t);
                    t1 = t1.parent(pt);
                  }
                  Map<Integer, String> roleMap =
                      ((CoreLabel) t1.label()).get(CoreAnnotations.CoNLLSRLAnnotation.class);
                  if (roleMap == null) {
                    roleMap = new HashMap<Integer, String>();
                    ((CoreLabel) t1.label()).set(CoreAnnotations.CoNLLSRLAnnotation.class, roleMap);
                  }
                  roleMap.put(verbIndex, argType);
                  //                  ((CoreLabel)t1.label()).set(SRLIDAnnotation.class,
                  // SRL_ID.ARG);
                }
              }
              //               for (Tree t1 : t) {
              //                 if (t1.isLeaf()) { continue; }
              //                 CoreLabel fl = (CoreLabel)t1.label();
              //                 if (fl.value() == null) { continue; }
              //                 if (!fl.has(SRLIDAnnotation.class)) {
              //                   boolean allNone = true;
              //                   for (Tree t2 : t1) {
              //                     SRL_ID s = ((CoreLabel)t2.label()).get(SRLIDAnnotation.class);
              //                     if (s == SRL_ID.ARG || s == SRL_ID.REL) {
              //                       allNone = false;
              //                       break;
              //                     }
              //                   }
              //                   if (allNone) {
              //                     fl.set(SRLIDAnnotation.class, SRL_ID.ALL_NO);
              //                   } else {
              //                     fl.set(SRLIDAnnotation.class, SRL_ID.NO);
              //                   }
              //                 }
              //               }
              //              parseTrees.add(t);
            }
          }
        }

        sentIndex++;
      }
    } catch (IOException e) {
      System.err.println("loadTree IO Exception: " + e + " in file " + file);
    } finally {
      try {
        if (tr != null) {
          tr.close(); // important: closes file even if error!
        }
      } catch (IOException e) {
        // do nothin'
      }
    }
  }
コード例 #4
0
ファイル: TregexPattern.java プロジェクト: hans/CoreNLP
  /**
   * Prints out all matches of a tree pattern on each tree in the path. Usage: <br>
   * <br>
   * <code>
   * java edu.stanford.nlp.trees.tregex.TregexPattern [[-TCwfosnu] [-filter] [-h &lt;node-name&gt;]]* pattern
   *  filepath   </code>
   *
   * <p>Arguments:<br>
   *
   * <ul>
   *   <li><code>pattern</code>: the tree pattern which optionally names some set of nodes (i.e.,
   *       gives it the "handle") <code>=name</code> (for some arbitrary string "name")
   *   <li><code>filepath</code>: the path to files with trees. If this is a directory, there will
   *       be recursive descent and the pattern will be run on all files beneath the specified
   *       directory.
   * </ul>
   *
   * <p>Options:<br>
   * <li><code>-C</code> suppresses printing of matches, so only the number of matches is printed.
   * <li><code>-w</code> causes the whole of a tree that matches to be printed.
   * <li><code>-f</code> causes the filename to be printed.
   * <li><code>-i &lt;filename&gt;</code> causes the pattern to be matched to be read from <code>
   *     &lt;filename&gt;</code> rather than the command line. Don't specify a pattern when this
   *     option is used.
   * <li><code>-o</code> Specifies that each tree node can be reported only once as the root of a
   *     match (by default a node will be printed once for every <em>way</em> the pattern matches).
   * <li><code>-s</code> causes trees to be printed all on one line (by default they are pretty
   *     printed).
   * <li><code>-n</code> causes the number of the tree in which the match was found to be printed
   *     before every match.
   * <li><code>-u</code> causes only the label of each matching node to be printed, not complete
   *     subtrees.
   * <li><code>-t</code> causes only the yield (terminal words) of the selected node to be printed
   *     (or the yield of the whole tree, if the <code>-w</code> option is used).
   * <li><code>-encoding &lt;charset_encoding&gt;</code> option allows specification of character
   *     encoding of trees..
   * <li><code>-h &lt;node-handle&gt;</code> If a <code>-h</code> option is given, the root tree
   *     node will not be printed. Instead, for each <code>node-handle</code> specified, the node
   *     matched and given that handle will be printed. Multiple nodes can be printed by using the
   *     <code>-h</code> option multiple times on a single command line.
   * <li><code>-hf &lt;headfinder-class-name&gt;</code> use the specified {@link HeadFinder} class
   *     to determine headship relations.
   * <li><code>-hfArg &lt;string&gt;</code> pass a string argument in to the {@link HeadFinder}
   *     class's constructor. <code>-hfArg</code> can be used multiple times to pass in multiple
   *     arguments.
   * <li><code>-trf &lt;TreeReaderFactory-class-name&gt;</code> use the specified {@link
   *     TreeReaderFactory} class to read trees from files.
   * <li><code>-v</code> print every tree that contains no matches of the specified pattern, but
   *     print no matches to the pattern.
   * <li><code>-x</code> Instead of the matched subtree, print the matched subtree's identifying
   *     number as defined in <tt>tgrep2</tt>:a unique identifier for the subtree and is in the form
   *     s:n, where s is an integer specifying the sentence number in the corpus (starting with 1),
   *     and n is an integer giving the order in which the node is encountered in a depth-first
   *     search starting with 1 at top node in the sentence tree.
   * <li><code>-extract &lt;code&gt; &lt;tree-file&gt;</code> extracts the subtree s:n specified by
   *     <tt>code</tt> from the specified <tt>tree-file</tt>. Overrides all other behavior of
   *     tregex. Can't specify multiple encodings etc. yet.
   * <li><code>-extractFile &lt;code-file&gt; &lt;tree-file&gt;</code> extracts every subtree
   *     specified by the subtree codes in <tt>code-file</tt>, which must appear exactly one per
   *     line, from the specified <tt>tree-file</tt>. Overrides all other behavior of tregex. Can't
   *     specify multiple encodings etc. yet.
   * <li><code>-filter</code> causes this to act as a filter, reading tree input from stdin
   * <li><code>-T</code> causes all trees to be printed as processed (for debugging purposes).
   *     Otherwise only matching nodes are printed.
   * <li><code>-macros &lt;filename&gt;</code> filename with macro substitutions to use. file with
   *     tab separated lines original-tab-replacement
   * </ul>
   */
  public static void main(String[] args) throws IOException {
    Timing.startTime();

    StringBuilder treePrintFormats = new StringBuilder();
    String printNonMatchingTreesOption = "-v";
    String subtreeCodeOption = "-x";
    String extractSubtreesOption = "-extract";
    String extractSubtreesFileOption = "-extractFile";
    String inputFileOption = "-i";
    String headFinderOption = "-hf";
    String headFinderArgOption = "-hfArg";
    String trfOption = "-trf";
    String headFinderClassName = null;
    String[] headFinderArgs = StringUtils.EMPTY_STRING_ARRAY;
    String treeReaderFactoryClassName = null;
    String printHandleOption = "-h";
    String markHandleOption = "-k";
    String encodingOption = "-encoding";
    String encoding = "UTF-8";
    String macroOption = "-macros";
    String macroFilename = "";
    String yieldOnly = "-t";
    String printAllTrees = "-T";
    String quietMode = "-C";
    String wholeTreeMode = "-w";
    String filenameOption = "-f";
    String oneMatchPerRootNodeMode = "-o";
    String reportTreeNumbers = "-n";
    String rootLabelOnly = "-u";
    String oneLine = "-s";
    Map<String, Integer> flagMap = Generics.newHashMap();
    flagMap.put(extractSubtreesOption, 2);
    flagMap.put(extractSubtreesFileOption, 2);
    flagMap.put(subtreeCodeOption, 0);
    flagMap.put(printNonMatchingTreesOption, 0);
    flagMap.put(encodingOption, 1);
    flagMap.put(inputFileOption, 1);
    flagMap.put(printHandleOption, 1);
    flagMap.put(markHandleOption, 2);
    flagMap.put(headFinderOption, 1);
    flagMap.put(headFinderArgOption, 1);
    flagMap.put(trfOption, 1);
    flagMap.put(macroOption, 1);
    flagMap.put(yieldOnly, 0);
    flagMap.put(quietMode, 0);
    flagMap.put(wholeTreeMode, 0);
    flagMap.put(printAllTrees, 0);
    flagMap.put(filenameOption, 0);
    flagMap.put(oneMatchPerRootNodeMode, 0);
    flagMap.put(reportTreeNumbers, 0);
    flagMap.put(rootLabelOnly, 0);
    flagMap.put(oneLine, 0);
    Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap);
    args = argsMap.get(null);

    if (argsMap.containsKey(encodingOption)) {
      encoding = argsMap.get(encodingOption)[0];
      System.err.println("Encoding set to " + encoding);
    }
    PrintWriter errPW = new PrintWriter(new OutputStreamWriter(System.err, encoding), true);

    if (argsMap.containsKey(extractSubtreesOption)) {
      List<String> subTreeStrings =
          Collections.singletonList(argsMap.get(extractSubtreesOption)[0]);
      extractSubtrees(subTreeStrings, argsMap.get(extractSubtreesOption)[1]);
      return;
    }
    if (argsMap.containsKey(extractSubtreesFileOption)) {
      List<String> subTreeStrings =
          Arrays.asList(
              IOUtils.slurpFile(argsMap.get(extractSubtreesFileOption)[0]).split("\n|\r|\n\r"));
      extractSubtrees(subTreeStrings, argsMap.get(extractSubtreesFileOption)[0]);
      return;
    }

    if (args.length < 1) {
      errPW.println(
          "Usage: java edu.stanford.nlp.trees.tregex.TregexPattern [-T] [-C] [-w] [-f] [-o] [-n] [-s] [-filter]  [-hf class] [-trf class] [-h handle]* pattern [filepath]");
      return;
    }
    String matchString = args[0];

    if (argsMap.containsKey(macroOption)) {
      macroFilename = argsMap.get(macroOption)[0];
    }
    if (argsMap.containsKey(headFinderOption)) {
      headFinderClassName = argsMap.get(headFinderOption)[0];
      errPW.println("Using head finder " + headFinderClassName + "...");
    }
    if (argsMap.containsKey(headFinderArgOption)) {
      headFinderArgs = argsMap.get(headFinderArgOption);
    }
    if (argsMap.containsKey(trfOption)) {
      treeReaderFactoryClassName = argsMap.get(trfOption)[0];
      errPW.println("Using tree reader factory " + treeReaderFactoryClassName + "...");
    }
    if (argsMap.containsKey(printAllTrees)) {
      TRegexTreeVisitor.printTree = true;
    }
    if (argsMap.containsKey(inputFileOption)) {
      String inputFile = argsMap.get(inputFileOption)[0];
      matchString = IOUtils.slurpFile(inputFile, encoding);
      String[] newArgs = new String[args.length + 1];
      System.arraycopy(args, 0, newArgs, 1, args.length);
      args = newArgs;
    }
    if (argsMap.containsKey(quietMode)) {
      TRegexTreeVisitor.printMatches = false;
      TRegexTreeVisitor.printNumMatchesToStdOut = true;
    }
    if (argsMap.containsKey(printNonMatchingTreesOption)) {
      TRegexTreeVisitor.printNonMatchingTrees = true;
    }
    if (argsMap.containsKey(subtreeCodeOption)) {
      TRegexTreeVisitor.printSubtreeCode = true;
      TRegexTreeVisitor.printMatches = false;
    }
    if (argsMap.containsKey(wholeTreeMode)) {
      TRegexTreeVisitor.printWholeTree = true;
    }
    if (argsMap.containsKey(filenameOption)) {
      TRegexTreeVisitor.printFilename = true;
    }
    if (argsMap.containsKey(oneMatchPerRootNodeMode)) TRegexTreeVisitor.oneMatchPerRootNode = true;
    if (argsMap.containsKey(reportTreeNumbers)) TRegexTreeVisitor.reportTreeNumbers = true;
    if (argsMap.containsKey(rootLabelOnly)) {
      treePrintFormats.append(TreePrint.rootLabelOnlyFormat).append(',');
    } else if (argsMap.containsKey(oneLine)) { // display short form
      treePrintFormats.append("oneline,");
    } else if (argsMap.containsKey(yieldOnly)) {
      treePrintFormats.append("words,");
    } else {
      treePrintFormats.append("penn,");
    }

    HeadFinder hf = new CollinsHeadFinder();
    if (headFinderClassName != null) {
      Class[] hfArgClasses = new Class[headFinderArgs.length];
      for (int i = 0; i < hfArgClasses.length; i++) hfArgClasses[i] = String.class;
      try {
        hf =
            (HeadFinder)
                Class.forName(headFinderClassName)
                    .getConstructor(hfArgClasses)
                    .newInstance(
                        (Object[])
                            headFinderArgs); // cast to Object[] necessary to avoid varargs-related
        // warning.
      } catch (Exception e) {
        throw new RuntimeException("Error occurred while constructing HeadFinder: " + e);
      }
    }

    TRegexTreeVisitor.tp =
        new TreePrint(treePrintFormats.toString(), new PennTreebankLanguagePack());

    try {
      // TreePattern p = TreePattern.compile("/^S/ > S=dt $++ '' $-- ``");
      TregexPatternCompiler tpc = new TregexPatternCompiler(hf);
      Macros.addAllMacros(tpc, macroFilename, encoding);
      TregexPattern p = tpc.compile(matchString);
      errPW.println("Pattern string:\n" + p.pattern());
      errPW.println("Parsed representation:");
      p.prettyPrint(errPW);

      String[] handles = argsMap.get(printHandleOption);
      if (argsMap.containsKey("-filter")) {
        TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName);
        treebank =
            new MemoryTreebank(
                trf, encoding); // has to be in memory since we're not storing it on disk
        // read from stdin
        Reader reader = new BufferedReader(new InputStreamReader(System.in, encoding));
        ((MemoryTreebank) treebank).load(reader);
        reader.close();
      } else if (args.length == 1) {
        errPW.println("using default tree");
        TreeReader r =
            new PennTreeReader(
                new StringReader(
                    "(VP (VP (VBZ Try) (NP (NP (DT this) (NN wine)) (CC and) (NP (DT these) (NNS snails)))) (PUNCT .))"),
                new LabeledScoredTreeFactory(new StringLabelFactory()));
        Tree t = r.readTree();
        treebank = new MemoryTreebank();
        treebank.add(t);
      } else {
        int last = args.length - 1;
        errPW.println("Reading trees from file(s) " + args[last]);
        TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName);
        treebank = new DiskTreebank(trf, encoding);
        treebank.loadPath(args[last], null, true);
      }
      TRegexTreeVisitor vis = new TRegexTreeVisitor(p, handles, encoding);

      treebank.apply(vis);
      Timing.endTime();
      if (TRegexTreeVisitor.printMatches) {
        errPW.println("There were " + vis.numMatches() + " matches in total.");
      }
      if (TRegexTreeVisitor.printNumMatchesToStdOut) {
        System.out.println(vis.numMatches());
      }
    } catch (IOException e) {
      e.printStackTrace();
    } catch (TregexParseException e) {
      errPW.println("Error parsing expression: " + args[0]);
      errPW.println("Parse exception: " + e.toString());
    }
  }