예제 #1
0
  /**
   * Reads all parse trees (Penn treebank corpus) available from {@code in} and packages them,
   * together with a document generated from them, into a {@link Treebank}.
   *
   * <p>Trees are read one after another until the look-ahead reports end of input. The spans of
   * each tree are assigned starting at the end offset of the previous tree (the first tree starts
   * at 0), and an annotation is created for every node. The document text is then built from the
   * collected trees; each tree contributes one {@code sentence} annotation covering its span,
   * followed by the annotations of its nodes.
   *
   * @param in reader positioned at the start of the parse trees
   * @return treebank holding the generated document and the list of parse trees
   * @throws IOException propagated from the reading helpers
   * @throws InvalidFormatException propagated from the tree-reading helpers (presumably raised for
   *     malformed input)
   */
  public Treebank load(Reader in) throws IOException, InvalidFormatException {
    PushbackReader source = new PushbackReader(in);
    List<ParseTreeNode> roots = new ArrayList<ParseTreeNode>();

    // Each tree begins where the previous one ended.
    int nextStart = 0;
    skipWhitespace(source);
    while (lookAhead(source) != -1) {
      ParseTreeNode root = readNode(source);
      roots.add(root);
      determineSpans(root, nextStart);
      // No document exists yet, so the annotations are only attached to the tree nodes here.
      setAnnotations(root, null);
      nextStart = root.end;
      skipWhitespace(source);
    }

    Document doc = new Document(buildDocumentString(roots));
    for (ParseTreeNode root : roots) {
      Span sentence = new Span(root.start, root.end);
      doc.annotate("sentence", sentence, new FeatureSet());
      annotate(doc, root);
    }

    return new Treebank(doc, roots);
  }
예제 #2
0
 /**
  * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> to represent the parse
  * tree structure of a set of trees <CODE>trees</CODE>. This version is provided for parse tree
  * files which include sentence offsets.
  *
  * <p>If the number of trees differs from the number of offsets, a message is printed to
  * <CODE>System.err</CODE> and nothing is annotated. A tree whose offset is missing (negative or
  * <CODE>null</CODE>) is reported and skipped. The text of a tree is taken to extend up to the
  * start of the next tree that has a known offset, or to the end of <CODE>span</CODE> if there
  * is no such tree.
  *
  * @param trees list of parse trees
  * @param offsets list of the starting position (in doc) of the text corresponding to each parse
  *     tree; a negative (or null) entry marks a missing offset
  * @param doc document to which annotations should be added
  * @param targetAnnotation name of annotation to get 'parse' feature pointing to parse tree
  * @param span target span.
  * @param jetCategories if false, use lexical categories from Penn Tree Bank; if true, use
  *     categories from Jet
  */
 public void addAnnotations(
     List<ParseTreeNode> trees,
     List<Integer> offsets,
     Document doc,
     String targetAnnotation,
     Span span,
     boolean jetCategories) {
   if (trees.size() != offsets.size()) {
     System.err.println(
         "PTBReader.addAnnotations:  mismatch between number of "
             + "trees ("
             + trees.size()
             + ") and number of offsets ("
             + offsets.size()
             + ")");
     return;
   }
   for (int i = 0; i < trees.size(); i++) {
     ParseTreeNode tree = trees.get(i);
     Integer startOffset = offsets.get(i);
     if (startOffset == null || startOffset < 0) {
       System.err.println("PTBReader.addAnnotations:  offset missing for " + " parse tree " + i);
       continue;
     }
     int start = startOffset;
     // The sentence ends where the next tree with a known offset begins.  Using the immediately
     // following entry unconditionally would yield a negative end whenever that entry is a
     // "missing offset" marker, producing a span that ends before it starts.
     int end = span.end();
     for (int j = i + 1; j < offsets.size(); j++) {
       Integer next = offsets.get(j);
       if (next != null && next >= 0) {
         end = next;
         break;
       }
     }
     Span sentenceSpan = new Span(start, end);
     addAnnotations(tree, doc, sentenceSpan, jetCategories);
     // Link the first target annotation starting at this offset (if any) to the parse.
     Vector<Annotation> anns = doc.annotationsAt(start, targetAnnotation);
     if (anns != null && anns.size() > 0) {
       Annotation ann = anns.get(0);
       ann.put("parse", tree.ann);
     }
   }
 }
예제 #3
0
  /**
   * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> so that it reflects
   * the structure of the parse trees in <CODE>trees</CODE>.
   *
   * <p>The annotations of type <CODE>targetAnnotation</CODE> within <CODE>span</CODE> are sorted
   * by span and paired, in order, with the trees: the i-th tree is aligned with the span of the
   * i-th annotation, and that annotation receives a 'parse' feature referring to the tree's
   * annotation. When the two counts differ, a message is written to <CODE>System.err</CODE> and
   * only as many pairs as both lists can supply are processed.
   *
   * @param trees list of parse trees
   * @param doc document to which annotations should be added
   * @param targetAnnotation type of the annotations whose spans delimit the text of each tree
   * @param span target span.
   * @param jetCategories if false, use lexical categories from Penn Tree Bank; if true, use
   *     categories from Jet
   */
  public void addAnnotations(
      List<ParseTreeNode> trees,
      Document doc,
      String targetAnnotation,
      Span span,
      boolean jetCategories) {
    List<Annotation> targetList = (List<Annotation>) doc.annotationsOfType(targetAnnotation, span);
    Collections.sort(
        targetList,
        new Comparator<Annotation>() {
          public int compare(Annotation left, Annotation right) {
            return left.span().compareTo(right.span());
          }
        });

    int treeCount = trees.size();
    int targetCount = targetList.size();
    if (treeCount != targetCount) {
      System.err.println(
          "PTBReader.addAnnotations:  mismatch between number of "
              + targetAnnotation
              + " ("
              + targetCount
              + ") and number of trees ("
              + treeCount
              + ")");
    }

    int pairs = Math.min(treeCount, targetCount);
    for (int index = 0; index < pairs; index++) {
      ParseTreeNode tree = trees.get(index);
      Span targetSpan = targetList.get(index).span();
      addAnnotations(tree, doc, targetSpan, jetCategories);
      targetList.get(index).put("parse", tree.ann);
    }
  }
예제 #4
0
  /**
   * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> so that it reflects
   * the structure of the parse tree <CODE>tree</CODE>.
   *
   * <p>The terminal nodes of the tree are first aligned, left to right, with the document text
   * beginning at the start of <CODE>span</CODE>: leading whitespace and at most one of the
   * configured skip strings are passed over, the terminal's word is matched against the text, and
   * the terminal is given the offsets of the matched text plus any whitespace following it. If a
   * word cannot be matched, diagnostics are printed to <CODE>System.err</CODE> and the method
   * returns without creating annotations (terminals aligned so far keep their offsets).
   *
   * @param tree the parse tree (for a portion of Document doc)
   * @param doc the document
   * @param span the portion of doc covered by the parse tree
   * @param jetCategories if true, use Jet categories as terminal categories (if false, use
   *     categories read from parse trees)
   */
  public void addAnnotations(ParseTreeNode tree, Document doc, Span span, boolean jetCategories) {
    List<ParseTreeNode> leaves = getTerminalNodes(tree);
    String text = doc.text();
    int cursor = span.start();
    int limit = span.end();

    for (ParseTreeNode leaf : leaves) {
      while (cursor < limit && Character.isWhitespace(text.charAt(cursor))) {
        cursor++;
      }
      // pass over at most one skip string, plus the whitespace after it
      for (String skipString : skip) {
        if (!text.startsWith(skipString, cursor)) {
          continue;
        }
        cursor += skipString.length();
        while (cursor < limit && Character.isWhitespace(text.charAt(cursor))) {
          cursor++;
        }
        break;
      }

      // align this terminal with the next word of the text
      int matched = matchTextToTree(text, cursor, leaf.word);
      if (matched <= 0) {
        System.err.println(
            "PTBReader.addAnnotations:  Cannot determine parse tree offset for word " + leaf.word);
        System.err.println("  at document offset " + cursor + " in sentence");
        System.err.println("  " + doc.text(span));
        return;
      }

      // the terminal also absorbs the whitespace that follows the matched word
      int wordEnd = cursor + matched;
      while (wordEnd < limit && Character.isWhitespace(text.charAt(wordEnd))) {
        wordEnd++;
      }
      leaf.start = cursor;
      leaf.end = wordEnd;
      cursor = wordEnd;
    }

    if (!jetCategories) {
      determineNonTerminalSpans(tree, span.start());
      setAnnotations(tree, doc);
      return;
    }
    setJetAnnotations(tree, span, doc);
    StatParser.deleteUnusedConstits(doc, span, tree.ann);
  }
예제 #5
0
  /**
   * Adds the annotation of <CODE>node</CODE>, and recursively those of all its descendants, to
   * <CODE>doc</CODE>. For a node with children, the "children" feature of its annotation is set
   * to the array of the children's annotations. For a node without children, a "token"
   * annotation over the node's span is also added when <CODE>isAddingTokens</CODE> is set.
   *
   * @param doc document receiving the annotations
   * @param node root of the (sub)tree to register
   */
  private void annotate(Document doc, ParseTreeNode node) {
    doc.addAnnotation(node.ann);

    ParseTreeNode[] kids = node.children;
    if (kids == null) {
      // leaf node
      if (isAddingTokens) {
        // TODO: adds `case' property
        doc.annotate("token", node.ann.span(), new FeatureSet());
      }
      return;
    }

    Annotation[] childAnns = new Annotation[kids.length];
    int slot = 0;
    for (ParseTreeNode kid : kids) {
      childAnns[slot++] = kid.ann;
    }
    node.ann.put("children", childAnns);

    for (ParseTreeNode kid : kids) {
      annotate(doc, kid);
    }
  }
예제 #6
0
  /**
   * Converts a set of Penn TreeBank files into text documents. Invoked by: PTBReader inputDir
   * outputDir. Converts all files with extension .mrg in inputDir to text documents, and writes
   * them into outputDir, creating sub-directories of outputDir as needed.
   *
   * @param args exactly two elements: the input directory and the output directory; otherwise a
   *     usage message is printed and the JVM exits with status 1
   * @throws Exception if a file cannot be parsed or written
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      System.out.println(
          "usage: java " + PTBReader.class.getName() + " <inputDir> <outputDir>");
      System.exit(1);
    }

    File inputDir = new File(args[0]);
    File outputDir = new File(args[1]);
    PTBReader parser = new PTBReader();
    for (File file : getFiles(inputDir, ".mrg")) {
      String outFilename = removeSuffix(getRelativePath(inputDir, file));
      File outFile = new File(outputDir, outFilename);
      outFile.getParentFile().mkdirs();

      // Parse before opening the output so that a failed parse does not leave an empty file.
      Document doc = parser.load(file).getDocument();
      Writer out = new FileWriter(outFile);
      try {
        out.write(doc.text());
      } finally {
        // release the file handle even when the write fails
        out.close();
      }
    }
  }
예제 #7
0
 /**
  * Creates annotations for each node in parse tree <CODE>node</CODE>.
  * These annotations are added to the parse tree and to the document
  * <CODE>doc</CODE>.  In contrast to <CODE>setAnnotations</CODE>,
  * the categories used for terminal nodes are Jet categories obtained by
  * Jet tokenization and lexical look-up.  This means that hyphenated
  * items are split, and multi-word names are reduced to a single node.
  * <P>
  * After the terminals have been processed, the tree is pruned, the spans
  * of the non-terminals are recomputed, head links are added (creating the
  * default head rule on first use) and annotations are generated for the
  * non-terminals.
  *
  * @param node      the root of the parse tree
  * @param treeSpan  the span of the document matching the parse tree
  * @param doc       the document to which annotations will be added
  */
 private void setJetAnnotations(ParseTreeNode node, Span treeSpan, Document doc) {
   StatParser.buildParserInput(doc, treeSpan.start(), treeSpan.end(), false);
   StatParser.fixHyphenatedItems(doc);
   // end offset of the name currently being consumed, or -1 when not inside a name
   int nameConstitEnd = -1;
   List<ParseTreeNode> terminals = getTerminalNodes(node);
   for (ParseTreeNode terminal : terminals) {
     // remember the original end: terminal.end may be extended to the end of a name below
     int terminalEnd = terminal.end;
     // is there a 'name' constituent or 'hyphword' constituent here?
     Vector<Annotation> constits = doc.annotationsAt(terminal.start, "constit");
     Annotation constit = null;
     Annotation nameConstit = null;
     Annotation hyphword = null;
     if (constits != null) {
       for (Annotation c : constits) {
         // compare by value: reference comparison only works if the feature value
         // happens to be the same interned String instance as the literal
         Object cat = c.get("cat");
         if ("name".equals(cat)) {
           nameConstit = c;
         } else if ("hyphword".equals(cat)) {
           hyphword = c;
         }
         if (constit == null) {
           constit = c;
         }
       }
     }
     // a hyphword takes precedence over a name starting at the same position
     if (hyphword != null) {
       nameConstit = null;
       constit = hyphword;
     }
     // if there is a name which is not part of a hyphword, associate the
     // name with this (first) terminal node, and mark any remaining terminal
     // nodes which match tokens in the name as empty
     if (nameConstit != null) {
       terminal.end = nameConstit.end();
       terminal.ann = nameConstit;
       nameConstitEnd = nameConstit.end();
     } else if (nameConstitEnd >= 0) {
       terminal.word = null;
     } else {
       Span span = new Span(terminal.start, terminal.end);
       // Locale.ROOT keeps the tag conversion independent of the default locale
       // (e.g. "in" must become "IN", not a dotted capital I, under a Turkish locale)
       String pennPOS =
           ((String) terminal.category).toUpperCase(java.util.Locale.ROOT).intern();
       String word = terminal.word;
       terminal.ann = StatParser.buildWordDefn(doc, word, span, constit, pennPOS);
     }
     // the name has been fully consumed once a terminal ends where the name ends
     if (nameConstitEnd == terminalEnd) {
       nameConstitEnd = -1;
     }
   }
   // prune parse tree:  remove a node if it has no word or children
   pruneTree(node);
   determineNonTerminalSpans(node, treeSpan.start());
   // add head links
   if (hr == null) {
     hr = HeadRule.createDefaultRule();
   }
   hr.apply(node);
   // add annotations for non-terminals:
   Jet.Parser.ParseTreeNode.makeParseAnnotations(doc, node);
 }
  /**
   * Computes a date/time value from this rule's parameters and records it as a
   * <CODE>TIMEX2</CODE> annotation on <CODE>span</CODE>, with the formatted value stored in the
   * <CODE>VAL</CODE> feature.
   *
   * <p>The value is determined by the first applicable parameter:
   *
   * <ul>
   *   <li><CODE>value</CODE>: after substitution through <CODE>assignValues</CODE>, parsed as an
   *       absolute date/time;
   *   <li><CODE>diff</CODE>: after substitution, parsed as a period that is added to
   *       <CODE>ref</CODE> (<CODE>dir</CODE> absent or "plus") or subtracted from it
   *       (<CODE>dir</CODE> "minus"); any other <CODE>dir</CODE> leaves <CODE>ref</CODE>
   *       unchanged;
   *   <li>otherwise every <CODE>set_xxx</CODE> parameter overrides one field of
   *       <CODE>ref</CODE>; only <CODE>set_month</CODE> and <CODE>set_day</CODE> are supported.
   *       The overrides are applied in the iteration order of the parameter map.
   * </ul>
   *
   * @param doc document to annotate
   * @param values values substituted into the parameter templates by <CODE>assignValues</CODE>
   * @param span span that receives the annotation
   * @param ref reference date/time used for relative expressions
   * @throws InternalError if a <CODE>set_xxx</CODE> parameter names an unsupported field
   * @throws NumberFormatException if a <CODE>set_month</CODE> / <CODE>set_day</CODE> value is
   *     not an integer
   */
  public void apply(Document doc, List<Object> values, Span span, DateTime ref) {
    Map params = getParameters();
    String value = (String) params.get("value");
    String diff = (String) params.get("diff");
    String dir = (String) params.get("dir");
    DateTime val = ref;
    if (value != null) {
      value = assignValues(value, values);
      val = new DateTime(value);
    } else if (diff != null) {
      diff = assignValues(diff, values);
      Period period = new Period(diff);

      if (dir == null || dir.equals("plus")) {
        val = ref.plus(period);
      } else if (dir.equals("minus")) {
        val = ref.minus(period);
      }
    } else {
      // use set_xxx; the pattern is compiled once rather than once per parameter
      Pattern setterKey = Pattern.compile("set_(.*)");
      for (Map.Entry entry : (Set<Map.Entry>) params.entrySet()) {
        Matcher m = setterKey.matcher((String) entry.getKey());
        if (m.matches()) {
          String field = assignValues((String) entry.getValue(), values);
          String fieldName = m.group(1);

          if (fieldName.equals("month")) {
            int month = Integer.parseInt(field);
            val = getTimeAnnotator().normalizeMonth(val, month);
          } else if (fieldName.equals("day")) {
            int day = Integer.parseInt(field);
            val = val.withField(DateTimeFieldType.dayOfMonth(), day);
          } else {
            throw new InternalError("unsupported parameter: set_" + fieldName);
          }
        }
      }
    }

    String formattedDate = formatter.print(val);
    FeatureSet attrs = new FeatureSet();
    attrs.put("VAL", formattedDate);

    doc.annotate("TIMEX2", span, attrs);
  }
예제 #9
0
  /**
   * Builds a <CODE>constit</CODE> annotation for <CODE>node</CODE> and, recursively, for every
   * node below it.  Each annotation spans the node's start and end offsets and carries the
   * node's category as "cat", its head as "head" when that is non-zero, and its function as
   * "func" when that is non-null.  The annotation is stored on the node;  if
   * <CODE>doc</CODE> is not <CODE>null</CODE> it is added to the document as well.
   * <P>
   * Note that this method does not set the "children" attribute.
   *
   * @param node root of the (sub)tree to annotate
   * @param doc  document receiving the annotations, or <CODE>null</CODE> to annotate
   *             the tree only
   */
  private void setAnnotations(ParseTreeNode node, Document doc) {
    FeatureSet features = new FeatureSet();
    features.put("cat", node.category);
    if (node.head != 0) {
      features.put("head", node.head);
    }
    if (node.function != null) {
      features.put("func", node.function);
    }

    Annotation constit = new Annotation("constit", new Span(node.start, node.end), features);
    node.ann = constit;
    if (doc != null) {
      doc.addAnnotation(constit);
    }

    if (node.children == null) {
      return;
    }
    for (ParseTreeNode child : node.children) {
      setAnnotations(child, doc);
    }
  }