Пример #1
0
  /**
   * Skips whitespace characters and comments, where a comment runs from a "#" to the end of the
   * line or to the end of the input. The first character that is not skipped is pushed back onto
   * <CODE>in</CODE>.
   *
   * <p><CODE>offset</CODE> is reset to -1 on entry. Whenever a skipped comment, once trimmed,
   * parses as a single integer, <CODE>offset</CODE> is set to that integer; comments that are not
   * integers leave <CODE>offset</CODE> unchanged.
   *
   * @param in the reader to consume from
   * @return count of skipped characters (neither the pushed-back character nor the end of input
   *     is counted)
   * @throws IOException if reading from or unreading to <CODE>in</CODE> fails
   */
  private int skipWhitespaceAndComment(PushbackReader in) throws IOException {
    int count = 0;
    boolean inComment = false;
    offset = -1;
    int c;
    do {
      c = in.read();
      count++;
      if (inComment) {
        if (c == '\n' || c == -1) {
          // The comment is complete, either at a line break or because the input ended
          // without a trailing newline; in both cases try to read it as an offset.
          try {
            offset = Integer.parseInt(comment.toString().trim());
          } catch (NumberFormatException ignored) {
            // Not a bare integer: an ordinary comment, so offset keeps its current value.
          }
          inComment = false;
        } else {
          comment.append((char) c);
        }
      } else if (c == '#') {
        inComment = true;
        comment.setLength(0);
      }
    } while (c != -1 && (inComment || Character.isWhitespace(c)));

    if (c != -1) {
      in.unread(c);
    }

    // The last read was either the pushed-back character or the end of input; neither counts.
    return count - 1;
  }
Пример #2
0
  /**
   * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> to represent the parse
   * tree structure <CODE>tree</CODE>.
   *
   * <p>The terminal nodes of the tree are aligned, in order, with the text of <CODE>span</CODE>:
   * whitespace and at most one entry of <CODE>skip</CODE> are passed over before each terminal,
   * and the terminal's <CODE>start</CODE> / <CODE>end</CODE> are set to the matched text plus any
   * whitespace following it. If a terminal cannot be matched, a diagnostic is written to
   * <CODE>System.err</CODE> and the method returns without performing the annotation step.
   *
   * @param tree the parse tree (for a portion of Document doc)
   * @param doc the document
   * @param span the portion of doc covered by the parse tree
   * @param jetCategories if true, use Jet categories as terminal categories (if false, use
   *     categories read from parse trees)
   */
  public void addAnnotations(ParseTreeNode tree, Document doc, Span span, boolean jetCategories) {
    List<ParseTreeNode> leaves = getTerminalNodes(tree);
    String text = doc.text();
    int position = span.start();

    for (ParseTreeNode leaf : leaves) {
      position = advancePastWhitespace(text, position, span);
      for (String ignorable : skip) {
        if (!text.startsWith(ignorable, position)) {
          continue;
        }
        // only the first matching skip string is consumed
        position = advancePastWhitespace(text, position + ignorable.length(), span);
        break;
      }

      // match next terminal node against next word in text
      int matched = matchTextToTree(text, position, leaf.word);
      if (matched <= 0) {
        System.err.println(
            "PTBReader.addAnnotations:  "
                + "Cannot determine parse tree offset for word "
                + leaf.word);
        System.err.println("  at document offset " + position + " in sentence");
        System.err.println("  " + doc.text(span));
        return;
      }

      // a terminal covers its word plus the whitespace that follows it
      int leafEnd = advancePastWhitespace(text, position + matched, span);
      leaf.start = position;
      leaf.end = leafEnd;
      position = leafEnd;
    }

    if (!jetCategories) {
      determineNonTerminalSpans(tree, span.start());
      setAnnotations(tree, doc);
      return;
    }

    setJetAnnotations(tree, span, doc);
    StatParser.deleteUnusedConstits(doc, span, tree.ann);
  }

  /**
   * Returns the first index at or after <CODE>from</CODE> that is either not below the end of
   * <CODE>span</CODE> or holds a non-whitespace character of <CODE>text</CODE>.
   */
  private static int advancePastWhitespace(String text, int from, Span span) {
    int index = from;
    while (index < span.end() && Character.isWhitespace(text.charAt(index))) {
      index++;
    }
    return index;
  }
Пример #3
0
  /**
   * Consumes leading whitespace from <CODE>in</CODE>. The first non-whitespace character, if any,
   * is pushed back so that the next read returns it.
   *
   * @param in the reader to consume from
   * @return count of skipped characters.
   * @throws IOException if reading from or unreading to <CODE>in</CODE> fails
   */
  private int skipWhitespace(PushbackReader in) throws IOException {
    int skipped = 0;
    for (int ch = in.read(); ch != -1; ch = in.read()) {
      if (!Character.isWhitespace(ch)) {
        // not ours to consume: hand it back and stop
        in.unread(ch);
        break;
      }
      skipped++;
    }
    return skipped;
  }
Пример #4
0
  /**
   * Reads the tag name that follows an opening parenthesis. Characters are consumed up to, but
   * not including, the next whitespace character, which is pushed back onto <CODE>in</CODE>.
   *
   * <p>The name is lower-cased with the root locale, so the result does not depend on the
   * platform's default locale, and is interned before being returned.
   *
   * @param in the reader positioned at the first character of the tag name
   * @return the lower-cased, interned tag name
   * @throws IOException if reading from or unreading to <CODE>in</CODE> fails
   * @throws InvalidFormatException if the input ends before any whitespace is seen, or if the tag
   *     name is empty
   */
  private String readTagName(PushbackReader in) throws IOException, InvalidFormatException {
    StringBuilder buffer = new StringBuilder();
    int c;

    while (true) {
      c = in.read();
      if (c == -1) {
        throw new InvalidFormatException("Unexpected end of input while reading a tag name.");
      } else if (Character.isWhitespace(c)) {
        break;
      }

      buffer.append((char) c);
    }

    // leave the terminating whitespace for the caller
    in.unread(c);

    if (buffer.length() == 0) {
      throw new InvalidFormatException("Tag name is empty.");
    }

    // Locale.ROOT avoids locale-specific case mappings (e.g. Turkish dotless i) in tag names.
    return buffer.toString().toLowerCase(java.util.Locale.ROOT).intern();
  }
Пример #5
0
  /**
   * Drops a trailing whitespace character from <CODE>buffer</CODE> and shortens by one the end of
   * the annotations that finish at the end of the buffer.
   *
   * <p>Nothing happens when the buffer is empty or does not end in whitespace. Otherwise the
   * annotations are visited from last to first; each one whose end equals the buffer length is
   * replaced by a copy ending one character earlier, and the walk stops at the first annotation
   * whose end differs.
   *
   * @param annotations the annotations to adjust; matching entries are replaced in place
   * @param buffer the text buffer whose last whitespace character is removed
   */
  private void modifyAnnotationEnd(List<Annotation> annotations, StringBuilder buffer) {
    ListIterator<Annotation> cursor = annotations.listIterator(annotations.size());

    final int length = buffer.length();
    if (length == 0 || !Character.isWhitespace(buffer.charAt(length - 1))) {
      return;
    }

    while (cursor.hasPrevious()) {
      Annotation current = cursor.previous();
      if (current.end() == length) {
        Span trimmed = new Span(current.start(), current.end() - 1);
        cursor.set(new Annotation(current.type(), trimmed, current.attributes()));
      } else {
        break;
      }
    }

    buffer.deleteCharAt(length - 1);
  }
Пример #6
0
  /**
   * Reads one parenthesized node from a stream.
   *
   * <p>The next character must be "(". If it is followed by whitespace or by another "(", the
   * parentheses are treated as a wrapper: the enclosed node is read recursively, the closing ")"
   * is consumed, and the enclosed node is returned. Otherwise a tag name is read and checked
   * against <CODE>tagNamePattern</CODE> (which splits it into tag and function) or
   * <CODE>specialTagNamePattern</CODE>. The tag is followed either by child nodes, each read
   * recursively, or by a single word forming a terminal node. Children for which
   * <CODE>isNullNode</CODE> is true are dropped.
   *
   * @param in the reader positioned at the opening parenthesis of the node
   * @return the node that was read, or <CODE>null</CODE> if no whitespace was skipped after the
   *     tag name
   * @throws IOException if reading from <CODE>in</CODE> fails
   * @throws InvalidFormatException if a parenthesis is missing, the input ends early, or the tag
   *     name matches neither pattern
   */
  private ParseTreeNode readNode(PushbackReader in) throws IOException, InvalidFormatException {
    if (in.read() != '(') {
      throw new InvalidFormatException();
    }

    int next = lookAhead(in);
    if (next == -1) {
      throw new InvalidFormatException();
    }

    if (next == '(' || Character.isWhitespace(next)) {
      // wrapper parentheses around a single node: "( (S ...) )"
      skipWhitespace(in);
      ParseTreeNode inner = readNode(in);
      skipWhitespace(in);
      if (in.read() != ')') {
        throw new InvalidFormatException();
      }
      return inner;
    }

    String rawTag = readTagName(in);
    Matcher tagMatcher = tagNamePattern.matcher(rawTag);
    String tag;
    String function;
    if (tagMatcher.matches()) {
      tag = tagMatcher.group(1);
      function = tagMatcher.group(2);
    } else {
      if (!specialTagNamePattern.matcher(rawTag).matches()) {
        throw new InvalidFormatException(rawTag + " is invalid format.");
      }
      tag = rawTag;
      function = null;
    }

    int skippedAfterTag = skipWhitespace(in);
    if (skippedAfterTag == 0) {
      return null;
    }

    ParseTreeNode result;
    if (lookAhead(in) != '(') {
      // terminal node
      String word = readWord(in);
      result = new ParseTreeNode(tag, null, 0, 0, null, word, function);
    } else {
      // has any child node (not terminal node)
      List<ParseTreeNode> kids = new ArrayList<ParseTreeNode>();
      do {
        ParseTreeNode kid = readNode(in);
        if (!isNullNode(kid)) {
          kids.add(kid);
        }
        skipWhitespace(in);
      } while (lookAhead(in) != ')');

      ParseTreeNode[] kidArray = kids.toArray(new ParseTreeNode[0]);
      result = new ParseTreeNode(tag, kidArray, 0, 0, 0, function);
    }

    skipWhitespace(in);
    if (in.read() != ')') {
      throw new InvalidFormatException();
    }

    return result;
  }