/**
 * Skips whitespace characters and comments (everything from a "#" up to the end of that line).
 *
 * <p>The <CODE>offset</CODE> field is reset to -1 on entry. Whenever a skipped comment, once
 * trimmed, parses as a single integer, <CODE>offset</CODE> is set to that integer; a comment that
 * is not an integer leaves <CODE>offset</CODE> as it was. The text of the most recent comment is
 * accumulated in the <CODE>comment</CODE> buffer.
 *
 * <p>A comment that is terminated by the end of the stream (no trailing newline) is treated the
 * same way as a newline-terminated one, and the end-of-stream marker is never stored in the
 * comment buffer.
 *
 * <p>The first character that is neither whitespace nor part of a comment is pushed back onto
 * the reader.
 *
 * @param in the reader to consume from
 * @return count of skipped characters
 * @throws IOException if reading from or pushing back to <CODE>in</CODE> fails
 */
private int skipWhitespaceAndComment(PushbackReader in) throws IOException {
  int count = 0;
  boolean inComment = false;
  offset = -1;
  int c;
  do {
    c = in.read();
    count++;
    if (inComment) {
      if (c == '\n' || c == -1) {
        // End of the comment: a newline, or the end of the stream for a final unterminated line.
        try {
          offset = Integer.parseInt(comment.toString().trim());
        } catch (NumberFormatException ignored) {
          // Ordinary (non-numeric) comment: deliberately leave offset unchanged.
        }
        inComment = false;
      } else {
        comment.append((char) c);
      }
    } else if (c == '#') {
      inComment = true;
      comment.setLength(0);
    }
  } while (c != -1 && (Character.isWhitespace(c) || inComment));

  if (c != -1) {
    in.unread(c);
  }
  // The last read (pushed-back character or end of stream) was counted but not skipped.
  return count - 1;
}
/** * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> to represent the parse * tree structure <CODE>tree</CODE>. * * @param tree the parse tree (for a portion of Document doc) * @param doc the document * @param span the portion of doc covered by the parse tree * @param jetCategories if true, use Jet categories as terminal categories (if false, use * categories read from parse trees) */ public void addAnnotations(ParseTreeNode tree, Document doc, Span span, boolean jetCategories) { List<ParseTreeNode> terminalNodes = getTerminalNodes(tree); String text = doc.text(); int offset = span.start(); for (ParseTreeNode terminal : terminalNodes) { while (offset < span.end() && Character.isWhitespace(text.charAt(offset))) { offset++; } for (String skipString : skip) { if (text.startsWith(skipString, offset)) { offset += skipString.length(); while (offset < span.end() && Character.isWhitespace(text.charAt(offset))) { offset++; } break; } } // match next terminal node against next word in text int matchLength = matchTextToTree(text, offset, terminal.word); if (matchLength > 0) { int endOffset = offset + matchLength; while (endOffset < span.end() && Character.isWhitespace(text.charAt(endOffset))) { endOffset++; } terminal.start = offset; terminal.end = endOffset; offset = endOffset; } else { System.err.println( "PTBReader.addAnnotations: " + "Cannot determine parse tree offset for word " + terminal.word); System.err.println(" at document offset " + offset + " in sentence"); System.err.println(" " + doc.text(span)); return; } } if (jetCategories) { setJetAnnotations(tree, span, doc); StatParser.deleteUnusedConstits(doc, span, tree.ann); // <<< } else { determineNonTerminalSpans(tree, span.start()); setAnnotations(tree, doc); } }
/**
 * Consumes consecutive whitespace characters from the reader.
 *
 * <p>Reading stops at the first non-whitespace character, which is pushed back onto the reader,
 * or at the end of the stream.
 *
 * @param in the reader to consume from
 * @return count of skipped characters
 * @throws IOException if reading from or pushing back to <CODE>in</CODE> fails
 */
private int skipWhitespace(PushbackReader in) throws IOException {
  int skipped = 0;
  int ch = in.read();
  while (ch != -1 && Character.isWhitespace(ch)) {
    skipped++;
    ch = in.read();
  }
  if (ch != -1) {
    // Not whitespace: hand the character back so the caller sees it next.
    in.unread(ch);
  }
  return skipped;
}
/**
 * Reads a tag name, i.e. the characters up to (but not including) the next whitespace character.
 *
 * <p>The terminating whitespace character is pushed back onto the reader. The name is returned
 * lower-cased and interned. Lower-casing uses the locale-independent root locale so that the
 * result does not depend on the JVM default locale (e.g. "I" always becomes "i").
 *
 * @param in the reader to consume from
 * @return the tag name that was read, in lower case
 * @throws IOException if reading from or pushing back to <CODE>in</CODE> fails
 * @throws InvalidFormatException if the stream ends before a whitespace character is found, or
 *     if the tag name is empty
 */
private String readTagName(PushbackReader in) throws IOException, InvalidFormatException {
  StringBuilder buffer = new StringBuilder();
  int c;
  while (true) {
    c = in.read();
    if (c == -1) {
      throw new InvalidFormatException();
    } else if (Character.isWhitespace(c)) {
      break;
    }
    buffer.append((char) c);
  }

  // Leave the delimiter in the stream for the caller.
  in.unread(c);

  if (buffer.length() == 0) {
    throw new InvalidFormatException();
  }

  return buffer.toString().toLowerCase(java.util.Locale.ROOT).intern();
}
/**
 * Removes a trailing whitespace character from the buffer and shrinks the annotations that
 * ended on it.
 *
 * <p>Nothing happens when the buffer is empty or does not end with whitespace. Otherwise the
 * annotation list is walked from its last element backwards: every annotation whose end equals
 * the buffer length is replaced by a copy (same type and attributes) whose span ends one
 * character earlier. The walk stops at the first annotation that does not end at the buffer
 * length. Finally the last character of the buffer is deleted.
 *
 * @param annotations the annotations to adjust, modified in place
 * @param buffer the text buffer the annotations refer to, modified in place
 */
private void modifyAnnotationEnd(List<Annotation> annotations, StringBuilder buffer) {
  ListIterator<Annotation> cursor = annotations.listIterator(annotations.size());

  int length = buffer.length();
  if (length == 0 || !Character.isWhitespace(buffer.charAt(length - 1))) {
    return;
  }

  while (cursor.hasPrevious()) {
    Annotation current = cursor.previous();
    if (current.end() != length) {
      break;
    }
    Span shortened = new Span(current.start(), current.end() - 1);
    cursor.set(new Annotation(current.type(), shortened, current.attributes()));
  }

  buffer.deleteCharAt(length - 1);
}
/** * Reads one node from a stream. * * @param in * @return readed node * @throws IOException * @throws InvalidFormatException */ private ParseTreeNode readNode(PushbackReader in) throws IOException, InvalidFormatException { int c = in.read(); if (c != '(') { throw new InvalidFormatException(); } if ((c = lookAhead(in)) == -1) { throw new InvalidFormatException(); } if (Character.isWhitespace(c) || c == '(') { skipWhitespace(in); ParseTreeNode node = readNode(in); skipWhitespace(in); c = (char) in.read(); if (c != ')') { throw new InvalidFormatException(); } return node; } String tag = readTagName(in); String function = null; Matcher m = tagNamePattern.matcher(tag); if (m.matches()) { tag = m.group(1); function = m.group(2); } else if (!specialTagNamePattern.matcher(tag).matches()) { throw new InvalidFormatException(tag + " is invalid format."); } if (skipWhitespace(in) == 0) { return null; } ParseTreeNode node; if (lookAhead(in) == '(') { // has any child node (not terminal node) List<ParseTreeNode> children = new ArrayList<ParseTreeNode>(); do { ParseTreeNode child = readNode(in); if (!isNullNode(child)) { children.add(child); } skipWhitespace(in); } while (lookAhead(in) != ')'); node = new ParseTreeNode(tag, children.toArray(new ParseTreeNode[0]), 0, 0, 0, function); } else { // terminal node String word = readWord(in); node = new ParseTreeNode(tag, null, 0, 0, null, word, function); } skipWhitespace(in); if (in.read() != ')') { throw new InvalidFormatException(); } return node; }