/**
 * Dispatches every direct child of the given root element.
 *
 * <p>A leaf child is handed to {@code processLeaf} with the intermediate flag set to
 * {@code false} and the {@code OTHER} tag; a non-leaf child is handed to
 * {@code processNode} with no inherited tag ({@code null}).
 *
 * @param root the root node whose children are visited; when {@code null} nothing happens
 * @param sentence list passed through unchanged to the leaf/node handlers
 * @param tags list passed through unchanged to the leaf/node handlers
 * @param target list passed through unchanged to the leaf/node handlers
 */
protected void processRoot(
    SentenceParser.Node root, List<String> sentence, List<String> tags, List<String> target) {
  if (root == null) {
    return;
  }

  for (TreeElement child : root.getElements()) {
    if (child.isLeaf()) {
      // Leaves hanging directly off the root never belong to a chunk.
      processLeaf((SentenceParser.Leaf) child, false, OTHER, sentence, tags, target);
    } else {
      processNode((SentenceParser.Node) child, sentence, tags, target, null);
    }
  }
}
private void processNode( SentenceParser.Node node, List<String> sentence, List<String> tags, List<String> target, String inheritedTag) { String phraseTag = getChunkTag(node); boolean inherited = false; if (phraseTag.equals(OTHER) && inheritedTag != null) { phraseTag = inheritedTag; inherited = true; } TreeElement[] elements = node.getElements(); for (int i = 0; i < elements.length; i++) { if (elements[i].isLeaf()) { boolean isIntermediate = false; String tag = phraseTag; SentenceParser.Leaf leaf = (SentenceParser.Leaf) elements[i]; String localChunk = getChunkTag(leaf); if (localChunk != null && !tag.equals(localChunk)) { tag = localChunk; } if (isIntermediate(tags, target, tag) && (inherited || i > 0)) { isIntermediate = true; } if (!isIncludePunctuations() && leaf.getFunctionalTag() == null && (!(i + 1 < elements.length && elements[i + 1].isLeaf()) || !(i > 0 && elements[i - 1].isLeaf()))) { isIntermediate = false; tag = OTHER; } processLeaf(leaf, isIntermediate, tag, sentence, tags, target); } else { int before = target.size(); processNode((SentenceParser.Node) elements[i], sentence, tags, target, phraseTag); // if the child node was of a different type we should break the chunk sequence for (int j = target.size() - 1; j >= before; j--) { if (!target.get(j).endsWith("-" + phraseTag)) { phraseTag = OTHER; break; } } } } }
protected String getChunkTag(SentenceParser.Node node) { String tag = node.getSyntacticTag(); String phraseTag = tag.substring(tag.lastIndexOf(":") + 1); while (phraseTag.endsWith("-")) { phraseTag = phraseTag.substring(0, phraseTag.length() - 1); } // maybe we should use only np, vp and pp, but will keep ap and advp. if (phraseTag.equals("np") || phraseTag.equals("vp") || phraseTag.equals("pp") || phraseTag.equals("ap") || phraseTag.equals("advp") || phraseTag.equals("adjp")) { phraseTag = phraseTag.toUpperCase(); } else { phraseTag = OTHER; } return phraseTag; }