/** * Returns a canonical mapping for the specified nonterminal label; if <code>label</code> already * is in canonical form, it is returned. The canonical mapping refers to transformations performed * on nonterminals during the training process. Before obtaining a label's canonical form, it is * also stripped of all Treebank augmentations, meaning that only the characters before the first * occurrence of '-', '=' or '|' are kept. * * @return a <code>Symbol</code> with the same print name as <code>label</code>, except that all * training transformations and Treebank augmentations have been undone and stripped */ public final Symbol getCanonical(Symbol label) { if (outputLexLabels) { char lbracket = nonTreebankLeftBracket(); char rbracket = nonTreebankRightBracket(); int lbracketIdx = label.toString().indexOf(lbracket); int rbracketIdx = label.toString().indexOf(rbracket); if (lbracketIdx != -1 && rbracketIdx != -1) { String labelStr = label.toString(); Symbol unlexLabel = Symbol.get(labelStr.substring(0, lbracketIdx) + labelStr.substring(rbracketIdx + 1)); String canonStr = defaultGetCanonical(unlexLabel).toString(); return Symbol.get(canonStr + labelStr.substring(lbracketIdx, rbracketIdx + 1)); } } return defaultGetCanonical(label); }
/** * Provides data and methods speciifc to the structures found in the English Treebank (the Penn * Treebank) or any other treebank that conforms to the Treebank II annotation guidelines for * part-of-speech tagging and bracketing. */ public class Treebank extends danbikel.parser.lang.AbstractTreebank { // the characters that are delimiters of augmented nonterminal labels private static final String augmentationDelimStr = "-=|"; private static final boolean outputLexLabels = Settings.getBoolean(Settings.decoderOutputHeadLexicalizedLabels); // basic nodes in the English Treebank that will be transformed in a // preprocessing phase static final Symbol NP = Symbol.add("NP"); static final Symbol S = Symbol.add("S"); // transformed nodes static final Symbol baseNP = Symbol.add(NP + "B"); static final Symbol baseNPArg = Symbol.add(baseNP + "-A"); static final Symbol subjectlessS = Symbol.add(S + "G"); static final Symbol subjectAugmentation = Symbol.add("SBJ"); // other basic nodes static final Symbol WHNP = Symbol.add("WHNP"); static final Symbol CC = Symbol.add("CC"); static final Symbol comma = Symbol.add(","); static final Symbol lrb = Symbol.add("-LRB-"); static final Symbol lcb = Symbol.add("-LCB-"); static final Symbol rrb = Symbol.add("-RRB-"); static final Symbol rcb = Symbol.add("-RCB-"); // null element static final Symbol nullElementPreterminal = Symbol.add("-NONE-"); // possessive part of speech static final Symbol possessivePos = Symbol.add("POS"); // exception nonterminals for defaultParseNonterminal static final Symbol[] nonterminalExceptionArr = {lrb, rrb}; private static Symbol[][] canonicalLabelMapData = { {baseNP, NP}, {subjectlessS, S}, }; private static HashMap canonicalLabelMap = new HashMap(canonicalLabelMapData.length); static { for (int i = 0; i < canonicalLabelMapData.length; i++) canonicalLabelMap.put(canonicalLabelMapData[i][0], canonicalLabelMapData[i][1]); } private static String[] puncToRaiseElements = {",", ":"}; private static Set puncToRaise = new HashSet(); static { for (int i = 0; i < puncToRaiseElements.length; i++) puncToRaise.add(Symbol.add(puncToRaiseElements[i])); } private static String[] verbTagStrings = {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}; private static Set verbTags = new HashSet(); static { for (int i = 0; i < verbTagStrings.length; i++) verbTags.add(Symbol.add(verbTagStrings[i])); } /** Constructs an English <code>Treebank</code> object. */ public Treebank() { super(); nonterminalExceptionSet = nonterminalExceptionArr; } /** * Returns <code>true</code> if <code>tree</code> represents a preterminal subtree (part-of-speech * tag and word). Specifically, this method returns <code>true</code> if <code>tree</code> is an * instance of <code>SexpList</code>, has a length of 2 and has a first list element of type * <code>Symbol</code>. */ public final boolean isPreterminal(Sexp tree) { return (tree.isList() && tree.list().length() == 2 && tree.list().get(0).isSymbol() && tree.list().get(1).isSymbol()); } /** * Returns <code>true</code> is the specified nonterminal label represents a sentence in the Penn * Treebank, that is, if the canonical version of <code>label</code> is equal to <code> * "S"</code>. * * @see Training#relabelSubjectlessSentences(Sexp) */ public boolean isSentence(Symbol label) { return getCanonical(label) == S; } public Symbol sentenceLabel() { return S; } /** * Returns the symbol that {@link Training#relabelSubjectlessSentences(Sexp)} will use for * sentences that have no subjects. */ public Symbol subjectlessSentenceLabel() { return subjectlessS; } public Symbol subjectAugmentation() { return subjectAugmentation; } /** * Returns <code>true</code> if the specified S-expression represents a preterminal whose terminal * element is the null element (<code>"-NONE-"</code>) for the Penn Treebank. * * @see Training#relabelSubjectlessSentences(Sexp) */ public boolean isNullElementPreterminal(Sexp tree) { return (isPreterminal(tree) && tree.list().get(0).symbol() == nullElementPreterminal); } /** * Returns <code>true</code> if the specified S-expression is a preterminal whose part of speech * is <code>","</code> or <code>":"</code>. */ public boolean isPuncToRaise(Sexp preterm) { return (isPreterminal(preterm) && puncToRaise.contains(preterm.list().first())); } public boolean isPunctuation(Symbol tag) { // return puncToRaise.contains(stripAugmentation(tag)); return puncToRaise.contains(tag); } /** * Returns <code>true</code> if the specified S-expression represents a preterminal that is the * possessive part of speech. This method is intended to be used by implementations of {@link * Training#addBaseNPs(Sexp)}. */ public boolean isPossessivePreterminal(Sexp tree) { return (isPreterminal(tree) && tree.list().get(0).symbol() == possessivePos); } /** * Returns <code>true</code> if the canonical version of the specified label is an NP for for * English Treebank. * * @see Training#addBaseNPs(Sexp) */ public boolean isNP(Symbol label) { return getCanonical(label) == NP; } public boolean isBaseNP(Symbol label) { return label == baseNP || label == baseNPArg; } /** * Returns the symbol with which {@link Training#addBaseNPs(Sexp)} will relabel base NPs. * * @see Training#addBaseNPs */ public Symbol baseNPLabel() { return baseNP; } /** * Returns <code>true</code> if the canonical version of the specified label is a WHNP in the * English Treebank. * * @see Training#addGapInformation(Sexp) */ public boolean isWHNP(Symbol label) { return getCanonical(label) == WHNP; } /** * Returns the symbol that {@link Training#addBaseNPs(Sexp)} should add as a parent if a base NP * is not dominated by an NP. */ public Symbol NPLabel() { return NP; } /** * Returns <code>true</code> if <code>label</code> is equal to the symbol whose print name is * <code>"CC"</code>. */ public boolean isConjunction(Symbol label) { // return getCanonical(label) == CC; return label == CC; } /** * Returns <code>true</code> if <code>preterminal</code> represents a terminal with one of the * following parts of speech: <tt>VB, VBD, VBG, VBN, VBP</tt> or <tt>VBZ</tt>. It is an error to * call this method with a <code>Sexp</code> object for which {@link #isPreterminal(Sexp)} returns * <code>false</code>.<br> * * @param preterminal the preterminal to test * @return <code>true</code> if <code>preterminal</code> is a verb */ public boolean isVerb(Sexp preterminal) { return isVerbTag(preterminal.list().get(0).symbol()); } public boolean isVerbTag(Symbol tag) { return verbTags.contains(tag); } public boolean isComma(Symbol word) { return word == comma; } public boolean isLeftParen(Symbol word) { return word == lrb || word == lcb; } public boolean isRightParen(Symbol word) { return word == rrb || word == rcb; } /** * Returns a canonical mapping for the specified nonterminal label; if <code>label</code> already * is in canonical form, it is returned. The canonical mapping refers to transformations performed * on nonterminals during the training process. Before obtaining a label's canonical form, it is * also stripped of all Treebank augmentations, meaning that only the characters before the first * occurrence of '-', '=' or '|' are kept. * * @return a <code>Symbol</code> with the same print name as <code>label</code>, except that all * training transformations and Treebank augmentations have been undone and stripped */ public final Symbol getCanonical(Symbol label) { if (outputLexLabels) { char lbracket = nonTreebankLeftBracket(); char rbracket = nonTreebankRightBracket(); int lbracketIdx = label.toString().indexOf(lbracket); int rbracketIdx = label.toString().indexOf(rbracket); if (lbracketIdx != -1 && rbracketIdx != -1) { String labelStr = label.toString(); Symbol unlexLabel = Symbol.get(labelStr.substring(0, lbracketIdx) + labelStr.substring(rbracketIdx + 1)); String canonStr = defaultGetCanonical(unlexLabel).toString(); return Symbol.get(canonStr + labelStr.substring(lbracketIdx, rbracketIdx + 1)); } } return defaultGetCanonical(label); } private Symbol defaultGetCanonical(Symbol label) { for (int i = 0; i < nonterminalExceptionSet.length; i++) if (label == nonterminalExceptionSet[i]) return label; Symbol strippedLabel = stripAugmentation(label); Symbol mapEntry = (Symbol) canonicalLabelMap.get(strippedLabel); return ((mapEntry == null) ? strippedLabel : mapEntry); } public final Symbol getCanonical(Symbol label, boolean stripAugmentations) { if (stripAugmentations) return getCanonical(label); else { Symbol mapEntry = (Symbol) canonicalLabelMap.get(label); return (mapEntry == null ? label : mapEntry); } } /** * Calls {@link Treebank#defaultParseNonterminal(Symbol, Nonterminal)} with the specified * arguments. * * @param label to the nonterminal label to parse * @param nonterminal the <code>Nonterminal</code> object to fill with the components of <code> * label</code> */ public Nonterminal parseNonterminal(Symbol label, Nonterminal nonterminal) { defaultParseNonterminal(label, nonterminal); return nonterminal; } /** * Returns a string of the three characters that serve as augmentation delimiters in the Penn * Treebank: <code>"-=|"</code>. */ public String augmentationDelimiters() { return augmentationDelimStr; } }
static { for (int i = 0; i < verbTagStrings.length; i++) verbTags.add(Symbol.add(verbTagStrings[i])); }
static { for (int i = 0; i < puncToRaiseElements.length; i++) puncToRaise.add(Symbol.add(puncToRaiseElements[i])); }