/**
 * Convenience constructor that delegates to the four-argument constructor, supplying
 * {@code Filters.<String>acceptFilter()} as the fourth argument.
 *
 * @param str Passed through unchanged to the four-argument constructor (presumably a display
 *     name for this evaluation -- TODO confirm against that constructor).
 * @param runningAverages Passed through unchanged to the four-argument constructor.
 * @param headFinder If a headFinder is provided, then head percolation will be done for trees.
 *     Otherwise, it must be called separately.
 */
public UnlabeledAttachmentEval(String str, boolean runningAverages, HeadFinder headFinder) { this(str, runningAverages, headFinder, Filters.<String>acceptFilter()); }
/** * This provides an implementation of parts of the TreebankLanguagePack API to reduce the load on * fresh implementations. Only the abstract methods below need to be implemented to give a * reasonable solution for a new language. * * @author Christopher Manning * @version 1.1 */ public abstract class AbstractTreebankLanguagePack implements TreebankLanguagePack { /** So changed versions deserialize correctly. */ private static final long serialVersionUID = -6506749780512708352L; // Grammatical function parameters /** * Default character for indicating that something is a grammatical fn; probably should be * overridden by lang specific ones */ protected char gfCharacter; protected static final char DEFAULT_GF_CHAR = '-'; /** Use this as the default encoding for Readers and Writers of Treebank data. */ public static final String DEFAULT_ENCODING = "UTF-8"; /** Gives a handle to the TreebankLanguagePack. */ public AbstractTreebankLanguagePack() { this(DEFAULT_GF_CHAR); } /** * Gives a handle to the TreebankLanguagePack. * * @param gfChar The character that sets of grammatical functions in node labels. */ public AbstractTreebankLanguagePack(char gfChar) { this.gfCharacter = gfChar; } /** * Returns a String array of punctuation tags for this treebank/language. * * @return The punctuation tags */ public abstract String[] punctuationTags(); /** * Returns a String array of punctuation words for this treebank/language. * * @return The punctuation words */ public abstract String[] punctuationWords(); /** * Returns a String array of sentence final punctuation tags for this treebank/language. * * @return The sentence final punctuation tags */ public abstract String[] sentenceFinalPunctuationTags(); /** * Returns a String array of punctuation tags that EVALB-style evaluation should ignore for this * treebank/language. 
Traditionally, EVALB has ignored a subset of the total set of punctuation * tags in the English Penn Treebank (quotes and period, comma, colon, etc., but not brackets) * * @return Whether this is a EVALB-ignored punctuation tag */ public String[] evalBIgnoredPunctuationTags() { return punctuationTags(); } /** * Accepts a String that is a punctuation tag name, and rejects everything else. * * @return Whether this is a punctuation tag */ public boolean isPunctuationTag(String str) { return punctTagStringAcceptFilter.accept(str); } /** * Accepts a String that is a punctuation word, and rejects everything else. If one can't tell for * sure (as for ' in the Penn Treebank), it maks the best guess that it can. * * @return Whether this is a punctuation word */ public boolean isPunctuationWord(String str) { return punctWordStringAcceptFilter.accept(str); } /** * Accepts a String that is a sentence end punctuation tag, and rejects everything else. * * @return Whether this is a sentence final punctuation tag */ public boolean isSentenceFinalPunctuationTag(String str) { return sFPunctTagStringAcceptFilter.accept(str); } /** * Accepts a String that is a punctuation tag that should be ignored by EVALB-style evaluation, * and rejects everything else. Traditionally, EVALB has ignored a subset of the total set of * punctuation tags in the English Penn Treebank (quotes and period, comma, colon, etc., but not * brackets) * * @return Whether this is a EVALB-ignored punctuation tag */ public boolean isEvalBIgnoredPunctuationTag(String str) { return eIPunctTagStringAcceptFilter.accept(str); } /** * Return a filter that accepts a String that is a punctuation tag name, and rejects everything * else. * * @return The filter */ public Filter<String> punctuationTagAcceptFilter() { return punctTagStringAcceptFilter; } /** * Return a filter that rejects a String that is a punctuation tag name, and rejects everything * else. 
* * @return The filter */ public Filter<String> punctuationTagRejectFilter() { return Filters.notFilter(punctTagStringAcceptFilter); } /** * Returns a filter that accepts a String that is a punctuation word, and rejects everything else. * If one can't tell for sure (as for ' in the Penn Treebank), it makes the best guess that it * can. * * @return The Filter */ public Filter<String> punctuationWordAcceptFilter() { return punctWordStringAcceptFilter; } /** * Returns a filter that accepts a String that is not a punctuation word, and rejects punctuation. * If one can't tell for sure (as for ' in the Penn Treebank), it makes the best guess that it * can. * * @return The Filter */ public Filter<String> punctuationWordRejectFilter() { return Filters.notFilter(punctWordStringAcceptFilter); } /** * Returns a filter that accepts a String that is a sentence end punctuation tag, and rejects * everything else. * * @return The Filter */ public Filter<String> sentenceFinalPunctuationTagAcceptFilter() { return sFPunctTagStringAcceptFilter; } /** * Returns a filter that accepts a String that is a punctuation tag that should be ignored by * EVALB-style evaluation, and rejects everything else. Traditionally, EVALB has ignored a subset * of the total set of punctuation tags in the English Penn Treebank (quotes and period, comma, * colon, etc., but not brackets) * * @return The Filter */ public Filter<String> evalBIgnoredPunctuationTagAcceptFilter() { return eIPunctTagStringAcceptFilter; } /** * Returns a filter that accepts everything except a String that is a punctuation tag that should * be ignored by EVALB-style evaluation. 
Traditionally, EVALB has ignored a subset of the total * set of punctuation tags in the English Penn Treebank (quotes and period, comma, colon, etc., * but not brackets) * * @return The Filter */ public Filter<String> evalBIgnoredPunctuationTagRejectFilter() { return Filters.notFilter(eIPunctTagStringAcceptFilter); } /** * Return the input Charset encoding for the Treebank. See documentation for the <code>Charset * </code> class. * * @return Name of Charset */ public String getEncoding() { return DEFAULT_ENCODING; } private static final char[] EMPTY_CHAR_ARRAY = new char[0]; /** * Return an array of characters at which a String should be truncated to give the basic syntactic * category of a label. The idea here is that Penn treebank style labels follow a syntactic * category with various functional and crossreferencing information introduced by special * characters (such as "NP-SBJ=1"). This would be truncated to "NP" by the array containing '-' * and "=". * * @return An array of characters that set off label name suffixes */ public char[] labelAnnotationIntroducingCharacters() { return EMPTY_CHAR_ARRAY; } /** * Returns the index of the first character that is after the basic label. That is, if category is * "NP-LGS", it returns 2. This routine assumes category != null. This routine returns 0 iff the * String is of length 0. This routine always returns a number <= category.length(), and so it * is safe to pass it as an argument to category.substring(). * * <p>NOTE: the routine should never allow the first character of a label to be taken as the * annotation introducing character, because in the Penn Treebank, "-" is a valid tag, but also * the character used to set off functional and co-indexing annotations. If the first letter is * such a character then a matched character is also not used, for -LRB- etc., iff there is an * intervening character (so --PU becomes -). 
* * @param category Phrasal category * @return The index of the first character that is after the basic label */ private int postBasicCategoryIndex(String category) { boolean sawAtZero = false; char seenAtZero = '\u0000'; int i = 0; for (int leng = category.length(); i < leng; i++) { char ch = category.charAt(i); if (isLabelAnnotationIntroducingCharacter(ch)) { if (i == 0) { sawAtZero = true; seenAtZero = ch; } else if (sawAtZero && i > 1 && ch == seenAtZero) { sawAtZero = false; } else { break; } } } return i; } /** * Returns the basic syntactic category of a String. This implementation basically truncates stuff * after an occurrence of one of the <code>labelAnnotationIntroducingCharacters()</code>. However, * there is also special case stuff to deal with labelAnnotationIntroducingCharacters in category * labels: (i) if the first char is in this set, it's never truncated (e.g., '-' or '=' as a * token), and (ii) if it starts with one of this set, a second instance of the same item from * this set is also excluded (to deal with '-LLB-', '-RCB-', etc.). * * @param category The whole String name of the label * @return The basic category of the String */ public String basicCategory(String category) { if (category == null) { return null; } return category.substring(0, postBasicCategoryIndex(category)); } public String stripGF(String category) { if (category == null) { return null; } int index = category.lastIndexOf(gfCharacter); if (index > 0) { category = category.substring(0, index); } return category; } /** * Returns a {@link Function Function} object that maps Strings to Strings according to this * TreebankLanguagePack's basicCategory() method. 
* * @return The String->String Function object */ public Function<String, String> getBasicCategoryFunction() { return new BasicCategoryStringFunction(this); } private static class BasicCategoryStringFunction implements Function<String, String>, Serializable { private static final long serialVersionUID = 1L; private TreebankLanguagePack tlp; BasicCategoryStringFunction(TreebankLanguagePack tlp) { this.tlp = tlp; } public String apply(String in) { return tlp.basicCategory(in); } } private static class CategoryAndFunctionStringFunction implements Function<String, String>, Serializable { private static final long serialVersionUID = 1L; private TreebankLanguagePack tlp; CategoryAndFunctionStringFunction(TreebankLanguagePack tlp) { this.tlp = tlp; } public String apply(String in) { return tlp.categoryAndFunction(in); } } /** * Returns the syntactic category and 'function' of a String. This normally involves truncating * numerical coindexation showing coreference, etc. By 'function', this means keeping, say, Penn * Treebank functional tags or ICE phrasal functions, perhaps returning them as <code> * category-function</code>. * * <p>This implementation strips numeric tags after label introducing characters (assuming that * non-numeric things are functional tags). * * @param category The whole String name of the label * @return A String giving the category and function */ public String categoryAndFunction(String category) { if (category == null) { return null; } String catFunc = category; int i = lastIndexOfNumericTag(catFunc); while (i >= 0) { catFunc = catFunc.substring(0, i); i = lastIndexOfNumericTag(catFunc); } return catFunc; } /** * Returns the index within this string of the last occurrence of a * isLabelAnnotationIntroducingCharacter which is followed by only digits, corresponding to a * numeric tag at the end of the string. Example: <code>lastIndexOfNumericTag("NP-TMP-1") returns * 6</code>. 
* * @param category A String category * @return The index within this string of the last occurrence of a * isLabelAnnotationIntroducingCharacter which is followed by only digits */ private int lastIndexOfNumericTag(String category) { if (category == null) { return -1; } int last = -1; for (int i = category.length() - 1; i >= 0; i--) { if (isLabelAnnotationIntroducingCharacter(category.charAt(i))) { boolean onlyDigitsFollow = false; for (int j = i + 1; j < category.length(); j++) { onlyDigitsFollow = true; if (!(Character.isDigit(category.charAt(j)))) { onlyDigitsFollow = false; break; } } if (onlyDigitsFollow) { last = i; } } } return last; } /** * Returns a {@link Function Function} object that maps Strings to Strings according to this * TreebankLanguagePack's categoryAndFunction() method. * * @return The String->String Function object */ public Function<String, String> getCategoryAndFunctionFunction() { return new CategoryAndFunctionStringFunction(this); } /** * Say whether this character is an annotation introducing character. * * @param ch The character to check * @return Whether it is an annotation introducing character */ public boolean isLabelAnnotationIntroducingCharacter(char ch) { char[] cutChars = labelAnnotationIntroducingCharacters(); for (char cutChar : cutChars) { if (ch == cutChar) { return true; } } return false; } /** * Accepts a String that is a start symbol of the treebank. * * @return Whether this is a start symbol */ public boolean isStartSymbol(String str) { return startSymbolAcceptFilter.accept(str); } /** * Return a filter that accepts a String that is a start symbol of the treebank, and rejects * everything else. * * @return The filter */ public Filter<String> startSymbolAcceptFilter() { return startSymbolAcceptFilter; } /** * Returns a String array of treebank start symbols. 
* * @return The start symbols */ public abstract String[] startSymbols(); /** * Returns a String which is the first (perhaps unique) start symbol of the treebank, or null if * none is defined. * * @return The start symbol */ public String startSymbol() { String[] ssyms = startSymbols(); if (ssyms == null || ssyms.length == 0) { return null; } return ssyms[0]; } private final Filter<String> punctTagStringAcceptFilter = Filters.collectionAcceptFilter(punctuationTags()); private final Filter<String> punctWordStringAcceptFilter = Filters.collectionAcceptFilter(punctuationWords()); private final Filter<String> sFPunctTagStringAcceptFilter = Filters.collectionAcceptFilter(sentenceFinalPunctuationTags()); private final Filter<String> eIPunctTagStringAcceptFilter = Filters.collectionAcceptFilter(evalBIgnoredPunctuationTags()); private final Filter<String> startSymbolAcceptFilter = Filters.collectionAcceptFilter(startSymbols()); /** * Return a tokenizer which might be suitable for tokenizing text that will be used with this * Treebank/Language pair, without tokenizing carriage returns (i.e., treating them as white * space). The implementation in AbstractTreebankLanguagePack returns a factory for {@link * WhitespaceTokenizer}. * * @return A tokenizer */ public TokenizerFactory<? extends HasWord> getTokenizerFactory() { return WhitespaceTokenizer.factory(false); } /** * Return a GrammaticalStructureFactory suitable for this language/treebank. (To be overridden in * subclasses.) * * @return A GrammaticalStructureFactory suitable for this language/treebank */ public GrammaticalStructureFactory grammaticalStructureFactory() { throw new UnsupportedOperationException( "No GrammaticalStructureFactory defined for " + getClass().getName()); } /** * Return a GrammaticalStructureFactory suitable for this language/treebank. (To be overridden in * subclasses.) 
* * @return A GrammaticalStructureFactory suitable for this language/treebank */ public GrammaticalStructureFactory grammaticalStructureFactory(Filter<String> puncFilt) { return grammaticalStructureFactory(); } /** * Return a GrammaticalStructureFactory suitable for this language/treebank. (To be overridden in * subclasses.) * * @return A GrammaticalStructureFactory suitable for this language/treebank */ public GrammaticalStructureFactory grammaticalStructureFactory( Filter<String> puncFilt, HeadFinder typedDependencyHeadFinder) { return grammaticalStructureFactory(); } public char getGfCharacter() { return gfCharacter; } public void setGfCharacter(char gfCharacter) { this.gfCharacter = gfCharacter; } /** {@inheritDoc} */ public TreeReaderFactory treeReaderFactory() { return new PennTreeReaderFactory(); } /** {@inheritDoc} */ public TokenizerFactory<Tree> treeTokenizerFactory() { return new TreeTokenizerFactory(treeReaderFactory()); } /** Returns a morphological feature specification for words in this language. */ public MorphoFeatureSpecification morphFeatureSpec() { return null; } }
/**
 * Returns a filter that accepts everything except a String that is a punctuation tag that should
 * be ignored by EVALB-style evaluation. Traditionally, EVALB has ignored a subset of the total
 * set of punctuation tags in the English Penn Treebank (quotes and period, comma, colon, etc.,
 * but not brackets).
 *
 * <p>The result is obtained by passing the EVALB-ignored punctuation tag accept filter to
 * {@code Filters.notFilter}; that call is made on every invocation.
 *
 * @return The Filter
 */
public Filter<String> evalBIgnoredPunctuationTagRejectFilter() { return Filters.notFilter(eIPunctTagStringAcceptFilter); }
/**
 * Returns a filter that accepts a String that is not a punctuation word, and rejects punctuation.
 * If one can't tell for sure (as for ' in the Penn Treebank), it makes the best guess that it
 * can.
 *
 * <p>The result is obtained by passing the punctuation word accept filter to
 * {@code Filters.notFilter}; that call is made on every invocation.
 *
 * @return The Filter
 */
public Filter<String> punctuationWordRejectFilter() { return Filters.notFilter(punctWordStringAcceptFilter); }
/**
 * Returns a filter that rejects a String that is a punctuation tag name, and accepts everything
 * else.
 *
 * <p>The result is obtained by passing the punctuation tag accept filter to
 * {@code Filters.notFilter}; that call is made on every invocation.
 *
 * @return The filter
 */
public Filter<String> punctuationTagRejectFilter() { return Filters.notFilter(punctTagStringAcceptFilter); }
/** * Tests that we can extract the basic grammatical relations correctly from some hard-coded trees. * * <p>Sentence examples from the manual to at least test each relation. */ public void testBasicRelations() { Pair<String, String>[] examples = ErasureUtils.uncheckedCast( new Pair[] { // Gloss: Shanghai Pudong de orderly advance T( "(NP (DNP (NP (NP (NR 浦东)) (NP (NN 开发))) (DEG 的)) (ADJP (JJ 有序)) (NP (NN 进行)))", C( "nn(开发-2, 浦东-1)", "assmod(进行-5, 开发-2)", "case(开发-2, 的-3)", "amod(进行-5, 有序-4)", "root(ROOT-0, 进行-5)")), // Gloss: Shanghai Pudong expand and legal-system synchronous T( "(ROOT (IP (NP (NP (NR 上海) (NR 浦东)) (NP (NN 开发) (CC 与) (NN 法制) (NN 建设))) (VP (VV 同步))))", C( "nn(浦东-2, 上海-1)", "nn(建设-6, 浦东-2)", "conj(建设-6, 开发-3)", "cc(建设-6, 与-4)", "nn(建设-6, 法制-5)", "nsubj(同步-7, 建设-6)", "root(ROOT-0, 同步-7)")), // Gloss: this-year T("(LCP (NP (NT 近年)) (LC 来))", C("root(ROOT-0, 近年-1)", "case(近年-1, 来-2)")), // Gloss: according country and Shanghai de relevant law T( "(PP (P 根据) (NP (DNP (NP (NP (NN 国家)) (CC 和) (NP (NR 上海市))) (DEG 的)) (ADJP (JJ 有关)) (NP (NN 规定))))", C( "case(规定-7, 根据-1)", "conj(上海市-4, 国家-2)", "cc(上海市-4, 和-3)", "assmod(规定-7, 上海市-4)", "case(上海市-4, 的-5)", "amod(规定-7, 有关-6)", "root(ROOT-0, 规定-7)")), // Gloss: building is expand Shanghai de primary economic activity T( "(IP (NP (NN 建筑)) (VP (VC 是) (NP (CP (IP (VP (VV 开发) (NP (NR 浦东)))) (DEC 的)) (QP (CD 一) (CLP (M 项))) (ADJP (JJ 主要)) (NP (NN 经济) (NN 活动)))))", C( "nsubj(活动-10, 建筑-1)", "cop(活动-10, 是-2)", "relcl(活动-10, 开发-3)", "dobj(开发-3, 浦东-4)", "mark(开发-3, 的-5)", "nummod(项-7, 一-6)", "clf(活动-10, 项-7)", "amod(活动-10, 主要-8)", "nn(活动-10, 经济-9)", "root(ROOT-0, 活动-10)")), // Gloss: nickel has-been named modern industry de vitamins T( "(IP (NP (NN 镍)) (VP (SB 被) (VP (VV 称作) (NP (PU “) (DNP (NP (ADJP (JJ 现代)) (NP (NN 工业))) (DEG 的)) (NP (NN 维生素)) (PU ”)))))", C( "nsubjpass(称作-3, 镍-1)", "auxpass(称作-3, 被-2)", "root(ROOT-0, 称作-3)", "punct(维生素-8, “-4)", "amod(工业-6, 现代-5)", "assmod(维生素-8, 工业-6)", "case(工业-6, 的-7)", "dobj(称作-3, 
维生素-8)", "punct(维生素-8, ”-9)")), // Gloss: once revealed then was included legal-system path T( "(IP (VP (VP (ADVP (AD 一)) (VP (VV 出现))) (VP (ADVP (AD 就)) (VP (SB 被) (VP (VV 纳入) (NP (NN 法制) (NN 轨道)))))))))))", C( "advmod(出现-2, 一-1)", "root(ROOT-0, 出现-2)", "advmod(纳入-5, 就-3)", "auxpass(纳入-5, 被-4)", "dep(出现-2, 纳入-5)", "nn(轨道-7, 法制-6)", "dobj(纳入-5, 轨道-7)")), T( "(IP (NP (NP (NR 格林柯尔)) (NP (NN 制冷剂)) (PRN (PU () (NP (NR 中国)) (PU ))) (ADJP (JJ 有限)) (NP (NN 公司))) (VP (VC 是) (NP (CP (CP (IP (NP (NP (NR 格林柯尔) (NN 集团) (NR 北美) (NN 公司)) (CC 与) (NP (NP (NR 中国) (NR 天津)) (NP (NN 开发区)) (ADJP (JJ 总)) (NP (NN 公司))) (CC 和) (NP (NP (NR 中国)) (NP (NR 南方)) (NP (NN 证券)) (ADJP (JJ 有限)) (NP (NN 公司)))) (VP (VV 合建))) (DEC 的))) (ADJP (JJ 合资)) (NP (NN 企业)))) (PU 。))", C( "nn(公司-7, 格林柯尔-1)", "nn(公司-7, 制冷剂-2)", "punct(中国-4, (-3)", "prnmod(公司-7, 中国-4)", "punct(中国-4, )-5)", "amod(公司-7, 有限-6)", "nsubj(企业-28, 公司-7)", "cop(企业-28, 是-8)", "nn(公司-12, 格林柯尔-9)", "nn(公司-12, 集团-10)", "nn(公司-12, 北美-11)", "conj(公司-24, 公司-12)", "cc(公司-24, 与-13)", "nn(天津-15, 中国-14)", "nn(公司-18, 天津-15)", "nn(公司-18, 开发区-16)", "amod(公司-18, 总-17)", "conj(公司-24, 公司-18)", "cc(公司-24, 和-19)", "nn(公司-24, 中国-20)", "nn(公司-24, 南方-21)", "nn(公司-24, 证券-22)", "amod(公司-24, 有限-23)", "nsubj(合建-25, 公司-24)", "relcl(企业-28, 合建-25)", "mark(合建-25, 的-26)", "amod(企业-28, 合资-27)", "root(ROOT-0, 企业-28)", "punct(企业-28, 。-29)")), T( "(IP (NP (NR 汕头) (NN 机场)) (VP (VV 开通) (NP (NN 国际) (NN 国内) (NN 航线)) (QP (CD 四十四) (CLP (M 条)))) (PU 。))", C( "nn(机场-2, 汕头-1)", "nsubj(开通-3, 机场-2)", "root(ROOT-0, 开通-3)", "nn(航线-6, 国际-4)", "nn(航线-6, 国内-5)", "dobj(开通-3, 航线-6)", "nummod(条-8, 四十四-7)", "range(开通-3, 条-8)", "punct(开通-3, 。-9)")), T( "(VP (NP (NT 以前)) (ADVP (AD 不)) (ADVP (AD 曾)) (VP (VV 遇到) (AS 过))))", C( "tmod(遇到-4, 以前-1)", "neg(遇到-4, 不-2)", "advmod(遇到-4, 曾-3)", "root(ROOT-0, 遇到-4)", "asp(遇到-4, 过-5)")), // TODO(pliang): add more test cases for all the relations not covered (see WARNING // below) }); Set<String> ignoreRelations = new HashSet<>(Arrays.asList("subj", "obj", 
"mod")); // Make sure all the relations are tested for Set<String> testedRelations = new HashSet<String>(); for (Pair<String, String> ex : examples) { for (String item : ex.second.split("\n")) testedRelations.add(item.substring(0, item.indexOf('('))); } for (String relation : UniversalChineseGrammaticalRelations.shortNameToGRel.keySet()) { if (!testedRelations.contains(relation)) if (!ignoreRelations.contains(relation)) { System.err.println("WARNING: relation '" + relation + "' not tested"); } } TreeReaderFactory trf = new PennTreeReaderFactory(); for (Pair<String, String> ex : examples) { String testTree = ex.first; String testAnswer = ex.second; // specifying our own TreeReaderFactory is vital so that functional // categories - that is -TMP and -ADV in particular - are not stripped off Tree tree = Tree.valueOf(testTree, trf); GrammaticalStructure gs = new UniversalChineseGrammaticalStructure(tree, Filters.acceptFilter()); // include punct assertEquals( "Unexpected CC processed dependencies for tree " + testTree, testAnswer, UniversalChineseGrammaticalStructure.dependenciesToString( gs, gs.typedDependenciesCCprocessed(GrammaticalStructure.Extras.MAXIMAL), tree, false, false)); } }