/**
 * For testing -- CURRENTLY BROKEN!!!
 *
 * @param args treebankPath trainNums testNums
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
  if (args.length != 3) {
    throw new RuntimeException("args: treebankPath trainNums testNums");
  }
  ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
  ctpp.charTags = true;
  // TODO: these options are getting clobbered by reading in the
  // parser object (unless it's a text file parser?)
  Options op = new Options(ctpp);
  op.doDep = false;
  op.testOptions.maxLength = 90;

  LexicalizedParser lp;
  try {
    FileFilter trainFilt = new NumberRangesFileFilter(args[1], false);
    lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op);
    try {
      String filename = "chineseCharTagPCFG.ser.gz";
      System.err.println("Writing parser in serialized format to file " + filename + ' ');
      System.err.flush();
      ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
      out.writeObject(lp);
      out.close();
      System.err.println("done.");
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }
  } catch (IllegalArgumentException e) {
    lp = LexicalizedParser.loadModel(args[1], op);
  }

  FileFilter testFilt = new NumberRangesFileFilter(args[2], false);
  MemoryTreebank testTreebank = ctpp.memoryTreebank();
  testTreebank.loadPath(new File(args[0]), testFilt);
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true);
  WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
  WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
  EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck);
  // System.out.println("Preterminals:" + preterminals);
  System.out.println("Testing...");
  for (Tree gold : testTreebank) {
    Tree tree;
    try {
      tree = lp.parseTree(gold.yieldHasWord());
      if (tree == null) {
        System.out.println("Failed to parse " + gold.yieldHasWord());
        continue;
      }
    } catch (Exception e) {
      e.printStackTrace();
      continue;
    }
    gold = gold.firstChild();
    pw.println(Sentence.listToString(gold.preTerminalYield()));
    pw.println(Sentence.listToString(gold.yield()));
    gold.pennPrint(pw);

    pw.println(tree.preTerminalYield());
    pw.println(tree.yield());
    tree.pennPrint(pw);
    // Collection allBrackets = WordCatConstituent.allBrackets(tree);
    // Collection goldBrackets = WordCatConstituent.allBrackets(gold);
    // eval.eval(allBrackets, goldBrackets);
    eval.displayLast();
  }
  System.out.println();
  System.out.println();
  eval.display();
}
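// Illustrative invocation (a sketch, not from the original source): it assumes this main
// lives in edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams; the treebank path
// and the file-number ranges are placeholders. NumberRangesFileFilter accepts comma-separated
// number ranges selecting which treebank files go into the training and test sets.
//
//   java edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams \
//       /path/to/ctb 1-270 301-325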
/**
 * transformTree does all language-specific tree transformations. Any parameterizations
 * should be inside the specific TreebankLangParserParams class.
 */
@Override
public Tree transformTree(Tree t, Tree root) {
  if (t == null || t.isLeaf()) {
    return t;
  }
  String parentStr;
  String grandParentStr;
  Tree parent;
  Tree grandParent;
  if (root == null || t.equals(root)) {
    parent = null;
    parentStr = "";
  } else {
    parent = t.parent(root);
    parentStr = parent.label().value();
  }
  if (parent == null || parent.equals(root)) {
    grandParent = null;
    grandParentStr = "";
  } else {
    grandParent = parent.parent(root);
    grandParentStr = grandParent.label().value();
  }
  String baseParentStr = ctlp.basicCategory(parentStr);
  String baseGrandParentStr = ctlp.basicCategory(grandParentStr);
  CoreLabel lab = (CoreLabel) t.label();
  String word = lab.word();
  String tag = lab.tag();
  String baseTag = ctlp.basicCategory(tag);
  String category = lab.value();
  String baseCategory = ctlp.basicCategory(category);

  if (t.isPreTerminal()) { // it's a POS tag
    List<String> leftAunts = listBasicCategories(SisterAnnotationStats.leftSisterLabels(parent, grandParent));
    List<String> rightAunts = listBasicCategories(SisterAnnotationStats.rightSisterLabels(parent, grandParent));

    // Chinese-specific punctuation splits
    if (chineseSplitPunct && baseTag.equals("PU")) {
      if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().accept(word)) {
        tag = tag + "-DOU";
        // System.out.println("Punct: Split dou hao"); // debugging
      } else if (ChineseTreebankLanguagePack.chineseCommaAcceptFilter().accept(word)) {
        tag = tag + "-COMMA";
        // System.out.println("Punct: Split comma"); // debugging
      } else if (ChineseTreebankLanguagePack.chineseColonAcceptFilter().accept(word)) {
        tag = tag + "-COLON";
        // System.out.println("Punct: Split colon"); // debugging
      } else if (ChineseTreebankLanguagePack.chineseQuoteMarkAcceptFilter().accept(word)) {
        if (chineseSplitPunctLR) {
          if (ChineseTreebankLanguagePack.chineseLeftQuoteMarkAcceptFilter().accept(word)) {
            tag += "-LQUOTE";
          } else {
            tag += "-RQUOTE";
          }
        } else {
          tag = tag + "-QUOTE";
        }
        // System.out.println("Punct: Split quote"); // debugging
      } else if (ChineseTreebankLanguagePack.chineseEndSentenceAcceptFilter().accept(word)) {
        tag = tag + "-ENDSENT";
        // System.out.println("Punct: Split end sent"); // debugging
      } else if (ChineseTreebankLanguagePack.chineseParenthesisAcceptFilter().accept(word)) {
        if (chineseSplitPunctLR) {
          if (ChineseTreebankLanguagePack.chineseLeftParenthesisAcceptFilter().accept(word)) {
            tag += "-LPAREN";
          } else {
            tag += "-RPAREN";
          }
        } else {
          tag += "-PAREN";
          // printlnErr("Just used -PAREN annotation");
          // printlnErr(word);
          // throw new RuntimeException();
        }
        // System.out.println("Punct: Split paren"); // debugging
      } else if (ChineseTreebankLanguagePack.chineseDashAcceptFilter().accept(word)) {
        tag = tag + "-DASH";
        // System.out.println("Punct: Split dash"); // debugging
      } else if (ChineseTreebankLanguagePack.chineseOtherAcceptFilter().accept(word)) {
        tag = tag + "-OTHER";
      } else {
        printlnErr("Unknown punct (you should add it to CTLP): " + tag + " |" + word + "|");
      }
    } else if (chineseSplitDouHao) { // only split DouHao
      if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().accept(word) && baseTag.equals("PU")) {
        tag = tag + "-DOU";
      }
    }

    // Chinese-specific POS tag splits (non-punctuation)
    if (tagWordSize) {
      int l = word.length();
      tag += "-" + l + "CHARS";
    }
    if (mergeNNVV && baseTag.equals("NN")) {
      tag = "VV";
    }
    if ((chineseSelectiveTagPA || chineseVerySelectiveTagPA) &&
        (baseTag.equals("CC") || baseTag.equals("P"))) {
      tag += "-" + baseParentStr;
    }
    if (chineseSelectiveTagPA && (baseTag.equals("VV"))) {
      tag += "-" + baseParentStr;
    }
    if (markMultiNtag && tag.startsWith("N")) {
      for (int i = 0; i < parent.numChildren(); i++) {
        if (parent.children()[i].label().value().startsWith("N") && parent.children()[i] != t) {
          tag += "=N";
          // System.out.println("Found multi=N rewrite");
        }
      }
    }
    if (markVVsisterIP && baseTag.equals("VV")) {
      boolean seenIP = false;
      for (int i = 0; i < parent.numChildren(); i++) {
        if (parent.children()[i].label().value().startsWith("IP")) {
          seenIP = true;
        }
      }
      if (seenIP) {
        tag += "-IP";
        // System.out.println("Found VV with IP sister"); // testing
      }
    }
    if (markPsisterIP && baseTag.equals("P")) {
      boolean seenIP = false;
      for (int i = 0; i < parent.numChildren(); i++) {
        if (parent.children()[i].label().value().startsWith("IP")) {
          seenIP = true;
        }
      }
      if (seenIP) {
        tag += "-IP";
      }
    }
    if (markADgrandchildOfIP && baseTag.equals("AD") && baseGrandParentStr.equals("IP")) {
      tag += "~IP";
      // System.out.println("Found AD with IP grandparent"); // testing
    }
    if (gpaAD && baseTag.equals("AD")) {
      tag += "~" + baseGrandParentStr;
      // System.out.println("Found AD with grandparent " + grandParentStr); // testing
    }
    if (markPostverbalP && leftAunts.contains("VV") && baseTag.equals("P")) {
      // System.out.println("Found post-verbal P");
      tag += "^=lVV";
    }
    // end Chinese-specific tag splits

    Label label = new CategoryWordTag(tag, word, tag);
    t.setLabel(label);
  } else {
    // it's a phrasal category
    Tree[] kids = t.children();

    // Chinese-specific category splits
    List<String> leftSis = listBasicCategories(SisterAnnotationStats.leftSisterLabels(t, parent));
    List<String> rightSis = listBasicCategories(SisterAnnotationStats.rightSisterLabels(t, parent));

    if (paRootDtr && baseParentStr.equals("ROOT")) {
      category += "^ROOT";
    }
    if (markIPsisterBA && baseCategory.equals("IP")) {
      if (leftSis.contains("BA")) {
        category += "=BA";
        // System.out.println("Found IP sister of BA");
      }
    }
    if (dominatesV && hasV(t.preTerminalYield())) {
      // mark categories containing a verb
      category += "-v";
    }
    if (markIPsisterVVorP && baseCategory.equals("IP")) {
      // todo: cdm: is just looking for "P" here selective enough??
      if (leftSis.contains("VV") || leftSis.contains("P")) {
        category += "=VVP";
      }
    }
    if (markIPsisDEC && baseCategory.equals("IP")) {
      if (rightSis.contains("DEC")) {
        category += "=DEC";
        // System.out.println("Found prenominal IP");
      }
    }
    if (baseCategory.equals("VP")) {
      // cdm 2008: this used to just check that it startsWith("VP"), but
      // I think that was bad because it also matched VPT verb compounds
      if (chineseSplitVP == 3) {
        boolean hasCC = false;
        boolean hasPU = false;
        boolean hasLexV = false;
        for (Tree kid : kids) {
          if (kid.label().value().startsWith("CC")) {
            hasCC = true;
          } else if (kid.label().value().startsWith("PU")) {
            hasPU = true;
          } else if (StringUtils.lookingAt(kid.label().value(), "(V[ACEV]|VCD|VCP|VNV|VPT|VRD|VSB)")) {
            hasLexV = true;
          }
        }
        if (hasCC || (hasPU && !hasLexV)) {
          category += "-CRD";
          // System.out.println("Found coordinate VP"); // testing
        } else if (hasLexV) {
          category += "-COMP";
          // System.out.println("Found complementing VP"); // testing
        } else {
          category += "-ADJT";
          // System.out.println("Found adjoining VP"); // testing
        }
      } else if (chineseSplitVP >= 1) {
        boolean hasBA = false;
        for (Tree kid : kids) {
          if (kid.label().value().startsWith("BA")) {
            hasBA = true;
          } else if (chineseSplitVP == 2 && tlp.basicCategory(kid.label().value()).equals("VP")) {
            for (Tree kidkid : kid.children()) {
              if (kidkid.label().value().startsWith("BA")) {
                hasBA = true;
              }
            }
          }
        }
        if (hasBA) {
          category += "-BA";
        }
      }
    }
    if (markVPadjunct && baseParentStr.equals("VP")) {
      // cdm 2008: This used to use startsWith("VP") but changed to baseCat
      Tree[] sisters = parent.children();
      boolean hasVPsister = false;
      boolean hasCC = false;
      boolean hasPU = false;
      boolean hasLexV = false;
      for (Tree sister : sisters) {
        if (tlp.basicCategory(sister.label().value()).equals("VP")) {
          hasVPsister = true;
        }
        if (sister.label().value().startsWith("CC")) {
          hasCC = true;
        }
        if (sister.label().value().startsWith("PU")) {
          hasPU = true;
        }
        if (StringUtils.lookingAt(sister.label().value(), "(V[ACEV]|VCD|VCP|VNV|VPT|VRD|VSB)")) {
          hasLexV = true;
        }
      }
      if (hasVPsister && !(hasCC || hasPU || hasLexV)) {
        category += "-VPADJ";
        // System.out.println("Found adjunct of VP"); // testing
      }
    }
    if (markNPmodNP && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
      if (rightSis.contains("NP")) {
        category += "=MODIFIERNP";
        // System.out.println("Found NP modifier of NP"); // testing
      }
    }
    if (markModifiedNP && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
      if (rightSis.isEmpty() &&
          (leftSis.contains("ADJP") || leftSis.contains("NP") || leftSis.contains("DNP") ||
           leftSis.contains("QP") || leftSis.contains("CP") || leftSis.contains("PP"))) {
        category += "=MODIFIEDNP";
        // System.out.println("Found modified NP"); // testing
      }
    }
    if (markNPconj && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
      if (rightSis.contains("CC") || rightSis.contains("PU") ||
          leftSis.contains("CC") || leftSis.contains("PU")) {
        category += "=CONJ";
        // System.out.println("Found NP conjunct"); // testing
      }
    }
    if (markIPconj && baseCategory.equals("IP") && baseParentStr.equals("IP")) {
      Tree[] sisters = parent.children();
      boolean hasCommaSis = false;
      boolean hasIPSis = false;
      for (Tree sister : sisters) {
        if (ctlp.basicCategory(sister.label().value()).equals("PU") &&
            ChineseTreebankLanguagePack.chineseCommaAcceptFilter()
                .accept(sister.children()[0].label().toString())) {
          hasCommaSis = true;
          // System.out.println("Found CommaSis"); // testing
        }
        if (ctlp.basicCategory(sister.label().value()).equals("IP") && sister != t) {
          hasIPSis = true;
        }
      }
      if (hasCommaSis && hasIPSis) {
        category += "-CONJ";
        // System.out.println("Found IP conjunct"); // testing
      }
    }
    if (unaryIP && baseCategory.equals("IP") && t.numChildren() == 1) {
      category += "-U";
      // System.out.println("Found unary IP"); // testing
    }
    if (unaryCP && baseCategory.equals("CP") && t.numChildren() == 1) {
      category += "-U";
      // System.out.println("Found unary CP"); // testing
    }
    if (splitBaseNP && baseCategory.equals("NP")) {
      if (t.isPrePreTerminal()) {
        category = category + "-B";
      }
    }
    // if (Test.verbose) printlnErr(baseCategory + " " + leftSis.toString()); // debugging
    if (markPostverbalPP && leftSis.contains("VV") && baseCategory.equals("PP")) {
      // System.out.println("Found post-verbal PP");
      category += "=lVV";
    }
    if ((markADgrandchildOfIP || gpaAD) &&
        listBasicCategories(SisterAnnotationStats.kidLabels(t)).contains("AD")) {
      category += "^ADVP";
    }
    if (markCC) {
      // was: for (int i = 0; i < kids.length; i++) {
      // This second version takes an idea from Collins: don't count
      // marginal conjunctions which don't conjoin 2 things.
      for (int i = 1; i < kids.length - 1; i++) {
        String cat2 = kids[i].label().value();
        if (cat2.startsWith("CC")) {
          category += "-CC";
        }
      }
    }

    Label label = new CategoryWordTag(category, word, tag);
    t.setLabel(label);
  }
  return t;
}
/** @param args language, treebank path, and a comma-separated list of morphological features */
public static void main(String[] args) {
  if (args.length != 3) {
    System.err.printf("Usage: java %s language filename features%n",
        TreebankFactoredLexiconStats.class.getName());
    System.exit(-1);
  }
  Language language = Language.valueOf(args[0]);
  TreebankLangParserParams tlpp = language.params;
  if (language.equals(Language.Arabic)) {
    String[] options = {"-arabicFactored"};
    tlpp.setOptionFlag(options, 0);
  } else {
    String[] options = {"-frenchFactored"};
    tlpp.setOptionFlag(options, 0);
  }
  Treebank tb = tlpp.diskTreebank();
  tb.loadPath(args[1]);

  MorphoFeatureSpecification morphoSpec = language.equals(Language.Arabic)
      ? new ArabicMorphoFeatureSpecification()
      : new FrenchMorphoFeatureSpecification();

  String[] features = args[2].trim().split(",");
  for (String feature : features) {
    morphoSpec.activate(MorphoFeatureType.valueOf(feature));
  }

  // Counters
  Counter<String> wordTagCounter = new ClassicCounter<>(30000);
  Counter<String> morphTagCounter = new ClassicCounter<>(500);
  // Counter<String> signatureTagCounter = new ClassicCounter<String>();
  Counter<String> morphCounter = new ClassicCounter<>(500);
  Counter<String> wordCounter = new ClassicCounter<>(30000);
  Counter<String> tagCounter = new ClassicCounter<>(300);
  Counter<String> lemmaCounter = new ClassicCounter<>(25000);
  Counter<String> lemmaTagCounter = new ClassicCounter<>(25000);
  Counter<String> richTagCounter = new ClassicCounter<>(1000);
  Counter<String> reducedTagCounter = new ClassicCounter<>(500);
  Counter<String> reducedTagLemmaCounter = new ClassicCounter<>(500);

  Map<String, Set<String>> wordLemmaMap = Generics.newHashMap();

  TwoDimensionalIntCounter<String, String> lemmaReducedTagCounter = new TwoDimensionalIntCounter<>(30000);
  TwoDimensionalIntCounter<String, String> reducedTagTagCounter = new TwoDimensionalIntCounter<>(500);
  TwoDimensionalIntCounter<String, String> tagReducedTagCounter = new TwoDimensionalIntCounter<>(300);

  int numTrees = 0;
  for (Tree tree : tb) {
    for (Tree subTree : tree) {
      if (!subTree.isLeaf()) {
        tlpp.transformTree(subTree, tree);
      }
    }
    List<Label> pretermList = tree.preTerminalYield();
    List<Label> yield = tree.yield();
    assert yield.size() == pretermList.size();

    int yieldLen = yield.size();
    for (int i = 0; i < yieldLen; ++i) {
      String tag = pretermList.get(i).value();
      String word = yield.get(i).value();
      String morph = ((CoreLabel) yield.get(i)).originalText();
      // Note: if there is no lemma, then we use the surface form.
      Pair<String, String> lemmaTag = MorphoFeatureSpecification.splitMorphString(word, morph);
      String lemma = lemmaTag.first();
      String richTag = lemmaTag.second();

      // WSGDEBUG
      if (tag.contains("MW")) lemma += "-MWE";

      lemmaCounter.incrementCount(lemma);
      lemmaTagCounter.incrementCount(lemma + tag);

      richTagCounter.incrementCount(richTag);
      String reducedTag = morphoSpec.strToFeatures(richTag).toString();
      reducedTagCounter.incrementCount(reducedTag);

      reducedTagLemmaCounter.incrementCount(reducedTag + lemma);

      wordTagCounter.incrementCount(word + tag);
      morphTagCounter.incrementCount(morph + tag);
      morphCounter.incrementCount(morph);
      wordCounter.incrementCount(word);
      tagCounter.incrementCount(tag);

      reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
      if (wordLemmaMap.containsKey(word)) {
        wordLemmaMap.get(word).add(lemma);
      } else {
        Set<String> lemmas = Generics.newHashSet(1);
        // Bug fix: record this first lemma; otherwise words seen only once end up with an
        // empty lemma set and are reported as "NO LEMMAS FOR WORD" below.
        lemmas.add(lemma);
        wordLemmaMap.put(word, lemmas);
      }
      lemmaReducedTagCounter.incrementCount(lemma, reducedTag);
      reducedTagTagCounter.incrementCount(lemma + reducedTag, tag);
      tagReducedTagCounter.incrementCount(tag, reducedTag);
    }
    ++numTrees;
  }

  // Barf...
  System.out.println("Language: " + language.toString());
  System.out.printf("#trees:\t%d%n", numTrees);
  System.out.printf("#tokens:\t%d%n", (int) wordCounter.totalCount());
  System.out.printf("#words:\t%d%n", wordCounter.keySet().size());
  System.out.printf("#tags:\t%d%n", tagCounter.keySet().size());
  System.out.printf("#wordTagPairs:\t%d%n", wordTagCounter.keySet().size());
  System.out.printf("#lemmas:\t%d%n", lemmaCounter.keySet().size());
  System.out.printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.keySet().size());
  System.out.printf("#feattags:\t%d%n", reducedTagCounter.keySet().size());
  System.out.printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.keySet().size());
  System.out.printf("#richtags:\t%d%n", richTagCounter.keySet().size());
  System.out.printf("#richtag+lemma:\t%d%n", morphCounter.keySet().size());
  System.out.printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.keySet().size());

  // Extra
  System.out.println("==================");
  StringBuilder sbNoLemma = new StringBuilder();
  StringBuilder sbMultLemmas = new StringBuilder();
  for (Map.Entry<String, Set<String>> wordLemmas : wordLemmaMap.entrySet()) {
    String word = wordLemmas.getKey();
    Set<String> lemmas = wordLemmas.getValue();
    if (lemmas.size() == 0) {
      sbNoLemma.append("NO LEMMAS FOR WORD: " + word + "\n");
      continue;
    }
    if (lemmas.size() > 1) {
      sbMultLemmas.append("MULTIPLE LEMMAS: " + word + " " + setToString(lemmas) + "\n");
      continue;
    }
    String lemma = lemmas.iterator().next();
    Set<String> reducedTags = lemmaReducedTagCounter.getCounter(lemma).keySet();
    if (reducedTags.size() > 1) {
      System.out.printf("%s --> %s%n", word, lemma);
      for (String reducedTag : reducedTags) {
        int count = lemmaReducedTagCounter.getCount(lemma, reducedTag);
        String posTags = setToString(reducedTagTagCounter.getCounter(lemma + reducedTag).keySet());
        System.out.printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
      }
      System.out.println();
    }
  }
  System.out.println("==================");
  System.out.println(sbNoLemma.toString());
  System.out.println(sbMultLemmas.toString());
  System.out.println("==================");

  List<String> tags = new ArrayList<>(tagReducedTagCounter.firstKeySet());
  Collections.sort(tags);
  for (String tag : tags) {
    System.out.println(tag);
    Set<String> reducedTags = tagReducedTagCounter.getCounter(tag).keySet();
    for (String reducedTag : reducedTags) {
      int count = tagReducedTagCounter.getCount(tag, reducedTag);
      // reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
      System.out.printf("\t%s\t%d%n", reducedTag, count);
    }
    System.out.println();
  }
  System.out.println("==================");
}