List<Tree> prune(List<Tree> treeList, Label label, int start, int end) { // get reference tree if (treeList.size() == 1) { return treeList; } Tree testTree = treeList.get(0).treeFactory().newTreeNode(label, treeList); int goal = Numberer.getGlobalNumberer("states").number(label.value()); Tree tempTree = parser.extractBestParse(goal, start, end); // parser.restoreUnaries(tempTree); Tree pcfgTree = debinarizer.transformTree(tempTree); Set<Constituent> pcfgConstituents = pcfgTree.constituents(new LabeledScoredConstituentFactory()); // delete child labels that are not in reference but do not cross reference List<Tree> prunedChildren = new ArrayList<Tree>(); int childStart = 0; for (int c = 0, numCh = testTree.numChildren(); c < numCh; c++) { Tree child = testTree.getChild(c); boolean isExtra = true; int childEnd = childStart + child.yield().size(); Constituent childConstituent = new LabeledScoredConstituent(childStart, childEnd, child.label(), 0); if (pcfgConstituents.contains(childConstituent)) { isExtra = false; } if (childConstituent.crosses(pcfgConstituents)) { isExtra = false; } if (child.isLeaf() || child.isPreTerminal()) { isExtra = false; } if (pcfgTree.yield().size() != testTree.yield().size()) { isExtra = false; } if (!label.value().startsWith("NP^NP")) { isExtra = false; } if (isExtra) { System.err.println( "Pruning: " + child.label() + " from " + (childStart + start) + " to " + (childEnd + start)); System.err.println("Was: " + testTree + " vs " + pcfgTree); prunedChildren.addAll(child.getChildrenAsList()); } else { prunedChildren.add(child); } childStart = childEnd; } return prunedChildren; }
@Override public void evaluate(Tree guess, Tree gold, PrintWriter pw) { if (gold == null || guess == null) { System.err.printf( "%s: Cannot compare against a null gold or guess tree!\n", this.getClass().getName()); return; } else if (guess.yield().size() != gold.yield().size()) { System.err.println("Warning: yield differs:"); System.err.println("Guess: " + Sentence.listToString(guess.yield())); System.err.println("Gold: " + Sentence.listToString(gold.yield())); } super.evaluate(guess, gold, pw); }
public static String getCleanedUpYield(Tree inputTree) { Tree copyTree = inputTree.deepCopy(); if (DEBUG) System.err.println(copyTree.toString()); String res = copyTree.yield().toString(); if (res.length() > 1) { res = res.substring(0, 1).toUpperCase() + res.substring(1); } // (ROOT (S (NP (NNP Jaguar) (NNS shares)) (VP (VBD skyrocketed) (NP (NN yesterday)) (PP (IN // after) (NP (NP (NNP Mr.) (NNP Ridley) (POS 's)) (NN announcement)))) (. .))) res = res.replaceAll("\\s([\\.,!\\?\\-;:])", "$1"); res = res.replaceAll("(\\$)\\s", "$1"); res = res.replaceAll("can not", "cannot"); res = res.replaceAll("\\s*-LRB-\\s*", " ("); res = res.replaceAll("\\s*-RRB-\\s*", ") "); res = res.replaceAll("\\s*([\\.,?!])\\s*", "$1 "); res = res.replaceAll("\\s+''", "''"); // res = res.replaceAll("\"", ""); res = res.replaceAll("``\\s+", "``"); res = res.replaceAll("\\-[LR]CB\\-", ""); // brackets, e.g., [sic] // remove extra spaces res = res.replaceAll("\\s\\s+", " "); res = res.trim(); return res; }
Tree prune(Tree tree, int start) { if (tree.isLeaf() || tree.isPreTerminal()) { return tree; } // check each node's children for deletion List<Tree> children = helper(tree.getChildrenAsList(), start); children = prune(children, tree.label(), start, start + tree.yield().size()); return tree.treeFactory().newTreeNode(tree.label(), children); }
private List<Tree> helper(List<Tree> treeList, int start) { List<Tree> newTreeList = new ArrayList<Tree>(treeList.size()); for (Tree tree : treeList) { int end = start + tree.yield().size(); newTreeList.add(prune(tree, start)); start = end; } return newTreeList; }
/** * Return a null list if we don't care about context words, return a list of the words at the * leaves of the tree if we do care */ private List<String> getContextWords(Tree tree) { List<String> words = null; if (op.trainOptions.useContextWords) { words = Generics.newArrayList(); List<Label> leaves = tree.yield(); for (Label word : leaves) { words.add(word.value()); } } return words; }
/** Reads an annotation from the given filename using the requested input. */ public static List<Annotation> getAnnotations( StanfordCoreNLP tokenizer, Input inputFormat, String filename, boolean filterUnknown) { switch (inputFormat) { case TEXT: { String text = IOUtils.slurpFileNoExceptions(filename); Annotation annotation = new Annotation(text); tokenizer.annotate(annotation); List<Annotation> annotations = Generics.newArrayList(); for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { Annotation nextAnnotation = new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class)); nextAnnotation.set( CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence)); annotations.add(nextAnnotation); } return annotations; } case TREES: { List<Tree> trees; if (filterUnknown) { trees = SentimentUtils.readTreesWithGoldLabels(filename); trees = SentimentUtils.filterUnknownRoots(trees); } else { trees = Generics.newArrayList(); MemoryTreebank treebank = new MemoryTreebank("utf-8"); treebank.loadPath(filename, null); for (Tree tree : treebank) { trees.add(tree); } } List<Annotation> annotations = Generics.newArrayList(); for (Tree tree : trees) { CoreMap sentence = new Annotation(Sentence.listToString(tree.yield())); sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree); List<CoreMap> sentences = Collections.singletonList(sentence); Annotation annotation = new Annotation(""); annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences); annotations.add(annotation); } return annotations; } default: throw new IllegalArgumentException("Unknown format " + inputFormat); } }
private Distribution<Integer> getSegmentedWordLengthDistribution(Treebank tb) { // CharacterLevelTagExtender ext = new CharacterLevelTagExtender(); ClassicCounter<Integer> c = new ClassicCounter<Integer>(); for (Iterator iterator = tb.iterator(); iterator.hasNext(); ) { Tree gold = (Tree) iterator.next(); StringBuilder goldChars = new StringBuilder(); ArrayList goldYield = gold.yield(); for (Iterator wordIter = goldYield.iterator(); wordIter.hasNext(); ) { Word word = (Word) wordIter.next(); goldChars.append(word); } List<HasWord> ourWords = segment(goldChars.toString()); for (int i = 0; i < ourWords.size(); i++) { c.incrementCount(Integer.valueOf(ourWords.get(i).word().length())); } } return Distribution.getDistribution(c); }
public LinkedList<String> getKeyWrodsFromSentenceTest(String string) { LinkedList<String> list = new LinkedList<String>(); String[] sent = string.split(" "); List<HasWord> sentence = new ArrayList<HasWord>(); for (String word : sent) { sentence.add(new Word(word)); } Tree parse = lp.parse(sentence); parse.pennPrint(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); System.out.println(tdl); System.out.println(); System.out.println("The words of the sentence:"); for (Label lab : parse.yield()) { if (lab instanceof CoreLabel) { System.out.println(((CoreLabel) lab).toString(CoreLabel.OutputFormat.VALUE_MAP)); } else { System.out.println(lab); } } System.out.println(); System.out.println("tagged"); System.out.println(parse.taggedYield()); List<CoreLabel> temp = parse.taggedLabeledYield(); for (Label l : temp) { String[] sss = l.toString().split("-"); String type = sss[0]; System.out.println(sss[0] + " " + sss[1] + " " + sent[Integer.parseInt(sss[1])]); } for (Iterator<String> ite = list.iterator(); ite.hasNext(); ) System.out.println(ite.next()); return list; }
/** Set the tags of the original tokens and the leaves if they aren't already set */ public static void setMissingTags(CoreMap sentence, Tree tree) { List<TaggedWord> taggedWords = null; List<Label> leaves = null; List<CoreLabel> tokens = sentence.get(TokensAnnotation.class); for (int i = 0; i < tokens.size(); ++i) { CoreLabel token = tokens.get(i); if (token.tag() == null) { if (taggedWords == null) { taggedWords = tree.taggedYield(); } if (leaves == null) { leaves = tree.yield(); } token.setTag(taggedWords.get(i).tag()); Label leaf = leaves.get(i); if (leaf instanceof HasTag) { ((HasTag) leaf).setTag(taggedWords.get(i).tag()); } } } }
public static void main(String[] args) { Options op = new Options(new EnglishTreebankParserParams()); // op.tlpParams may be changed to something else later, so don't use it till // after options are parsed. System.out.println(StringUtils.toInvocationString("FactoredParser", args)); String path = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj"; int trainLow = 200, trainHigh = 2199, testLow = 2200, testHigh = 2219; String serializeFile = null; int i = 0; while (i < args.length && args[i].startsWith("-")) { if (args[i].equalsIgnoreCase("-path") && (i + 1 < args.length)) { path = args[i + 1]; i += 2; } else if (args[i].equalsIgnoreCase("-train") && (i + 2 < args.length)) { trainLow = Integer.parseInt(args[i + 1]); trainHigh = Integer.parseInt(args[i + 2]); i += 3; } else if (args[i].equalsIgnoreCase("-test") && (i + 2 < args.length)) { testLow = Integer.parseInt(args[i + 1]); testHigh = Integer.parseInt(args[i + 2]); i += 3; } else if (args[i].equalsIgnoreCase("-serialize") && (i + 1 < args.length)) { serializeFile = args[i + 1]; i += 2; } else if (args[i].equalsIgnoreCase("-tLPP") && (i + 1 < args.length)) { try { op.tlpParams = (TreebankLangParserParams) Class.forName(args[i + 1]).newInstance(); } catch (ClassNotFoundException e) { System.err.println("Class not found: " + args[i + 1]); throw new RuntimeException(e); } catch (InstantiationException e) { System.err.println("Couldn't instantiate: " + args[i + 1] + ": " + e.toString()); throw new RuntimeException(e); } catch (IllegalAccessException e) { System.err.println("illegal access" + e); throw new RuntimeException(e); } i += 2; } else if (args[i].equals("-encoding")) { // sets encoding for TreebankLangParserParams op.tlpParams.setInputEncoding(args[i + 1]); op.tlpParams.setOutputEncoding(args[i + 1]); i += 2; } else { i = op.setOptionOrWarn(args, i); } } // System.out.println(tlpParams.getClass()); TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack(); op.trainOptions.sisterSplitters = new HashSet<String>(Arrays.asList(op.tlpParams.sisterSplitters())); // BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams); PrintWriter pw = op.tlpParams.pw(); op.testOptions.display(); op.trainOptions.display(); op.display(); op.tlpParams.display(); // setup tree transforms Treebank trainTreebank = op.tlpParams.memoryTreebank(); MemoryTreebank testTreebank = op.tlpParams.testMemoryTreebank(); // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank(); // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/"; // blippTreebank.loadPath(blippPath, "", true); Timing.startTime(); System.err.print("Reading trees..."); testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true)); if (op.testOptions.increasingLength) { Collections.sort(testTreebank, new TreeLengthComparator()); } trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true)); Timing.tick("done."); System.err.print("Binarizing trees..."); TreeAnnotatorAndBinarizer binarizer; if (!op.trainOptions.leftToRight) { binarizer = new TreeAnnotatorAndBinarizer( op.tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op); } else { binarizer = new TreeAnnotatorAndBinarizer( op.tlpParams.headFinder(), new LeftHeadFinder(), op.tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op); } CollinsPuncTransformer collinsPuncTransformer = null; if (op.trainOptions.collinsPunc) { collinsPuncTransformer = new CollinsPuncTransformer(tlp); } TreeTransformer debinarizer = new Debinarizer(op.forceCNF); List<Tree> binaryTrainTrees = new ArrayList<Tree>(); if (op.trainOptions.selectiveSplit) { op.trainOptions.splitters = ParentAnnotationStats.getSplitCategories( trainTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, op.tlpParams.treebankLanguagePack()); if (op.trainOptions.deleteSplitters != null) { List<String> deleted = new ArrayList<String>(); for (String del : op.trainOptions.deleteSplitters) { String baseDel = tlp.basicCategory(del); boolean checkBasic = del.equals(baseDel); for (Iterator<String> it = op.trainOptions.splitters.iterator(); it.hasNext(); ) { String elem = it.next(); String baseElem = tlp.basicCategory(elem); boolean delStr = checkBasic && baseElem.equals(baseDel) || elem.equals(del); if (delStr) { it.remove(); deleted.add(elem); } } } System.err.println("Removed from vertical splitters: " + deleted); } } if (op.trainOptions.selectivePostSplit) { TreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams, op); Treebank annotatedTB = trainTreebank.transform(myTransformer); op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories( annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, op.tlpParams.treebankLanguagePack()); } if (op.trainOptions.hSelSplit) { binarizer.setDoSelectiveSplit(false); for (Tree tree : trainTreebank) { if (op.trainOptions.collinsPunc) { tree = collinsPuncTransformer.transformTree(tree); } // tree.pennPrint(tlpParams.pw()); tree = binarizer.transformTree(tree); // binaryTrainTrees.add(tree); } binarizer.setDoSelectiveSplit(true); } for (Tree tree : trainTreebank) { if (op.trainOptions.collinsPunc) { tree = collinsPuncTransformer.transformTree(tree); } tree = binarizer.transformTree(tree); binaryTrainTrees.add(tree); } if (op.testOptions.verbose) { binarizer.dumpStats(); } List<Tree> binaryTestTrees = new ArrayList<Tree>(); for (Tree tree : testTreebank) { if (op.trainOptions.collinsPunc) { tree = collinsPuncTransformer.transformTree(tree); } tree = binarizer.transformTree(tree); binaryTestTrees.add(tree); } Timing.tick("done."); // binarization BinaryGrammar bg = null; UnaryGrammar ug = null; DependencyGrammar dg = null; // DependencyGrammar dgBLIPP = null; Lexicon lex = null; Index<String> stateIndex = new HashIndex<String>(); // extract grammars Extractor<Pair<UnaryGrammar, BinaryGrammar>> bgExtractor = new BinaryGrammarExtractor(op, stateIndex); // Extractor bgExtractor = new SmoothedBinaryGrammarExtractor();//new BinaryGrammarExtractor(); // Extractor lexExtractor = new LexiconExtractor(); // Extractor dgExtractor = new DependencyMemGrammarExtractor(); if (op.doPCFG) { System.err.print("Extracting PCFG..."); Pair<UnaryGrammar, BinaryGrammar> bgug = null; if (op.trainOptions.cheatPCFG) { List<Tree> allTrees = new ArrayList<Tree>(binaryTrainTrees); allTrees.addAll(binaryTestTrees); bgug = bgExtractor.extract(allTrees); } else { bgug = bgExtractor.extract(binaryTrainTrees); } bg = bgug.second; bg.splitRules(); ug = bgug.first; ug.purgeRules(); Timing.tick("done."); } System.err.print("Extracting Lexicon..."); Index<String> wordIndex = new HashIndex<String>(); Index<String> tagIndex = new HashIndex<String>(); lex = op.tlpParams.lex(op, wordIndex, tagIndex); lex.train(binaryTrainTrees); Timing.tick("done."); if (op.doDep) { System.err.print("Extracting Dependencies..."); binaryTrainTrees.clear(); Extractor<DependencyGrammar> dgExtractor = new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex); // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new // TransformTreeDependency(tlpParams,true)); // DependencyGrammar dg1 = dgExtractor.extract(trainTreebank.iterator(), new // TransformTreeDependency(op.tlpParams, true)); // dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new // TransformTreeDependency(tlpParams)); // dg = (DependencyGrammar) dgExtractor.extract(new // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new // TransformTreeDependency(tlpParams)); // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2); dg = dgExtractor.extract( binaryTrainTrees); // uses information whether the words are known or not, discards // unknown words Timing.tick("done."); // System.out.print("Extracting Unknown Word Model..."); // UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees); // Timing.tick("done."); System.out.print("Tuning Dependency Model..."); dg.tune(binaryTestTrees); // System.out.println("TUNE DEPS: "+tuneDeps); Timing.tick("done."); } BinaryGrammar boundBG = bg; UnaryGrammar boundUG = ug; GrammarProjection gp = new NullGrammarProjection(bg, ug); // serialization if (serializeFile != null) { System.err.print("Serializing parser..."); LexicalizedParser.saveParserDataToSerialized( new ParserData(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op), serializeFile); Timing.tick("done."); } // test: pcfg-parse and output ExhaustivePCFGParser parser = null; if (op.doPCFG) { parser = new ExhaustivePCFGParser(boundBG, boundUG, lex, op, stateIndex, wordIndex, tagIndex); } ExhaustiveDependencyParser dparser = ((op.doDep && !op.testOptions.useFastFactored) ? new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex) : null); Scorer scorer = (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser, gp, op), dparser) : null); // Scorer scorer = parser; BiLexPCFGParser bparser = null; if (op.doPCFG && op.doDep) { bparser = (op.testOptions.useN5) ? new BiLexPCFGParser.N5BiLexPCFGParser( scorer, parser, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex) : new BiLexPCFGParser( scorer, parser, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex); } Evalb pcfgPE = new Evalb("pcfg PE", true); Evalb comboPE = new Evalb("combo PE", true); AbstractEval pcfgCB = new Evalb.CBEval("pcfg CB", true); AbstractEval pcfgTE = new TaggingEval("pcfg TE"); AbstractEval comboTE = new TaggingEval("combo TE"); AbstractEval pcfgTEnoPunct = new TaggingEval("pcfg nopunct TE"); AbstractEval comboTEnoPunct = new TaggingEval("combo nopunct TE"); AbstractEval depTE = new TaggingEval("depnd TE"); AbstractEval depDE = new UnlabeledAttachmentEval("depnd DE", true, null, tlp.punctuationWordRejectFilter()); AbstractEval comboDE = new UnlabeledAttachmentEval("combo DE", true, null, tlp.punctuationWordRejectFilter()); if (op.testOptions.evalb) { EvalbFormatWriter.initEVALBfiles(op.tlpParams); } // int[] countByLength = new int[op.testOptions.maxLength+1]; // Use a reflection ruse, so one can run this without needing the // tagger. Using a function rather than a MaxentTagger means we // can distribute a version of the parser that doesn't include the // entire tagger. Function<List<? extends HasWord>, ArrayList<TaggedWord>> tagger = null; if (op.testOptions.preTag) { try { Class[] argsClass = {String.class}; Object[] arguments = new Object[] {op.testOptions.taggerSerializedFile}; tagger = (Function<List<? extends HasWord>, ArrayList<TaggedWord>>) Class.forName("edu.stanford.nlp.tagger.maxent.MaxentTagger") .getConstructor(argsClass) .newInstance(arguments); } catch (Exception e) { System.err.println(e); System.err.println("Warning: No pretagging of sentences will be done."); } } for (int tNum = 0, ttSize = testTreebank.size(); tNum < ttSize; tNum++) { Tree tree = testTreebank.get(tNum); int testTreeLen = tree.yield().size(); if (testTreeLen > op.testOptions.maxLength) { continue; } Tree binaryTree = binaryTestTrees.get(tNum); // countByLength[testTreeLen]++; System.out.println("-------------------------------------"); System.out.println("Number: " + (tNum + 1)); System.out.println("Length: " + testTreeLen); // tree.pennPrint(pw); // System.out.println("XXXX The binary tree is"); // binaryTree.pennPrint(pw); // System.out.println("Here are the tags in the lexicon:"); // System.out.println(lex.showTags()); // System.out.println("Here's the tagnumberer:"); // System.out.println(Numberer.getGlobalNumberer("tags").toString()); long timeMil1 = System.currentTimeMillis(); Timing.tick("Starting parse."); if (op.doPCFG) { // System.err.println(op.testOptions.forceTags); if (op.testOptions.forceTags) { if (tagger != null) { // System.out.println("Using a tagger to set tags"); // System.out.println("Tagged sentence as: " + // tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false)); parser.parse(addLast(tagger.apply(cutLast(wordify(binaryTree.yield()))))); } else { // System.out.println("Forcing tags to match input."); parser.parse(cleanTags(binaryTree.taggedYield(), tlp)); } } else { // System.out.println("XXXX Parsing " + binaryTree.yield()); parser.parse(binaryTree.yieldHasWord()); } // Timing.tick("Done with pcfg phase."); } if (op.doDep) { dparser.parse(binaryTree.yieldHasWord()); // Timing.tick("Done with dependency phase."); } boolean bothPassed = false; if (op.doPCFG && op.doDep) { bothPassed = bparser.parse(binaryTree.yieldHasWord()); // Timing.tick("Done with combination phase."); } long timeMil2 = System.currentTimeMillis(); long elapsed = timeMil2 - timeMil1; System.err.println("Time: " + ((int) (elapsed / 100)) / 10.00 + " sec."); // System.out.println("PCFG Best Parse:"); Tree tree2b = null; Tree tree2 = null; // System.out.println("Got full best parse..."); if (op.doPCFG) { tree2b = parser.getBestParse(); tree2 = debinarizer.transformTree(tree2b); } // System.out.println("Debinarized parse..."); // tree2.pennPrint(); // System.out.println("DepG Best Parse:"); Tree tree3 = null; Tree tree3db = null; if (op.doDep) { tree3 = dparser.getBestParse(); // was: but wrong Tree tree3db = debinarizer.transformTree(tree2); tree3db = debinarizer.transformTree(tree3); tree3.pennPrint(pw); } // tree.pennPrint(); // ((Tree)binaryTrainTrees.get(tNum)).pennPrint(); // System.out.println("Combo Best Parse:"); Tree tree4 = null; if (op.doPCFG && op.doDep) { try { tree4 = bparser.getBestParse(); if (tree4 == null) { tree4 = tree2b; } } catch (NullPointerException e) { System.err.println("Blocked, using PCFG parse!"); tree4 = tree2b; } } if (op.doPCFG && !bothPassed) { tree4 = tree2b; } // tree4.pennPrint(); if (op.doDep) { depDE.evaluate(tree3, binaryTree, pw); depTE.evaluate(tree3db, tree, pw); } TreeTransformer tc = op.tlpParams.collinizer(); TreeTransformer tcEvalb = op.tlpParams.collinizerEvalb(); if (op.doPCFG) { // System.out.println("XXXX Best PCFG was: "); // tree2.pennPrint(); // System.out.println("XXXX Transformed best PCFG is: "); // tc.transformTree(tree2).pennPrint(); // System.out.println("True Best Parse:"); // tree.pennPrint(); // tc.transformTree(tree).pennPrint(); pcfgPE.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw); pcfgCB.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw); Tree tree4b = null; if (op.doDep) { comboDE.evaluate((bothPassed ? tree4 : tree3), binaryTree, pw); tree4b = tree4; tree4 = debinarizer.transformTree(tree4); if (op.nodePrune) { NodePruner np = new NodePruner(parser, debinarizer); tree4 = np.prune(tree4); } // tree4.pennPrint(); comboPE.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw); } // pcfgTE.evaluate(tree2, tree); pcfgTE.evaluate(tcEvalb.transformTree(tree2), tcEvalb.transformTree(tree), pw); pcfgTEnoPunct.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw); if (op.doDep) { comboTE.evaluate(tcEvalb.transformTree(tree4), tcEvalb.transformTree(tree), pw); comboTEnoPunct.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw); } System.out.println("PCFG only: " + parser.scoreBinarizedTree(tree2b, 0)); // tc.transformTree(tree2).pennPrint(); tree2.pennPrint(pw); if (op.doDep) { System.out.println("Combo: " + parser.scoreBinarizedTree(tree4b, 0)); // tc.transformTree(tree4).pennPrint(pw); tree4.pennPrint(pw); } System.out.println("Correct:" + parser.scoreBinarizedTree(binaryTree, 0)); /* if (parser.scoreBinarizedTree(tree2b,true) < parser.scoreBinarizedTree(binaryTree,true)) { System.out.println("SCORE INVERSION"); parser.validateBinarizedTree(binaryTree,0); } */ tree.pennPrint(pw); } // end if doPCFG if (op.testOptions.evalb) { if (op.doPCFG && op.doDep) { EvalbFormatWriter.writeEVALBline( tcEvalb.transformTree(tree), tcEvalb.transformTree(tree4)); } else if (op.doPCFG) { EvalbFormatWriter.writeEVALBline( tcEvalb.transformTree(tree), tcEvalb.transformTree(tree2)); } else if (op.doDep) { EvalbFormatWriter.writeEVALBline( tcEvalb.transformTree(tree), tcEvalb.transformTree(tree3db)); } } } // end for each tree in test treebank if (op.testOptions.evalb) { EvalbFormatWriter.closeEVALBfiles(); } // op.testOptions.display(); if (op.doPCFG) { pcfgPE.display(false, pw); System.out.println("Grammar size: " + stateIndex.size()); pcfgCB.display(false, pw); if (op.doDep) { comboPE.display(false, pw); } pcfgTE.display(false, pw); pcfgTEnoPunct.display(false, pw); if (op.doDep) { comboTE.display(false, pw); comboTEnoPunct.display(false, pw); } } if (op.doDep) { depTE.display(false, pw); depDE.display(false, pw); } if (op.doPCFG && op.doDep) { comboDE.display(false, pw); } // pcfgPE.printGoodBad(); }
/** * for testing -- CURRENTLY BROKEN!!! * * @param args input dir and output filename * @throws IOException */ public static void main(String[] args) throws IOException { if (args.length != 3) { throw new RuntimeException("args: treebankPath trainNums testNums"); } ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams(); ctpp.charTags = true; // TODO: these options are getting clobbered by reading in the // parser object (unless it's a text file parser?) Options op = new Options(ctpp); op.doDep = false; op.testOptions.maxLength = 90; LexicalizedParser lp; try { FileFilter trainFilt = new NumberRangesFileFilter(args[1], false); lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op); try { String filename = "chineseCharTagPCFG.ser.gz"; System.err.println("Writing parser in serialized format to file " + filename + ' '); System.err.flush(); ObjectOutputStream out = IOUtils.writeStreamFromString(filename); out.writeObject(lp); out.close(); System.err.println("done."); } catch (IOException ioe) { ioe.printStackTrace(); } } catch (IllegalArgumentException e) { lp = LexicalizedParser.loadModel(args[1], op); } FileFilter testFilt = new NumberRangesFileFilter(args[2], false); MemoryTreebank testTreebank = ctpp.memoryTreebank(); testTreebank.loadPath(new File(args[0]), testFilt); PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true); WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser(); WordCatEqualityChecker eqcheck = new WordCatEqualityChecker(); EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck); // System.out.println("Preterminals:" + preterminals); System.out.println("Testing..."); for (Tree gold : testTreebank) { Tree tree; try { tree = lp.parseTree(gold.yieldHasWord()); if (tree == null) { System.out.println("Failed to parse " + gold.yieldHasWord()); continue; } } catch (Exception e) { e.printStackTrace(); continue; } gold = gold.firstChild(); pw.println(Sentence.listToString(gold.preTerminalYield())); pw.println(Sentence.listToString(gold.yield())); gold.pennPrint(pw); pw.println(tree.preTerminalYield()); pw.println(tree.yield()); tree.pennPrint(pw); // Collection allBrackets = WordCatConstituent.allBrackets(tree); // Collection goldBrackets = WordCatConstituent.allBrackets(gold); // eval.eval(allBrackets, goldBrackets); eval.displayLast(); } System.out.println(); System.out.println(); eval.display(); }
/** * Run the Evalb scoring metric on guess/gold input. The default language is English. * * @param args */ public static void main(String[] args) { TreebankLangParserParams tlpp = new EnglishTreebankParserParams(); int maxGoldYield = Integer.MAX_VALUE; boolean VERBOSE = false; String encoding = "UTF-8"; String guessFile = null; String goldFile = null; Map<String, String[]> argsMap = StringUtils.argsToMap(args, optionArgDefs); for (Map.Entry<String, String[]> opt : argsMap.entrySet()) { if (opt.getKey() == null) continue; if (opt.getKey().equals("-l")) { Language lang = Language.valueOf(opt.getValue()[0].trim()); tlpp = lang.params; } else if (opt.getKey().equals("-y")) { maxGoldYield = Integer.parseInt(opt.getValue()[0].trim()); } else if (opt.getKey().equals("-v")) { VERBOSE = true; } else if (opt.getKey().equals("-e")) { encoding = opt.getValue()[0]; } else { System.err.println(usage.toString()); System.exit(-1); } // Non-option arguments located at key null String[] rest = argsMap.get(null); if (rest == null || rest.length < minArgs) { System.err.println(usage.toString()); System.exit(-1); } goldFile = rest[0]; guessFile = rest[1]; } tlpp.setInputEncoding(encoding); final PrintWriter pwOut = tlpp.pw(); final Treebank guessTreebank = tlpp.diskTreebank(); guessTreebank.loadPath(guessFile); pwOut.println("GUESS TREEBANK:"); pwOut.println(guessTreebank.textualSummary()); final Treebank goldTreebank = tlpp.diskTreebank(); goldTreebank.loadPath(goldFile); pwOut.println("GOLD TREEBANK:"); pwOut.println(goldTreebank.textualSummary()); final UnlabeledAttachmentEval metric = new UnlabeledAttachmentEval("UAS LP/LR", true, tlpp.headFinder()); final TreeTransformer tc = tlpp.collinizer(); // The evalb ref implementation assigns status for each tree pair as follows: // // 0 - Ok (yields match) // 1 - length mismatch // 2 - null parse e.g. (()). // // In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation. final Iterator<Tree> goldItr = goldTreebank.iterator(); final Iterator<Tree> guessItr = guessTreebank.iterator(); int goldLineId = 0; int guessLineId = 0; int skippedGuessTrees = 0; while (guessItr.hasNext() && goldItr.hasNext()) { Tree guessTree = guessItr.next(); List<? extends Label> guessYield = guessTree.yield(); guessLineId++; Tree goldTree = goldItr.next(); List<? extends Label> goldYield = goldTree.yield(); goldLineId++; // Check that we should evaluate this tree if (goldYield.size() > maxGoldYield) { skippedGuessTrees++; continue; } // Only trees with equal yields can be evaluated if (goldYield.size() != guessYield.size()) { pwOut.printf( "Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId); skippedGuessTrees++; continue; } final Tree evalGuess = tc.transformTree(guessTree); evalGuess.indexLeaves(true); final Tree evalGold = tc.transformTree(goldTree); evalGold.indexLeaves(true); metric.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null)); } if (guessItr.hasNext() || goldItr.hasNext()) { System.err.printf( "Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId); } pwOut.println( "================================================================================"); if (skippedGuessTrees != 0) pwOut.printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees); metric.display(true, pwOut); pwOut.println(); pwOut.close(); }
/** Execute with no arguments for usage. */ public static void main(String[] args) { if (!validateCommandLine(args)) { System.err.println(USAGE); System.exit(-1); } final TreebankLangParserParams tlpp = LANGUAGE.params; final PrintWriter pwOut = tlpp.pw(); final Treebank guessTreebank = tlpp.diskTreebank(); guessTreebank.loadPath(guessFile); pwOut.println("GUESS TREEBANK:"); pwOut.println(guessTreebank.textualSummary()); final Treebank goldTreebank = tlpp.diskTreebank(); goldTreebank.loadPath(goldFile); pwOut.println("GOLD TREEBANK:"); pwOut.println(goldTreebank.textualSummary()); final LeafAncestorEval metric = new LeafAncestorEval("LeafAncestor"); final TreeTransformer tc = tlpp.collinizer(); // The evalb ref implementation assigns status for each tree pair as follows: // // 0 - Ok (yields match) // 1 - length mismatch // 2 - null parse e.g. (()). // // In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation. final Iterator<Tree> goldItr = goldTreebank.iterator(); final Iterator<Tree> guessItr = guessTreebank.iterator(); int goldLineId = 0; int guessLineId = 0; int skippedGuessTrees = 0; while (guessItr.hasNext() && goldItr.hasNext()) { Tree guessTree = guessItr.next(); List<? extends Label> guessYield = guessTree.yield(); guessLineId++; Tree goldTree = goldItr.next(); List<? extends Label> goldYield = goldTree.yield(); goldLineId++; // Check that we should evaluate this tree if (goldYield.size() > MAX_GOLD_YIELD) { skippedGuessTrees++; continue; } // Only trees with equal yields can be evaluated if (goldYield.size() != guessYield.size()) { pwOut.printf( "Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId); skippedGuessTrees++; continue; } final Tree evalGuess = tc.transformTree(guessTree); final Tree evalGold = tc.transformTree(goldTree); metric.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null)); } if (guessItr.hasNext() || goldItr.hasNext()) { System.err.printf( "Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId); } pwOut.println( "================================================================================"); if (skippedGuessTrees != 0) pwOut.printf("%s %d guess trees%n", "Unable to evaluate", skippedGuessTrees); metric.display(true, pwOut); pwOut.close(); }
/** @param args */ public static void main(String[] args) { if (args.length != 3) { System.err.printf( "Usage: java %s language filename features%n", TreebankFactoredLexiconStats.class.getName()); System.exit(-1); } Language language = Language.valueOf(args[0]); TreebankLangParserParams tlpp = language.params; if (language.equals(Language.Arabic)) { String[] options = {"-arabicFactored"}; tlpp.setOptionFlag(options, 0); } else { String[] options = {"-frenchFactored"}; tlpp.setOptionFlag(options, 0); } Treebank tb = tlpp.diskTreebank(); tb.loadPath(args[1]); MorphoFeatureSpecification morphoSpec = language.equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification(); String[] features = args[2].trim().split(","); for (String feature : features) { morphoSpec.activate(MorphoFeatureType.valueOf(feature)); } // Counters Counter<String> wordTagCounter = new ClassicCounter<>(30000); Counter<String> morphTagCounter = new ClassicCounter<>(500); // Counter<String> signatureTagCounter = new ClassicCounter<String>(); Counter<String> morphCounter = new ClassicCounter<>(500); Counter<String> wordCounter = new ClassicCounter<>(30000); Counter<String> tagCounter = new ClassicCounter<>(300); Counter<String> lemmaCounter = new ClassicCounter<>(25000); Counter<String> lemmaTagCounter = new ClassicCounter<>(25000); Counter<String> richTagCounter = new ClassicCounter<>(1000); Counter<String> reducedTagCounter = new ClassicCounter<>(500); Counter<String> reducedTagLemmaCounter = new ClassicCounter<>(500); Map<String, Set<String>> wordLemmaMap = Generics.newHashMap(); TwoDimensionalIntCounter<String, String> lemmaReducedTagCounter = new TwoDimensionalIntCounter<>(30000); TwoDimensionalIntCounter<String, String> reducedTagTagCounter = new TwoDimensionalIntCounter<>(500); TwoDimensionalIntCounter<String, String> tagReducedTagCounter = new TwoDimensionalIntCounter<>(300); int numTrees = 0; for (Tree tree : tb) { for (Tree subTree : tree) { if (!subTree.isLeaf()) { tlpp.transformTree(subTree, tree); } } List<Label> pretermList = tree.preTerminalYield(); List<Label> yield = tree.yield(); assert yield.size() == pretermList.size(); int yieldLen = yield.size(); for (int i = 0; i < yieldLen; ++i) { String tag = pretermList.get(i).value(); String word = yield.get(i).value(); String morph = ((CoreLabel) yield.get(i)).originalText(); // Note: if there is no lemma, then we use the surface form. Pair<String, String> lemmaTag = MorphoFeatureSpecification.splitMorphString(word, morph); String lemma = lemmaTag.first(); String richTag = lemmaTag.second(); // WSGDEBUG if (tag.contains("MW")) lemma += "-MWE"; lemmaCounter.incrementCount(lemma); lemmaTagCounter.incrementCount(lemma + tag); richTagCounter.incrementCount(richTag); String reducedTag = morphoSpec.strToFeatures(richTag).toString(); reducedTagCounter.incrementCount(reducedTag); reducedTagLemmaCounter.incrementCount(reducedTag + lemma); wordTagCounter.incrementCount(word + tag); morphTagCounter.incrementCount(morph + tag); morphCounter.incrementCount(morph); wordCounter.incrementCount(word); tagCounter.incrementCount(tag); reducedTag = reducedTag.equals("") ? "NONE" : reducedTag; if (wordLemmaMap.containsKey(word)) { wordLemmaMap.get(word).add(lemma); } else { Set<String> lemmas = Generics.newHashSet(1); wordLemmaMap.put(word, lemmas); } lemmaReducedTagCounter.incrementCount(lemma, reducedTag); reducedTagTagCounter.incrementCount(lemma + reducedTag, tag); tagReducedTagCounter.incrementCount(tag, reducedTag); } ++numTrees; } // Barf... System.out.println("Language: " + language.toString()); System.out.printf("#trees:\t%d%n", numTrees); System.out.printf("#tokens:\t%d%n", (int) wordCounter.totalCount()); System.out.printf("#words:\t%d%n", wordCounter.keySet().size()); System.out.printf("#tags:\t%d%n", tagCounter.keySet().size()); System.out.printf("#wordTagPairs:\t%d%n", wordTagCounter.keySet().size()); System.out.printf("#lemmas:\t%d%n", lemmaCounter.keySet().size()); System.out.printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.keySet().size()); System.out.printf("#feattags:\t%d%n", reducedTagCounter.keySet().size()); System.out.printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.keySet().size()); System.out.printf("#richtags:\t%d%n", richTagCounter.keySet().size()); System.out.printf("#richtag+lemma:\t%d%n", morphCounter.keySet().size()); System.out.printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.keySet().size()); // Extra System.out.println("=================="); StringBuilder sbNoLemma = new StringBuilder(); StringBuilder sbMultLemmas = new StringBuilder(); for (Map.Entry<String, Set<String>> wordLemmas : wordLemmaMap.entrySet()) { String word = wordLemmas.getKey(); Set<String> lemmas = wordLemmas.getValue(); if (lemmas.size() == 0) { sbNoLemma.append("NO LEMMAS FOR WORD: " + word + "\n"); continue; } if (lemmas.size() > 1) { sbMultLemmas.append("MULTIPLE LEMMAS: " + word + " " + setToString(lemmas) + "\n"); continue; } String lemma = lemmas.iterator().next(); Set<String> reducedTags = lemmaReducedTagCounter.getCounter(lemma).keySet(); if (reducedTags.size() > 1) { System.out.printf("%s --> %s%n", word, lemma); for (String reducedTag : reducedTags) { int count = lemmaReducedTagCounter.getCount(lemma, reducedTag); String posTags = setToString(reducedTagTagCounter.getCounter(lemma + reducedTag).keySet()); System.out.printf("\t%s\t%d\t%s%n", reducedTag, count, posTags); } System.out.println(); } } System.out.println("=================="); System.out.println(sbNoLemma.toString()); System.out.println(sbMultLemmas.toString()); System.out.println("=================="); List<String> tags = new ArrayList<>(tagReducedTagCounter.firstKeySet()); Collections.sort(tags); for (String tag : tags) { System.out.println(tag); Set<String> reducedTags = tagReducedTagCounter.getCounter(tag).keySet(); for (String reducedTag : reducedTags) { int count = tagReducedTagCounter.getCount(tag, reducedTag); // reducedTag = reducedTag.equals("") ? "NONE" : reducedTag; System.out.printf("\t%s\t%d%n", reducedTag, count); } System.out.println(); } System.out.println("=================="); }