public static void main(String[] args) { if (args.length < minArgs) { System.out.println(usage()); System.exit(-1); } Properties options = StringUtils.argsToProperties(args, argDefs()); Language language = PropertiesUtils.get(options, "l", Language.English, Language.class); TreebankLangParserParams tlpp = language.params; DiskTreebank tb = null; String encoding = options.getProperty("l", "UTF-8"); boolean removeBracket = PropertiesUtils.getBool(options, "b", false); tlpp.setInputEncoding(encoding); tlpp.setOutputEncoding(encoding); tb = tlpp.diskTreebank(); String[] files = options.getProperty("", "").split("\\s+"); if (files.length != 0) { for (String filename : files) { tb.loadPath(filename); } } else { log.info(usage()); System.exit(-1); } PrintWriter pwo = tlpp.pw(); String startSymbol = tlpp.treebankLanguagePack().startSymbol(); TreeFactory tf = new LabeledScoredTreeFactory(); int nTrees = 0; for (Tree t : tb) { if (removeBracket) { if (t.value().equals(startSymbol)) { t = t.firstChild(); } } else if (!t.value().equals(startSymbol)) { // Add a bracket if it isn't already there t = tf.newTreeNode(startSymbol, Collections.singletonList(t)); } pwo.println(t.toString()); nTrees++; } pwo.close(); System.err.printf("Processed %d trees.%n", nTrees); }
/** * Run the Evalb scoring metric on guess/gold input. The default language is English. * * @param args */ public static void main(String[] args) { TreebankLangParserParams tlpp = new EnglishTreebankParserParams(); int maxGoldYield = Integer.MAX_VALUE; boolean VERBOSE = false; String encoding = "UTF-8"; String guessFile = null; String goldFile = null; Map<String, String[]> argsMap = StringUtils.argsToMap(args, optionArgDefs); for (Map.Entry<String, String[]> opt : argsMap.entrySet()) { if (opt.getKey() == null) continue; if (opt.getKey().equals("-l")) { Language lang = Language.valueOf(opt.getValue()[0].trim()); tlpp = lang.params; } else if (opt.getKey().equals("-y")) { maxGoldYield = Integer.parseInt(opt.getValue()[0].trim()); } else if (opt.getKey().equals("-v")) { VERBOSE = true; } else if (opt.getKey().equals("-e")) { encoding = opt.getValue()[0]; } else { System.err.println(usage.toString()); System.exit(-1); } // Non-option arguments located at key null String[] rest = argsMap.get(null); if (rest == null || rest.length < minArgs) { System.err.println(usage.toString()); System.exit(-1); } goldFile = rest[0]; guessFile = rest[1]; } tlpp.setInputEncoding(encoding); final PrintWriter pwOut = tlpp.pw(); final Treebank guessTreebank = tlpp.diskTreebank(); guessTreebank.loadPath(guessFile); pwOut.println("GUESS TREEBANK:"); pwOut.println(guessTreebank.textualSummary()); final Treebank goldTreebank = tlpp.diskTreebank(); goldTreebank.loadPath(goldFile); pwOut.println("GOLD TREEBANK:"); pwOut.println(goldTreebank.textualSummary()); final UnlabeledAttachmentEval metric = new UnlabeledAttachmentEval("UAS LP/LR", true, tlpp.headFinder()); final TreeTransformer tc = tlpp.collinizer(); // The evalb ref implementation assigns status for each tree pair as follows: // // 0 - Ok (yields match) // 1 - length mismatch // 2 - null parse e.g. (()). // // In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation. final Iterator<Tree> goldItr = goldTreebank.iterator(); final Iterator<Tree> guessItr = guessTreebank.iterator(); int goldLineId = 0; int guessLineId = 0; int skippedGuessTrees = 0; while (guessItr.hasNext() && goldItr.hasNext()) { Tree guessTree = guessItr.next(); List<? extends Label> guessYield = guessTree.yield(); guessLineId++; Tree goldTree = goldItr.next(); List<? extends Label> goldYield = goldTree.yield(); goldLineId++; // Check that we should evaluate this tree if (goldYield.size() > maxGoldYield) { skippedGuessTrees++; continue; } // Only trees with equal yields can be evaluated if (goldYield.size() != guessYield.size()) { pwOut.printf( "Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId); skippedGuessTrees++; continue; } final Tree evalGuess = tc.transformTree(guessTree); evalGuess.indexLeaves(true); final Tree evalGold = tc.transformTree(goldTree); evalGold.indexLeaves(true); metric.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null)); } if (guessItr.hasNext() || goldItr.hasNext()) { System.err.printf( "Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId); } pwOut.println( "================================================================================"); if (skippedGuessTrees != 0) pwOut.printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees); metric.display(true, pwOut); pwOut.println(); pwOut.close(); }