/** * Trains a boundary POS prioritization model (AKA a figure-of-merit, or FOM). Parses the training * corpus, constrained by the gold trees, and learns prioritization probabilities from the * resulting parses. * * @param sparseMatrixGrammar * @return a boundary POS figure of merit model. */ protected BoundaryPosModel trainPosFom(final LeftCscSparseMatrixGrammar sparseMatrixGrammar) { try { // Constrained parse the training corpus final ParserDriver opts = new ParserDriver(); opts.cellSelectorModel = ConstrainedCellSelector.MODEL; opts.researchParserType = ResearchParserType.ConstrainedCartesianProductHashMl; final ConstrainedCphSpmlParser constrainedParser = new ConstrainedCphSpmlParser(opts, sparseMatrixGrammar); final StringWriter binaryConstrainedParses = new StringWriter(30 * 1024 * 1024); for (final String inputTree : trainingCorpus) { final ParseTask parseTask = constrainedParser.parseSentence(inputTree); binaryConstrainedParses.write(parseTask.binaryParse.toString()); binaryConstrainedParses.write('\n'); } final StringWriter serializedFomModel = new StringWriter(30 * 1024 * 1024); BoundaryPosModel.train( sparseMatrixGrammar, new BufferedReader(new StringReader(binaryConstrainedParses.toString())), new BufferedWriter(serializedFomModel), .5f, false, 2); final BufferedReader fomModelReader = new BufferedReader(new StringReader(serializedFomModel.toString())); return new BoundaryPosModel(FOMType.BoundaryPOS, sparseMatrixGrammar, fomModelReader); } catch (final IOException e) { // StringWriter and StringReader should never IOException throw new AssertionError(e); } }
/** * Parses the development set with the specified grammar and FOM. Returns the accuracy (F1) of the * resulting parses. Uses the specified <code>beamWidth</code> for all cells spanning >= 2 words. * For lexical cells, uses <code>beamWidth</code> x 3 (since the FOM does not prioritize entries * in lexical cells), and allocates <code>beamWidth</code> entries in lexical cells for unary * productions. * * @param sparseMatrixGrammar * @param posFom * @param cycle * @return Accuracy (F1) and speed (w/s) */ protected float[] parseDevSet( final LeftCscSparseMatrixGrammar sparseMatrixGrammar, final BoundaryPosModel posFom, final int cycle) { // Initialize the parser final ParserDriver opts = new ParserDriver(); opts.researchParserType = ResearchParserType.CartesianProductHashMl; opts.cellSelectorModel = ccModel; opts.fomModel = posFom; // Set beam-width configuration properties GlobalConfigProperties.singleton() .setProperty(Parser.PROPERTY_MAX_BEAM_WIDTH, Integer.toString(beamWidth)); GlobalConfigProperties.singleton() .setProperty(Parser.PROPERTY_LEXICAL_ROW_BEAM_WIDTH, Integer.toString(beamWidth * 3)); GlobalConfigProperties.singleton() .setProperty(Parser.PROPERTY_LEXICAL_ROW_UNARIES, Integer.toString(beamWidth)); // Parse the dev-set final CartesianProductHashSpmlParser parser = new CartesianProductHashSpmlParser(opts, sparseMatrixGrammar); final long t0 = System.currentTimeMillis(); int words = 0; final BracketEvaluator evaluator = new BracketEvaluator(); for (final String inputTree : developmentSet) { parser.parseSentence(inputTree).evaluate(evaluator); final NaryTree<String> naryTree = NaryTree.read(inputTree, String.class); words += naryTree.leaves(); } final long t1 = System.currentTimeMillis(); return new float[] {(float) evaluator.accumulatedResult().f1(), words * 1000f / (t1 - t0)}; }