    /**
     * Build the array giving the number of latent substates for each tag:
     * every tag seen in the training and validation trees gets nSubStates
     * substates, except the ROOT symbol, which keeps a single substate.
     *
     * @param trainTrees the training trees
     * @param validationTrees the validation trees (needed so the numberer sees all tags)
     * @param tagNumberer numberer mapping tag symbols to indices
     * @param nSubStates number of substates to assign to each non-ROOT tag
     * @return an array mapping each tag index to its number of substates
     */
    public static short[] initializeSubStateArray(
            List<Tree<String>> trainTrees,
            List<Tree<String>> validationTrees, Numberer tagNumberer,
            short nSubStates) {
        // boolean dontSplitTags) {
        // first generate unsplit grammar and lexicon
        short[] nSub = new short[2];
        nSub[0] = 1;
        nSub[1] = nSubStates;

        // Convert the validation set as well so that the numberer sees all
        // tags and we can allocate big enough arrays.
        // Note: although validationStateSetTrees is never read, the
        // constructor adds the validation trees' tags into the tagNumberer as
        // a side effect, which is important.
        StateSetTreeList trainStateSetTrees = new StateSetTreeList(trainTrees,
                nSub, true, tagNumberer);
        @SuppressWarnings("unused")
        StateSetTreeList validationStateSetTrees = new StateSetTreeList(
                validationTrees, nSub, true, tagNumberer);

        StateSetTreeList.initializeTagNumberer(trainTrees, tagNumberer);
        StateSetTreeList.initializeTagNumberer(validationTrees, tagNumberer);

        short numStates = (short) tagNumberer.total();
        short[] nSubStateArray = new short[numStates];
        Arrays.fill(nSubStateArray, nSubStates);
        nSubStateArray[0] = 1; // the ROOT is never split
        return nSubStateArray;
    }
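    // --- Illustrative sketch, not part of the original trainer ---
    // Shows how the substate array built above might be obtained and
    // inspected. Numberer.getGlobalNumberer("tags") is how the Berkeley
    // trainer typically obtains the tag numberer, but treat that call and
    // this whole helper as an assumption added for illustration only.
    public static void exampleSubStateSetup(List<Tree<String>> trainTrees,
            List<Tree<String>> validationTrees) {
        Numberer tagNumberer = Numberer.getGlobalNumberer("tags");
        short nSubStates = 2; // e.g. start by splitting every tag in two
        short[] nSubStateArray = initializeSubStateArray(trainTrees,
                validationTrees, tagNumberer, nSubStates);
        // Every observed tag gets nSubStates latent substates; the ROOT
        // symbol at index 0 keeps a single substate.
        for (int tag = 0; tag < nSubStateArray.length; tag++) {
            System.out.println("tag " + tag + ": " + nSubStateArray[tag]
                    + " substate(s)");
        }
    }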
    /**
     * Run one E step of EM over the training trees: compute inside/outside
     * scores with the previous grammar and lexicon, tally the expected counts
     * into the new grammar and lexicon, and return the training log
     * likelihood.
     *
     * @param previousGrammar grammar from the previous iteration, used to compute posteriors
     * @param previousLexicon lexicon from the previous iteration, used to compute posteriors
     * @param grammar grammar that accumulates the expected rule counts
     * @param lexicon lexicon that accumulates the expected emission counts
     * @param trainStateSetTrees the training trees
     * @param updateOnlyLexicon if true, only lexicon counts are updated (grammar counts are not tallied)
     * @param unkThreshold threshold below which words are treated as rare/unknown
     * @return the log likelihood of the training trees
     */
    public static double doOneEStep(
            Grammar previousGrammar, Lexicon previousLexicon, Grammar grammar,
            Lexicon lexicon, StateSetTreeList trainStateSetTrees,
            boolean updateOnlyLexicon, int unkThreshold) {
        boolean secondHalf = false;
        ArrayParser parser = new ArrayParser(previousGrammar, previousLexicon);
        double trainingLikelihood = 0;
        int n = 0;
        int nTrees = trainStateSetTrees.size();
        for (Tree<StateSet> stateSetTree : trainStateSetTrees) {
            // flag trees in the second half of the data (used by the lexicon)
            secondHalf = (n++ > nTrees / 2.0);
            boolean noSmoothing = true, debugOutput = false;
            parser.doInsideOutsideScores(stateSetTree, noSmoothing,
                    debugOutput); // E step
            // convert the scaled root inside score to a log likelihood
            // (each scale unit corresponds to a factor of e^100)
            double ll = stateSetTree.getLabel().getIScore(0);
            ll = Math.log(ll) + (100 * stateSetTree.getLabel().getIScale());
            if (Double.isInfinite(ll) || Double.isNaN(ll)) {
                // some sentences are unparsable under the current grammar;
                // skip them so they do not corrupt the counts
                if (VERBOSE) {
                    System.out.println("Training sentence " + n + " is given "
                            + ll + " log likelihood!");
                    System.out.println("Root iScore "
                            + stateSetTree.getLabel().getIScore(0) + " scale "
                            + stateSetTree.getLabel().getIScale());
                }
            } else {
                lexicon.trainTree(stateSetTree, -1, previousLexicon,
                        secondHalf, noSmoothing, unkThreshold);
                if (!updateOnlyLexicon)
                    grammar.tallyStateSetTree(stateSetTree, previousGrammar); // E step
                trainingLikelihood += ll;
            }
        }
        lexicon.tieRareWordStats(unkThreshold);

        // SSIE
        ((SophisticatedLexicon) lexicon).overwriteWithMaxent();

        return trainingLikelihood;
    }
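    // --- Illustrative sketch, not part of the original trainer ---
    // Wraps one call to the E step above and reports the average per-tree log
    // likelihood, a convenient quantity to monitor across EM iterations.
    // Only signatures visible in this file are used; the caller supplies the
    // previous-iteration grammar/lexicon plus the fresh grammar and lexicon
    // into which doOneEStep tallies the expected counts.
    public static double reportEStepLikelihood(Grammar previousGrammar,
            Lexicon previousLexicon, Grammar grammar, Lexicon lexicon,
            StateSetTreeList trainStateSetTrees, int unkThreshold) {
        boolean updateOnlyLexicon = false; // also tally grammar counts
        double totalLogLikelihood = doOneEStep(previousGrammar,
                previousLexicon, grammar, lexicon, trainStateSetTrees,
                updateOnlyLexicon, unkThreshold);
        double perTree = totalLogLikelihood / trainStateSetTrees.size();
        System.out.println("E step: total log likelihood "
                + totalLogLikelihood + " (" + perTree + " per tree)");
        return totalLogLikelihood;
    }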