コード例 #1
0
  /**
   * Builds the per-tag substate-count array for an initial (uniformly split) grammar.
   *
   * <p>Both tree lists are first registered with {@code tagNumberer} so that the numberer knows
   * every tag before the array is sized. Every entry of the result is then set to {@code
   * nSubStates}, except index 0, which is fixed at a single substate because this code treats
   * index 0 as the ROOT tag.
   *
   * @param trainTrees training trees whose tags are registered with {@code tagNumberer}
   * @param validationTrees validation trees; registered as well so the array covers their tags
   * @param tagNumberer numberer that collects the tags; its {@code total()} sizes the result
   * @param nSubStates number of substates assigned to every tag other than index 0
   * @return an array of length {@code tagNumberer.total()} holding the substate count per tag
   *     index; empty if the numberer holds no tags
   * @throws IllegalStateException if the number of tags does not fit in a {@code short}
   */
  public static short[] initializeSubStateArray(
      List<Tree<String>> trainTrees,
      List<Tree<String>> validationTrees,
      Numberer tagNumberer,
      short nSubStates) {
    // Substate layout handed to the StateSetTreeList constructors below.
    short[] nSub = {1, nSubStates};

    // The constructed lists are intentionally discarded: per the original author's note, the
    // constructor registers the trees' tags with tagNumberer as a side effect, which is needed
    // so that the array allocated below is big enough. Do not remove these calls.
    new StateSetTreeList(trainTrees, nSub, true, tagNumberer);
    new StateSetTreeList(validationTrees, nSub, true, tagNumberer);

    StateSetTreeList.initializeTagNumberer(trainTrees, tagNumberer);
    StateSetTreeList.initializeTagNumberer(validationTrees, tagNumberer);

    // Check the range explicitly instead of narrowing with a (short) cast, which would either
    // produce a negative array size or silently wrap around to a wrong length.
    int tagCount = tagNumberer.total();
    if (tagCount > Short.MAX_VALUE) {
      throw new IllegalStateException(
          "Too many tags to index with a short: " + tagCount + " > " + Short.MAX_VALUE);
    }

    short[] nSubStateArray = new short[tagCount];
    Arrays.fill(nSubStateArray, nSubStates);
    if (nSubStateArray.length > 0) {
      nSubStateArray[0] = 1; // that's the ROOT, which is never split
    }
    return nSubStateArray;
  }
コード例 #2
0
  /**
   * Runs one expectation pass over the training trees and returns their summed log likelihood.
   *
   * <p>For each tree, inside/outside scores are computed with a parser built from the previous
   * grammar and lexicon. The tree's log likelihood is the log of the root's inside score at
   * substate 0 plus 100 times the root's inside scale. Trees whose log likelihood is infinite or
   * NaN are skipped (and reported when {@code VERBOSE} is set); all other trees are tallied into
   * {@code lexicon} and, unless {@code updateOnlyLexicon} is set, into {@code grammar}.
   *
   * <p>After the loop the lexicon's rare-word statistics are tied and the lexicon is overwritten
   * with its maxent estimates. The latter step casts {@code lexicon} to {@code
   * SophisticatedLexicon}, so passing any other implementation fails with a {@link
   * ClassCastException}.
   *
   * @param previousGrammar grammar used to build the parser and passed to the grammar tally
   * @param previousLexicon lexicon used to build the parser and passed to the lexicon training
   * @param grammar grammar that accumulates counts from each scorable tree
   * @param lexicon lexicon that accumulates counts; must be a {@code SophisticatedLexicon}
   * @param trainStateSetTrees training trees to score
   * @param updateOnlyLexicon if {@code true}, {@code grammar} is left untouched
   * @param unkThreshold threshold forwarded to the lexicon's training and rare-word tying
   * @return the sum of the log likelihoods of all trees that could be scored
   */
  public static double doOneEStep(
      Grammar previousGrammar,
      Lexicon previousLexicon,
      Grammar grammar,
      Lexicon lexicon,
      StateSetTreeList trainStateSetTrees,
      boolean updateOnlyLexicon,
      int unkThreshold) {
    final ArrayParser parser = new ArrayParser(previousGrammar, previousLexicon);
    final boolean noSmoothing = true;
    final boolean debugOutput = false;
    final double halfway = trainStateSetTrees.size() / 2.0;

    double likelihoodSum = 0;
    int treesSeen = 0;
    for (Tree<StateSet> tree : trainStateSetTrees) {
      // Decided on the 0-based position of the tree, before the counter is advanced.
      final boolean secondHalf = treesSeen > halfway;
      treesSeen++;

      parser.doInsideOutsideScores(tree, noSmoothing, debugOutput); // E Step

      final StateSet root = tree.getLabel();
      final double logLikelihood = Math.log(root.getIScore(0)) + (100 * root.getIScale());

      // Some sentences are unparsable for reasons unknown to the original author; they
      // contribute nothing to the counts or to the returned total.
      if (Double.isNaN(logLikelihood) || Double.isInfinite(logLikelihood)) {
        if (VERBOSE) {
          System.out.println(
              "Training sentence " + treesSeen + " is given " + logLikelihood + " log likelihood!");
          System.out.println(
              "Root iScore " + root.getIScore(0) + " scale " + root.getIScale());
        }
        continue;
      }

      lexicon.trainTree(tree, -1, previousLexicon, secondHalf, noSmoothing, unkThreshold);
      if (!updateOnlyLexicon) {
        grammar.tallyStateSetTree(tree, previousGrammar); // E Step
      }
      likelihoodSum += logLikelihood;
    }

    lexicon.tieRareWordStats(unkThreshold);

    // SSIE
    final SophisticatedLexicon sophisticatedLexicon = (SophisticatedLexicon) lexicon;
    sophisticatedLexicon.overwriteWithMaxent();

    return likelihoodSum;
  }