Code Example #1
  public String processOneSent(List<String> nbest, int sentID) {
    System.err.println("Now process sentence " + sentID);

    // step-0: preprocess
    // assumption: each hyp has the format:
    // "sent_id ||| hyp_itself ||| feature scores ||| linear-combination-of-feature-scores
    // (this should be logP)"
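    // e.g. (illustrative only):
    // "3 ||| the cat sat on the mat ||| 0.5 -2.3 1.1 ||| -12.7"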

    List<String> hypsItself = new ArrayList<String>();
    // linear combination of all baseline features, one score per hypothesis
    List<Double> baselineScores = new ArrayList<Double>();
    List<HashMap<String, Integer>> ngramTbls = new ArrayList<HashMap<String, Integer>>();
    List<Integer> sentLens = new ArrayList<Integer>();

    for (String hyp : nbest) {
      String[] fds = Regex.threeBarsWithSpace.split(hyp);
      int tSentID = Integer.parseInt(fds[0]);
      if (sentID != tSentID) {
        throw new RuntimeException("sentence ID " + tSentID + " does not match expected ID " + sentID);
      }
      String hypothesis = (fds.length == 4) ? fds[1] : "";
      hypsItself.add(hypothesis);

      String[] words = Regex.spaces.split(hypothesis);
      sentLens.add(words.length);

      HashMap<String, Integer> ngramTbl = new HashMap<String, Integer>();
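      // presumably collects counts of all n-grams of order 1 through bleuOrder in this hypothesis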
      Ngram.getNgrams(ngramTbl, 1, bleuOrder, words);
      ngramTbls.add(ngramTbl);

      // The value of finalIndex is expected to be 3,
      // unless the hyp_itself is empty,
      // in which case finalIndex will be 2.
      int finalIndex = fds.length - 1;
      baselineScores.add(Double.parseDouble(fds[finalIndex]));
    }

    // step-1: get normalized distribution

    // the values in baselineScores are replaced, in place, with normalized probabilities
    // (see the sketch after this listing for one plausible implementation)
    computeNormalizedProbs(baselineScores, scalingFactor);

    List<Double> normalizedProbs = baselineScores; // alias: same list, now holding probabilities

    // === required by google linear corpus gain
    HashMap<String, Double> posteriorCountsTbl = null;
    if (useGoogleLinearCorpusGain) {
      posteriorCountsTbl = new HashMap<String, Double>();
      getGooglePosteriorCounts(ngramTbls, normalizedProbs, posteriorCountsTbl);
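      // (assumption) posteriorCountsTbl now maps each n-gram to its expected count under the
      // normalized nbest distribution, the quantity needed by a linear corpus BLEU gain in the
      // style of Tromble et al. (2008)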
    }

    // step-2: rerank the nbest
    /**
     * TODO: zhifei: the re-ranking currently takes O(n^2), where n is the size of the nbest. We
     * could speed this up significantly (to O(n)) by first estimating a model on the nbest, and
     * then reranking the nbest with the estimated model.
     */
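    // MBR decision rule: choose the hypothesis e maximizing the expected gain
    // sum_{e' in nbest} p(e') * gain(e, e'), where p is the normalized distribution above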
    double bestGain = Double.NEGATIVE_INFINITY; // start from the worst possible gain
    String bestHyp = null;
    List<Double> gains = new ArrayList<Double>();
    for (int i = 0; i < hypsItself.size(); i++) {
      String curHyp = hypsItself.get(i);
      int curHypLen = sentLens.get(i);
      HashMap<String, Integer> curHypNgramTbl = ngramTbls.get(i);
      double curGain = 0;
      if (useGoogleLinearCorpusGain) {
        curGain = computeExpectedLinearCorpusGain(curHypLen, curHypNgramTbl, posteriorCountsTbl);
      } else {
        curGain =
            computeExpectedGain(curHypLen, curHypNgramTbl, ngramTbls, sentLens, normalizedProbs);
      }
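      // computeExpectedGain presumably averages a sentence-level BLEU-like gain of curHyp
      // against every hypothesis in the nbest, weighted by normalizedProbs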

      gains.add(curGain);
      if (i == 0 || curGain > bestGain) { // maximize
        bestGain = curGain;
        bestHyp = curHyp;
      }
    }

    // step-3: output the 1best or nbest
    if (this.produceRerankedNbest) {
      // TODO: sort the list and write the reranked nbest; use Collections.sort(List list,
      // Comparator c)
    } else {
      /*
       * this.out.write(bestHyp); this.out.write("\n"); out.flush();
       */
    }

    System.err.println("best gain: " + bestGain);
    if (null == bestHyp) {
      throw new RuntimeException("MBR-reranked 1-best is null; something must be wrong");
    }
    return bestHyp;
  }
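
The listing calls computeNormalizedProbs without showing its body. Below is a minimal sketch of what it plausibly does, assuming a scaled softmax over the hypotheses' log scores. The method name and the scalingFactor parameter are taken from the listing; the log-sum-exp stabilization is an assumption of this sketch, not necessarily the original implementation.

  import java.util.List;

  /**
   * Sketch (assumed behavior): convert scaled log scores into a normalized distribution,
   * in place: p_i = exp(scalingFactor * score_i) / sum_j exp(scalingFactor * score_j).
   */
  static void computeNormalizedProbs(List<Double> scores, double scalingFactor) {
    // subtract the max before exponentiating (log-sum-exp trick) to avoid overflow
    double max = Double.NEGATIVE_INFINITY;
    for (double s : scores) {
      max = Math.max(max, scalingFactor * s);
    }
    double sum = 0.0;
    for (double s : scores) {
      sum += Math.exp(scalingFactor * s - max);
    }
    double logZ = max + Math.log(sum);
    for (int i = 0; i < scores.size(); i++) {
      scores.set(i, Math.exp(scalingFactor * scores.get(i) - logZ));
    }
  }

With scalingFactor = 1.0 this is a plain softmax over the logP column; values above 1 sharpen the distribution toward the model's 1-best, values below 1 flatten it.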