  public String processOneSent(List<String> nbest, int sentID) {
    System.err.println("Now process sentence " + sentID);

    // step-0: preprocess
    // assumption: each hyp has the format:
    // "sent_id ||| hyp_itself ||| feature_scores ||| linear-combination-of-feature-scores
    // (this should be logP)"
    List<String> hypsItself = new ArrayList<String>();
    // ArrayList<String> l_feat_scores = new ArrayList<String>();
    List<Double> baselineScores = new ArrayList<Double>(); // linear combination of all baseline
                                                           // features
    List<HashMap<String, Integer>> ngramTbls = new ArrayList<HashMap<String, Integer>>();
    List<Integer> sentLens = new ArrayList<Integer>();

    for (String hyp : nbest) {
      String[] fds = Regex.threeBarsWithSpace.split(hyp);
      int tSentID = Integer.parseInt(fds[0]);
      if (sentID != tSentID) {
        throw new RuntimeException("sentence_id does not match");
      }
      String hypothesis = (fds.length == 4) ? fds[1] : "";
      hypsItself.add(hypothesis);

      String[] words = Regex.spaces.split(hypothesis);
      sentLens.add(words.length);

      HashMap<String, Integer> ngramTbl = new HashMap<String, Integer>();
      Ngram.getNgrams(ngramTbl, 1, bleuOrder, words);
      ngramTbls.add(ngramTbl);
      // l_feat_scores.add(fds[2]);

      // The value of finalIndex is expected to be 3,
      // unless the hyp_itself is empty,
      // in which case finalIndex will be 2.
      int finalIndex = fds.length - 1;
      baselineScores.add(Double.parseDouble(fds[finalIndex]));
    }

    // step-1: get normalized distribution

    /** The values in baselineScores are replaced in place by normalized probabilities. */
    computeNormalizedProbs(baselineScores, scalingFactor);

    List<Double> normalizedProbs = baselineScores;

    // === required by the Google linear corpus gain
    HashMap<String, Double> posteriorCountsTbl = null;
    if (useGoogleLinearCorpusGain) {
      posteriorCountsTbl = new HashMap<String, Double>();
      getGooglePosteriorCounts(ngramTbls, normalizedProbs, posteriorCountsTbl);
    }

    // step-2: rerank the nbest
    /**
     * TODO: zhifei: the re-ranking currently takes O(n^2), where n is the size of the nbest. We
     * can speed this up significantly (to O(n)) by first estimating a model on the nbest, and then
     * reranking the nbest with the estimated model.
     */
    double bestGain = -1000000000; // set as the worst gain
    String bestHyp = null;
    List<Double> gains = new ArrayList<Double>();
    for (int i = 0; i < hypsItself.size(); i++) {
      String curHyp = hypsItself.get(i);
      int curHypLen = sentLens.get(i);
      HashMap<String, Integer> curHypNgramTbl = ngramTbls.get(i);
      // double cur_gain = computeGain(cur_hyp, l_hyp_itself, l_normalized_probs);
      double curGain = 0;
      if (useGoogleLinearCorpusGain) {
        curGain = computeExpectedLinearCorpusGain(curHypLen, curHypNgramTbl, posteriorCountsTbl);
      } else {
        curGain =
            computeExpectedGain(curHypLen, curHypNgramTbl, ngramTbls, sentLens, normalizedProbs);
      }

      gains.add(curGain);
      if (i == 0 || curGain > bestGain) { // maximize
        bestGain = curGain;
        bestHyp = curHyp;
      }
    }

    // step-3: output the 1best or nbest
    if (this.produceRerankedNbest) {
      // TODO: sort the list and write the reranked nbest; use Collections.sort(List list,
      // Comparator c)
    } else {
      /*
       * this.out.write(best_hyp); this.out.write("\n"); out.flush();
       */
    }

    System.err.println("best gain: " + bestGain);
    if (null == bestHyp) {
      throw new RuntimeException("mbr reranked one best is null, must be wrong");
    }
    return bestHyp;
  }
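
  /*
   * A minimal sketch (not this class's implementation) of what step-1 above is assumed to do:
   * computeNormalizedProbs is expected to turn the baseline log-linear scores into a normalized
   * distribution via a scaled softmax, p_i = exp(scale * s_i) / sum_j exp(scale * s_j), shifting
   * by the maximum scaled score first for numerical stability. The method name and signature
   * below are illustrative only and are not part of this class.
   */
  private static void scaledSoftmaxSketch(List<Double> scores, double scale) {
    // shift by the maximum scaled score so the exponentials cannot overflow
    double max = Double.NEGATIVE_INFINITY;
    for (double s : scores) {
      max = Math.max(max, scale * s);
    }
    // exponentiate in place and accumulate the normalizer
    double sum = 0;
    for (int i = 0; i < scores.size(); i++) {
      double p = Math.exp(scale * scores.get(i) - max);
      scores.set(i, p);
      sum += p;
    }
    // normalize in place so the values sum to one
    for (int i = 0; i < scores.size(); i++) {
      scores.set(i, scores.get(i) / sum);
    }
  }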