public double scoreSentence(List<String> sentence) { double p = 0; int M = this.baseModel.getLmOrder(); List<String> normalizedSentence = new ArrayList<String>(); final List<String> sentenceWithBounds = new BoundedList<String>( sentence, baseModel.getWordIndexer().getStartSymbol(), baseModel.getWordIndexer().getEndSymbol()); final int lmOrder = baseModel.getLmOrder() + 1; float sentenceScore = 0.0f; float previousScore = 0.0f; // beginnetje for (int i = 1; i < lmOrder - 1 && i <= sentenceWithBounds.size() + 1; ++i) { final List<String> ngram = sentenceWithBounds.subList(-1, i); final float scoreNgram = (float) getLogProb(ngram, previousScore); sentenceScore += scoreNgram; previousScore = scoreNgram; } // rest for (int i = lmOrder - 1; i < sentenceWithBounds.size() + 2; ++i) { final List<String> ngram = sentenceWithBounds.subList(i - lmOrder, i); final float scoreNgram = (float) getLogProb(ngram, previousScore); sentenceScore += scoreNgram; previousScore = scoreNgram; } return sentenceScore; }
// this does NOT work correctly... (stop this line) // public float getLogProb( List<String> ngram, int boundaryPosition, double previousScore) // bp zit op index waarna splitsing { float p = 0; float basep = baseModel.getLogProb(ngram); float baseP = (float) Math.exp(basep); if (boundaryPosition == ngram.size() - 1) // logP=0 to defer LM scoring until the next token (dit kan dus niet meneer) { System.err.println("Hyphen last in: " + ngram); return basep; } if (boundaryPosition < 0) // no line boundary: refer to base model { System.err.println("Nothing special: " + ngram); return basep; } // we need to join parts String p1 = ngram.get(boundaryPosition); String p2 = ngram.get(boundaryPosition + 1); String P1 = p1.replaceAll("-$", ""); String w = P1 + p2; double pHyph = hyphenationDictionary.getHyphenationProbability(w, p1, p2); List<String> joined = new ArrayList<String>(); for (int i = 0; i < boundaryPosition; i++) joined.add(ngram.get(i)); joined.add(w); for (int i = boundaryPosition + 2; i < ngram.size(); i++) joined.add(ngram.get(i)); System.err.println("Joined ngram: " + joined); double joinedP = Math.exp(baseModel.getLogProb(joined)); if (boundaryPosition == ngram.size() - 2) { System.err.println("Penultimate, dividing " + joinedP + " by " + Math.exp(previousScore)); joinedP /= Math.exp(previousScore); } System.err.println( ngram + " joined: (" + boundaryPosition + ") " + joinedP + " apart: " + baseP + " pHyph " + pHyph); return (float) Math.log((1 - pHyph) * baseP + pHyph * joinedP); }