/**
 * GT smoothing with least-squares interpolation. This follows the procedure
 * in Jurafsky and Martin, sect. 4.5.3.
 */
public void smoothAndNormalize() {
    Counter<Integer> cntCounter = new Counter<Integer>();
    for (K tok : lm.keySet()) {
        int cnt = (int) lm.getCount(tok);
        cntCounter.incrementCount(cnt);
    }
    final double[] coeffs = runLogSpaceRegression(cntCounter);

    UNK_PROB = cntCounter.getCount(1) / lm.totalCount();

    for (K tok : lm.keySet()) {
        double tokCnt = lm.getCount(tok);
        if (tokCnt <= unkCutoff) // Treat as unknown
            unkTokens.add(tok);
        if (tokCnt <= kCutoff) { // Smooth
            double cSmooth = katzEstimate(cntCounter, tokCnt, coeffs);
            lm.setCount(tok, cSmooth);
        }
    }

    // Normalization: no explicit Counters.normalize(lm) call is needed here,
    // since this counter implementation keeps itself normalized.
}
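/*
 * For reference (my reading of the code above, not a comment from the
 * original source): the mass reserved for unknown tokens is the Good-Turing
 * estimate
 *
 *   P(unk) = N_1 / N,
 *
 * where N_1 is the number of tokens seen exactly once and N is the total
 * token count; counts at or below kCutoff are then replaced by the discounted
 * Katz estimate computed in katzEstimate below.
 */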
private double getDiceCoefficient(String f, String e) {
    double intersection = collocationCountSentences.getCount(f, e);
    double cardinalityF = fCountSentences.getCount(f);
    double cardinalityE = eCountSentences.getCount(e);
    double dice = 2 * intersection / (cardinalityF + cardinalityE);
    return dice;
}
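/*
 * The quantity computed above is the standard Dice coefficient over the sets
 * of sentences containing f and e:
 *
 *   Dice(f, e) = 2 * |S_f n S_e| / (|S_f| + |S_e|)
 *
 * For example (hypothetical counts): if f occurs in 4 sentences, e in 6, and
 * they co-occur in 3, then Dice = 2 * 3 / (4 + 6) = 0.6.
 */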
/* Returns a smoothed estimate of P(word|tag). */
public double scoreTagging(String word, String tag) {
    double p_tag = tagCounter.getCount(tag) / totalTokens;
    double c_word = wordCounter.getCount(word);
    double c_tag_and_word = wordToTagCounters.getCount(word, tag);
    if (c_word < 10) { // rare or unknown
        c_word += 1.0;
        c_tag_and_word += typeTagCounter.getCount(tag) / totalWordTypes;
    }
    double p_word = (1.0 + c_word) / (totalTokens + totalWordTypes);
    double p_tag_given_word = c_tag_and_word / c_word;
    return p_tag_given_word / p_tag * p_word;
}
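/*
 * The return expression is Bayes' rule: with P(t|w) = c(t, w) / c(w),
 *
 *   P(w|t) = P(t|w) * P(w) / P(t),
 *
 * where P(w) is itself add-one smoothed over totalTokens + totalWordTypes,
 * and rare words (c(w) < 10) are shrunk toward the distribution of tags over
 * word types.
 */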
/**
 * Builds a Trellis over a sentence, by starting at the start state and
 * advancing through all legal extensions of each state already in the
 * trellis. You should not have to modify this code (or even read it, really).
 */
private Trellis<State> buildTrellis(List<String> sentence) {
    Trellis<State> trellis = new Trellis<State>();
    trellis.setStartState(State.getStartState());
    State stopState = State.getStopState(sentence.size() + 2);
    trellis.setStopState(stopState);
    Set<State> states = Collections.singleton(State.getStartState());
    for (int position = 0; position <= sentence.size() + 1; position++) {
        Set<State> nextStates = new HashSet<State>();
        for (State state : states) {
            if (state.equals(stopState))
                continue;
            LocalTrigramContext localTrigramContext = new LocalTrigramContext(
                sentence, position, state.getPreviousPreviousTag(), state.getPreviousTag());
            Counter<String> tagScores = localTrigramScorer.getLogScoreCounter(localTrigramContext);
            for (String tag : tagScores.keySet()) {
                double score = tagScores.getCount(tag);
                State nextState = state.getNextState(tag);
                trellis.setTransitionCount(state, nextState, score);
                nextStates.add(nextState);
            }
        }
        states = nextStates;
    }
    return trellis;
}
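/*
 * A decoder over this trellis (e.g., a standard Viterbi search; the decoder
 * itself is not shown here) would maximize the sum of the transition log
 * scores along a path from the start state to the stop state:
 *
 *   delta(0, start) = 0
 *   delta(i, s)     = max over s' of [ delta(i-1, s') + logScore(s' -> s) ]
 */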
public Alignment alignSentencePair(SentencePair sentencePair) {
    Alignment alignment = new Alignment();
    List<String> frenchWords = sentencePair.getFrenchWords();
    List<String> englishWords = sentencePair.getEnglishWords();
    int numFrenchWords = frenchWords.size();
    int numEnglishWords = englishWords.size();
    for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
        String f = frenchWords.get(frenchPosition);
        // Default to the diagonal baseline (or NULL if off the end of the
        // English sentence) in case c(f,e) = 0 for all English words
        int englishMaxPosition = frenchPosition;
        if (englishMaxPosition >= numEnglishWords)
            englishMaxPosition = -1;
        double maxConditionalProb = 0;
        for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
            String e = englishWords.get(englishPosition);
            double conditionalGivenEnglish =
                collocationCounts.getCount(f, e) / eCounts.getCount(e);
            if (conditionalGivenEnglish > maxConditionalProb) {
                maxConditionalProb = conditionalGivenEnglish;
                englishMaxPosition = englishPosition;
            }
        }
        alignment.addAlignment(englishMaxPosition, frenchPosition, true);
    }
    return alignment;
}
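/*
 * Decision rule implemented above: each French word f_j is linked to
 *
 *   a_j = argmax over i of c(f_j, e_i) / c(e_i),
 *
 * i.e., the English position maximizing the MLE of t(f_j | e_i), falling
 * back to the diagonal baseline (or NULL, encoded as -1) when every
 * co-occurrence count is zero.
 */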
private double katzEstimate(Counter<Integer> cnt, double c, double[] coeffs) {
    double nC = cnt.getCount((int) c);
    double nC1 = cnt.getCount(((int) c) + 1);
    if (nC1 == 0.0)
        nC1 = Math.exp(coeffs[0] + (coeffs[1] * (c + 1.0)));
    double n1 = cnt.getCount(1);
    double nK1 = cnt.getCount(((int) kCutoff) + 1);
    if (nK1 == 0.0)
        nK1 = Math.exp(coeffs[0] + (coeffs[1] * (kCutoff + 1.0)));
    double kTerm = (kCutoff + 1.0) * (nK1 / n1);
    double cTerm = (c + 1.0) * (nC1 / nC);
    double cSmooth = (cTerm - (c * kTerm)) / (1.0 - kTerm);
    return cSmooth;
}
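/*
 * This is the Katz discounted count from Jurafsky and Martin, sect. 4.5.3,
 * with counts-of-counts N_c read from cnt (or off the fitted regression line
 * when zero):
 *
 *          (c+1) * N_{c+1} / N_c  -  c * (k+1) * N_{k+1} / N_1
 *   c* =   ---------------------------------------------------
 *                    1  -  (k+1) * N_{k+1} / N_1
 */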
public double getCount(K token) {
    if (!lm.containsKey(token)) {
        throw new RuntimeException(
            "token not in keyset (" + lm.keySet().size() + " keys)");
    }
    return lm.getCount(token);
}
/**
 * Builds a PCFG using the observed counts of binary and unary productions in
 * the training trees to estimate the probabilities for those rules.
 */
public Grammar(List<Tree<String>> trainTrees) {
    Counter<UnaryRule> unaryRuleCounter = new Counter<UnaryRule>();
    Counter<BinaryRule> binaryRuleCounter = new Counter<BinaryRule>();
    Counter<String> symbolCounter = new Counter<String>();
    for (Tree<String> trainTree : trainTrees) {
        tallyTree(trainTree, symbolCounter, unaryRuleCounter, binaryRuleCounter);
    }
    for (UnaryRule unaryRule : unaryRuleCounter.keySet()) {
        double unaryProbability =
            unaryRuleCounter.getCount(unaryRule) / symbolCounter.getCount(unaryRule.getParent());
        unaryRule.setScore(unaryProbability);
        addUnary(unaryRule);
    }
    for (BinaryRule binaryRule : binaryRuleCounter.keySet()) {
        double binaryProbability =
            binaryRuleCounter.getCount(binaryRule) / symbolCounter.getCount(binaryRule.getParent());
        binaryRule.setScore(binaryProbability);
        addBinary(binaryRule);
    }
}
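/*
 * The scores set above are relative-frequency (maximum-likelihood) estimates:
 *
 *   P(A -> B C) = count(A -> B C) / count(A)
 *   P(A -> B)   = count(A -> B)   / count(A)
 *
 * so the probabilities of all rules rewriting a given parent symbol sum to 1.
 */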
private double[] runLogSpaceRegression(Counter<Integer> cntCounter) {
    SimpleRegression reg = new SimpleRegression();
    for (int cnt : cntCounter.keySet()) {
        reg.addData(cnt, Math.log(cntCounter.getCount(cnt)));
    }
    double[] coeffs = new double[] {reg.getIntercept(), reg.getSlope()};
    return coeffs;
}
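// A minimal standalone sketch of the same fit (assuming Apache Commons Math,
// which provides SimpleRegression, is on the classpath). It regresses
// log N_c on c so that a missing count-of-counts can be read off the line as
// N_c ~ exp(intercept + slope * c). The counts below are hypothetical.
import org.apache.commons.math3.stat.regression.SimpleRegression;

public class LogSpaceFitDemo {
    public static void main(String[] args) {
        SimpleRegression reg = new SimpleRegression();
        // Hypothetical counts-of-counts: N_1 = 1000, N_2 = 400, N_3 = 180, N_5 = 60
        double[][] countsOfCounts = {{1, 1000}, {2, 400}, {3, 180}, {5, 60}};
        for (double[] point : countsOfCounts) {
            reg.addData(point[0], Math.log(point[1]));
        }
        // Estimate the unseen N_4 from the fitted line
        double n4 = Math.exp(reg.getIntercept() + reg.getSlope() * 4);
        System.out.printf("Estimated N_4 = %.1f%n", n4);
    }
}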
/**
 * Scores a tagging for a sentence. Note that a tag sequence not accepted by
 * the Markov process should receive a log score of Double.NEGATIVE_INFINITY.
 */
public double scoreTagging(TaggedSentence taggedSentence) {
    double logScore = 0.0;
    List<LabeledLocalTrigramContext> labeledLocalTrigramContexts =
        extractLabeledLocalTrigramContexts(taggedSentence);
    for (LabeledLocalTrigramContext labeledLocalTrigramContext : labeledLocalTrigramContexts) {
        Counter<String> logScoreCounter =
            localTrigramScorer.getLogScoreCounter(labeledLocalTrigramContext);
        String currentTag = labeledLocalTrigramContext.getCurrentTag();
        if (logScoreCounter.containsKey(currentTag)) {
            logScore += logScoreCounter.getCount(currentTag);
        } else {
            logScore += Double.NEGATIVE_INFINITY;
        }
    }
    return logScore;
}
public Counter<String> getLogScoreCounter(LocalTrigramContext localTrigramContext) {
    int position = localTrigramContext.getPosition();
    String word = localTrigramContext.getWords().get(position);
    Counter<String> tagCounter = unknownWordTags;
    if (wordsToTags.keySet().contains(word)) {
        tagCounter = wordsToTags.getCounter(word);
    }
    Set<String> allowedFollowingTags = allowedFollowingTags(
        tagCounter.keySet(),
        localTrigramContext.getPreviousPreviousTag(),
        localTrigramContext.getPreviousTag());
    Counter<String> logScoreCounter = new Counter<String>();
    for (String tag : tagCounter.keySet()) {
        double logScore = Math.log(tagCounter.getCount(tag));
        if (!restrictTrigrams || allowedFollowingTags.isEmpty() || allowedFollowingTags.contains(tag))
            logScoreCounter.setCount(tag, logScore);
    }
    return logScoreCounter;
}
private Duple<CrownOperations.Reason, ISynset> getEstimatedSynonym(
        String targetLemma, Set<String> synonyms, POS pos, String gloss) {

    Counter<ISynset> synsetCounts = new ObjectCounter<ISynset>();
    List<String> lemmasInWn = new ArrayList<String>();
    for (String lemma : synonyms) {
        // Get the WordNet synsets if they exist
        Set<ISynset> senses = WordNetUtils.getSynsets(dict, lemma, pos);
        if (senses.isEmpty())
            continue;
        lemmasInWn.add(lemma);
        synsetCounts.countAll(senses);

        // Get the hypernyms of each synset and count their occurrences too
        for (ISynset synset : senses) {
            // Sanity check that avoids attaching this Entry if its lemma
            // appears anywhere near the synonyms. This check potentially has
            // some false positives, since we might avoid putting the lemma
            // somewhere valid (in which case it would have more than one
            // valid location), but it is used to avoid noisy integration.
            if (WordNetUtils.isAlreadyInWordNet(dict, targetLemma, pos, synset)) {
                return null;
            }
            for (ISynsetID hyper : synset.getRelatedSynsets(Pointer.HYPERNYM)) {
                ISynset hyperSyn = dict.getSynset(hyper);
                if (WordNetUtils.isAlreadyInWordNet(dict, targetLemma, pos, hyperSyn)) {
                    return null;
                }
                synsetCounts.count(hyperSyn);
            }
        }
    }

    // Return null if we couldn't find any of the lemma's synonyms or their
    // hypernyms in WordNet
    if (synsetCounts.items().isEmpty())
        return null;

    // If there was only one lemma in this list in WordNet, try comparing the
    // glosses for just that word to find a match
    if (lemmasInWn.size() == 1) {
        double maxScore = 0;
        ISynset best = null;
        Set<ISynset> candidateSynonymSynsets =
            WordNetUtils.getSynsets(dict, lemmasInWn.get(0), pos);
        for (ISynset candidate : candidateSynonymSynsets) {
            String wnExtendedGloss = WordNetUtils.getGlossWithoutExamples(candidate);
            double score = simFunc.compare(gloss, wnExtendedGloss);
            if (maxScore < score) {
                maxScore = score;
                best = candidate;
            }
        }
        CrownOperations.Reason r = new CrownOperations.Reason(getClass());
        r.set("relation_type", "synonym");
        r.set("heuristic", "single-synonym");
        r.set("max_score", maxScore);
        return new Duple<CrownOperations.Reason, ISynset>(r, best);
    } else {
        // Check whether there were ties for the maximum count
        ISynset mostFreq = synsetCounts.max();
        int mostFreqCount = synsetCounts.getCount(mostFreq);
        List<ISynset> ties = new ArrayList<ISynset>();
        for (ISynset syn : synsetCounts.items()) {
            int c = synsetCounts.getCount(syn);
            if (c == mostFreqCount)
                ties.add(syn);
        }

        // If there was only one synset that had the maximum count, report it
        if (ties.size() == 1) {
            CrownOperations.Reason r = new CrownOperations.Reason(getClass());
            r.set("relation_type", "synonym");
            r.set("heuristic", "unambiguous-max");
            r.set("count", mostFreqCount);
            return new Duple<CrownOperations.Reason, ISynset>(r, mostFreq);
        }
        // Otherwise, try breaking ties between the synsets using gloss
        // similarity
        else {
            double maxScore = 0;
            ISynset best = null;
            for (ISynset candidate : ties) {
                String wnExtendedGloss = WordNetUtils.getGlossWithoutExamples(candidate);
                double score = simFunc.compare(gloss, wnExtendedGloss);
                if (maxScore < score) {
                    maxScore = score;
                    best = candidate;
                }
            }
            CrownOperations.Reason r = new CrownOperations.Reason(getClass());
            r.set("relation_type", "synonym");
            r.set("heuristic", "tied-synonyms");
            r.set("max_score", maxScore);
            return new Duple<CrownOperations.Reason, ISynset>(r, best);
        }
    }
}
private CounterMap<String, String> trainEM(int maxIterations) {
    Set<String> englishVocab = new HashSet<String>();
    Set<String> frenchVocab = new HashSet<String>();

    CounterMap<String, String> translations = new CounterMap<String, String>();
    englishVocab.add(NULL);
    int iteration = 0;
    final double thresholdProb = 0.0001;

    // Add words from each sentence pair to the vocabulary sets
    for (SentencePair sentencePair : trainingSentencePairs) {
        englishVocab.addAll(sentencePair.getEnglishWords());
        frenchVocab.addAll(sentencePair.getFrenchWords());
    }
    System.out.println("Ready");

    // We need to initialize translations.getCount(f,e) uniformly:
    // t(f|e) summed over all e in {E + NULL} = 1
    final double initialCount = 1.0 / englishVocab.size();

    while (iteration < maxIterations) {
        // set count(f|e) to 0 for all e,f
        CounterMap<String, String> counts = new CounterMap<String, String>();
        // set total(e) to 0 for all e
        Counter<String> totalEnglish = new Counter<String>();

        // E-step: loop over all sentences and update counts
        for (SentencePair sentencePair : trainingSentencePairs) {
            List<String> frenchWords = sentencePair.getFrenchWords();
            List<String> englishWords = sentencePair.getEnglishWords();
            int numFrenchWords = frenchWords.size();
            int numEnglishWords = englishWords.size();
            Counter<String> sTotalF = new Counter<String>();

            // Compute the normalization constant sTotalF
            for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
                String f = frenchWords.get(frenchPosition);
                // Initialize and compute for English = NULL
                if (!translations.containsKey(f) && initialize)
                    translations.setCount(f, NULL, initialCount);
                else if (!translations.containsKey(f))
                    translations.setCount(f, NULL, thresholdProb);
                sTotalF.incrementCount(f, translations.getCount(f, NULL));
                for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
                    String e = englishWords.get(englishPosition);
                    if (!(translations.getCounter(f)).containsKey(e) && initialize)
                        translations.setCount(f, e, initialCount);
                    else if (!(translations.getCounter(f)).containsKey(e))
                        translations.setCount(f, e, thresholdProb);
                    sTotalF.incrementCount(f, translations.getCount(f, e));
                }
            }

            // Collect fractional counts in counts and totalEnglish
            for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
                String f = frenchWords.get(frenchPosition);
                // Collect counts for English = NULL
                double count = translations.getCount(f, NULL) / sTotalF.getCount(f);
                counts.incrementCount(NULL, f, count);
                totalEnglish.incrementCount(NULL, count);
                for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
                    String e = englishWords.get(englishPosition);
                    count = translations.getCount(f, e) / sTotalF.getCount(f);
                    counts.incrementCount(e, f, count);
                    totalEnglish.incrementCount(e, count);
                }
            }
        } // end of E-step
        System.out.println("Completed E-step");

        // M-step: update probabilities with counts from the E-step
        iteration++;
        for (String e : counts.keySet()) {
            double normalizer = totalEnglish.getCount(e);
            for (String f : (counts.getCounter(e)).keySet()) {
                // To speed up the implementation, update translations only
                // when count / normalizer exceeds the threshold
                double prob = counts.getCount(e, f) / normalizer;
                if (!initialize) {
                    if (prob > thresholdProb)
                        translations.setCount(f, e, prob);
                    else
                        (translations.getCounter(f)).removeKey(e);
                } else {
                    translations.setCount(f, e, prob);
                }
            }
        } // end of M-step
        System.out.println("Completed iteration " + iteration);
    }

    System.out.println("Trained!");
    return translations;
}
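/*
 * For reference, the IBM Model 1 updates implemented above:
 *
 *   E-step:  delta(f, e) = t(f|e) / sum over e' in {sentence + NULL} of t(f|e')
 *            count(e, f) += delta(f, e);   total(e) += delta(f, e)
 *   M-step:  t(f|e) = count(e, f) / total(e)
 *
 * The thresholdProb pruning (dropping entries with t(f|e) <= 0.0001 when
 * `initialize` is false) is a speed optimization and deviates slightly from
 * the exact EM update.
 */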
public double getProb(K token) {
    if (unkTokens.contains(token) || !lm.containsKey(token))
        return UNK_PROB;
    return lm.getCount(token);
}