// Recursively tallies symbol and rule counts from a (binarized) training tree;
// leaves and preterminals contribute nothing to the grammar.
private void tallyTree(
    Tree<String> tree,
    Counter<String> symbolCounter,
    Counter<UnaryRule> unaryRuleCounter,
    Counter<BinaryRule> binaryRuleCounter) {
  if (tree.isLeaf()) return;
  if (tree.isPreTerminal()) return;
  if (tree.getChildren().size() == 1) {
    UnaryRule unaryRule = makeUnaryRule(tree);
    symbolCounter.incrementCount(tree.getLabel(), 1.0);
    unaryRuleCounter.incrementCount(unaryRule, 1.0);
  }
  if (tree.getChildren().size() == 2) {
    BinaryRule binaryRule = makeBinaryRule(tree);
    symbolCounter.incrementCount(tree.getLabel(), 1.0);
    binaryRuleCounter.incrementCount(binaryRule, 1.0);
  }
  if (tree.getChildren().size() < 1 || tree.getChildren().size() > 2) {
    throw new RuntimeException(
        "Attempted to construct a Grammar with an illegal tree: " + tree);
  }
  for (Tree<String> child : tree.getChildren()) {
    tallyTree(child, symbolCounter, unaryRuleCounter, binaryRuleCounter);
  }
}
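// Illustrative sketch, not part of the original class: once tallyTree has been run over the
// training treebank, rule scores are typically the relative-frequency estimates
// P(A -> B C) = count(A -> B C) / count(A). The getParent() and setScore() accessors on the
// rule classes are assumptions here, not confirmed parts of the original API.
private void scoreRulesSketch(
    Counter<String> symbolCounter,
    Counter<UnaryRule> unaryRuleCounter,
    Counter<BinaryRule> binaryRuleCounter) {
  for (UnaryRule unaryRule : unaryRuleCounter.keySet()) {
    // count(A -> B) / count(A)
    unaryRule.setScore(
        unaryRuleCounter.getCount(unaryRule) / symbolCounter.getCount(unaryRule.getParent()));
  }
  for (BinaryRule binaryRule : binaryRuleCounter.keySet()) {
    // count(A -> B C) / count(A)
    binaryRule.setScore(
        binaryRuleCounter.getCount(binaryRule) / symbolCounter.getCount(binaryRule.getParent()));
  }
}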
// Tallies one (word, tag) observation; the first time a word is seen, its tag also updates
// the type-level tag counts used for unknown-word handling.
private void tallyTagging(String word, String tag) {
  if (!isKnown(word)) {
    totalWordTypes += 1.0;
    typeTagCounter.incrementCount(tag, 1.0);
  }
  totalTokens += 1.0;
  tagCounter.incrementCount(tag, 1.0);
  wordCounter.incrementCount(word, 1.0);
  wordToTagCounters.incrementCount(word, tag, 1.0);
}
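// Illustrative sketch (not part of the original source) of how these tallies are commonly
// turned into an emission score P(word | tag): rare and unknown words borrow probability
// mass from the tag distribution over word types. The method name, the rarity cutoff, and
// the exact smoothing are assumptions rather than the author's code.
private double scoreTaggingSketch(String word, String tag) {
  double pTag = tagCounter.getCount(tag) / totalTokens;
  double cWord = wordCounter.getCount(word);
  double cWordAndTag = wordToTagCounters.getCount(word, tag);
  if (cWord < 10.0) {
    // Rare or unknown word: add one pseudo-observation distributed like the
    // type-level tag counts collected in tallyTagging.
    cWord += 1.0;
    cWordAndTag += typeTagCounter.getCount(tag) / totalWordTypes;
  }
  double pWord = (1.0 + cWord) / (totalTokens + totalWordTypes);
  double pTagGivenWord = cWordAndTag / cWord;
  // Bayes inversion: P(word | tag) = P(tag | word) * P(word) / P(tag).
  return pTagGivenWord / pTag * pWord;
}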
/**
 * GT smoothing with least-squares interpolation. This follows the procedure in Jurafsky and
 * Martin, sect. 4.5.3.
 */
public void smoothAndNormalize() {
  // Build the counts-of-counts table N_c: how many tokens were seen exactly c times.
  Counter<Integer> cntCounter = new Counter<Integer>();
  for (K tok : lm.keySet()) {
    int cnt = (int) lm.getCount(tok);
    cntCounter.incrementCount(cnt);
  }
  // Fit the counts-of-counts by least squares in log space.
  final double[] coeffs = runLogSpaceRegression(cntCounter);
  // Good-Turing mass reserved for unseen events: N_1 / N.
  UNK_PROB = cntCounter.getCount(1) / lm.totalCount();
  for (K tok : lm.keySet()) {
    double tokCnt = lm.getCount(tok);
    if (tokCnt <= unkCutoff) // Treat as unknown
      unkTokens.add(tok);
    if (tokCnt <= kCutoff) { // Smooth
      double cSmooth = katzEstimate(cntCounter, tokCnt, coeffs);
      lm.setCount(tok, cSmooth);
    }
  }
  // Normalize
  // Counters.normalize(lm); // not needed: this Counter implementation keeps itself normalized
}
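// Illustrative sketch of the Good-Turing adjusted count that the katzEstimate call above
// relies on; the real katzEstimate also receives the empirical counts-of-counts and may mix
// them with the regression, so this sketch (regressed values only) is an assumption. The
// counts-of-counts are smoothed as N_c ~= exp(a + b * ln c) using the fitted coefficients,
// and the adjusted count is c* = (c + 1) * N_{c+1} / N_c.
private double katzEstimateSketch(double cnt, double[] coeffs) {
  double smoothedNc = Math.exp(coeffs[0] + coeffs[1] * Math.log(cnt));
  double smoothedNcPlusOne = Math.exp(coeffs[0] + coeffs[1] * Math.log(cnt + 1.0));
  return (cnt + 1.0) * smoothedNcPlusOne / smoothedNc;
}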
public void train(List<LabeledLocalTrigramContext> labeledLocalTrigramContexts) {
  // Collect word-tag counts and record which tag trigrams were observed.
  for (LabeledLocalTrigramContext labeledLocalTrigramContext : labeledLocalTrigramContexts) {
    String word = labeledLocalTrigramContext.getCurrentWord();
    String tag = labeledLocalTrigramContext.getCurrentTag();
    if (!wordsToTags.keySet().contains(word)) {
      // Word is currently unknown, so tally its tag in the unknown-word tag counter.
      unknownWordTags.incrementCount(tag, 1.0);
    }
    wordsToTags.incrementCount(word, tag, 1.0);
    seenTagTrigrams.add(
        makeTrigramString(
            labeledLocalTrigramContext.getPreviousPreviousTag(),
            labeledLocalTrigramContext.getPreviousTag(),
            labeledLocalTrigramContext.getCurrentTag()));
  }
  // Turn the raw counts into conditional distributions P(tag | word) and P(tag | unknown word).
  wordsToTags = Counters.conditionalNormalize(wordsToTags);
  unknownWordTags = Counters.normalize(unknownWordTags);
}
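// Illustrative sketch (not in the original): at decode time, a candidate tag for a word can be
// scored from the normalized tables built above, falling back to the unknown-word distribution
// for unseen words and restricting to tag trigrams observed in training. The method name,
// signature, and the hard trigram restriction are assumptions.
private double scoreTagSketch(
    String previousPreviousTag, String previousTag, String word, String tag) {
  Counter<String> tagDistribution =
      wordsToTags.keySet().contains(word) ? wordsToTags.getCounter(word) : unknownWordTags;
  boolean seenTrigram =
      seenTagTrigrams.contains(makeTrigramString(previousPreviousTag, previousTag, tag));
  // Disallow trigrams never seen in training; otherwise return log P(tag | word).
  if (!seenTrigram) return Double.NEGATIVE_INFINITY;
  return Math.log(tagDistribution.getCount(tag));
}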
private CounterMap<String, String> trainEM(int maxIterations) {
  Set<String> englishVocab = new HashSet<String>();
  Set<String> frenchVocab = new HashSet<String>();
  CounterMap<String, String> translations = new CounterMap<String, String>();
  englishVocab.add(NULL);
  int iteration = 0;
  final double thresholdProb = 0.0001;

  // Add words from each sentence pair to the vocabulary sets.
  for (SentencePair sentencePair : trainingSentencePairs) {
    List<String> frenchWords = sentencePair.getFrenchWords();
    List<String> englishWords = sentencePair.getEnglishWords();
    englishVocab.addAll(englishWords);
    frenchVocab.addAll(frenchWords);
  }
  System.out.println("Ready");

  // Initialize translations.getCount(f, e) uniformly:
  // t(f|e) summed over all e in {E + NULL} must equal 1.
  final double initialCount = 1.0 / englishVocab.size();

  while (iteration < maxIterations) {
    CounterMap<String, String> counts = new CounterMap<String, String>(); // count(f|e), 0 for all e, f
    Counter<String> totalEnglish = new Counter<String>(); // total(e), 0 for all e

    // E-step: loop over all sentences and accumulate expected counts.
    for (SentencePair sentencePair : trainingSentencePairs) {
      List<String> frenchWords = sentencePair.getFrenchWords();
      List<String> englishWords = sentencePair.getEnglishWords();
      int numFrenchWords = frenchWords.size();
      int numEnglishWords = englishWords.size();
      Counter<String> sTotalF = new Counter<String>();

      // Compute the normalization constant sTotalF for each French word.
      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
        String f = frenchWords.get(frenchPosition);
        // Initialize and accumulate for English = NULL.
        if (!translations.containsKey(f) && initialize)
          translations.setCount(f, NULL, initialCount);
        else if (!translations.containsKey(f))
          translations.setCount(f, NULL, thresholdProb);
        sTotalF.incrementCount(f, translations.getCount(f, NULL));
        for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
          String e = englishWords.get(englishPosition);
          if (!(translations.getCounter(f)).containsKey(e) && initialize)
            translations.setCount(f, e, initialCount);
          else if (!(translations.getCounter(f)).containsKey(e))
            translations.setCount(f, e, thresholdProb);
          sTotalF.incrementCount(f, translations.getCount(f, e));
        }
      }

      // Collect expected counts in counts and totalEnglish.
      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
        String f = frenchWords.get(frenchPosition);
        // Expected count for English = NULL.
        double count = translations.getCount(f, NULL) / sTotalF.getCount(f);
        counts.incrementCount(NULL, f, count);
        totalEnglish.incrementCount(NULL, count);
        for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
          String e = englishWords.get(englishPosition);
          count = translations.getCount(f, e) / sTotalF.getCount(f);
          counts.incrementCount(e, f, count);
          totalEnglish.incrementCount(e, count);
        }
      }
    } // end of E-step
    System.out.println("Completed E-step");

    // M-step: update probabilities with the expected counts from the E-step.
    iteration++;
    for (String e : counts.keySet()) { // could equivalently iterate over englishVocab
      double normalizer = totalEnglish.getCount(e);
      for (String f : (counts.getCounter(e)).keySet()) { // could equivalently iterate over frenchVocab
        double prob = counts.getCount(e, f) / normalizer;
        if (!initialize) {
          // To speed up the implementation, keep an entry only when its probability
          // exceeds the threshold; otherwise drop it from the table.
          if (prob > thresholdProb)
            translations.setCount(f, e, prob);
          else
            (translations.getCounter(f)).removeKey(e);
        } else {
          translations.setCount(f, e, prob);
        }
      }
    } // end of M-step
    System.out.println("Completed iteration " + iteration);
  }
  System.out.println("Trained!");
  return translations;
}
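// Illustrative sketch (not from the original): with the learned table t(f|e) stored in
// translations, a Model 1-style alignment picks, for each French word, the English position
// (or NULL) with the highest translation probability. The Alignment class and its
// addAlignment(englishPosition, frenchPosition, sure) method are assumed API here.
private Alignment alignSketch(SentencePair sentencePair, CounterMap<String, String> translations) {
  Alignment alignment = new Alignment();
  List<String> frenchWords = sentencePair.getFrenchWords();
  List<String> englishWords = sentencePair.getEnglishWords();
  for (int frenchPosition = 0; frenchPosition < frenchWords.size(); frenchPosition++) {
    String f = frenchWords.get(frenchPosition);
    int bestEnglishPosition = -1; // -1 stands for alignment to NULL
    double bestProb = translations.getCount(f, NULL);
    for (int englishPosition = 0; englishPosition < englishWords.size(); englishPosition++) {
      double prob = translations.getCount(f, englishWords.get(englishPosition));
      if (prob > bestProb) {
        bestProb = prob;
        bestEnglishPosition = englishPosition;
      }
    }
    // Only emit an alignment edge when the best match is a real English word, not NULL.
    if (bestEnglishPosition >= 0) {
      alignment.addAlignment(bestEnglishPosition, frenchPosition, true);
    }
  }
  return alignment;
}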
public void incrementCount(K token) { lm.incrementCount(token); }