private static Set<String> extractVocabulary(List<TaggedSentence> taggedSentences) { Set<String> vocabulary = new HashSet<String>(); for (TaggedSentence taggedSentence : taggedSentences) { List<String> words = taggedSentence.getWords(); vocabulary.addAll(words); } return vocabulary; }
private CounterMap<String,String> trainEM(int maxIterations) { Set<String> englishVocab = new HashSet<String>(); Set<String> frenchVocab = new HashSet<String>(); CounterMap<String,String> translations = new CounterMap<String,String>(); englishVocab.add(NULL); int iteration = 0; final double thresholdProb = 0.0001; for (SentencePair sentencePair : trainingSentencePairs) { List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); // add words from list to vocabulary sets englishVocab.addAll(englishWords); frenchVocab.addAll(frenchWords); } System.out.println("Ready"); // We need to initialize translations.getCount(f,e) uniformly // t(f|e) summed over all e in {E + NULL} = 1 final double initialCount = 1.0 / englishVocab.size(); while(iteration < maxIterations) { CounterMap<String,String> counts = new CounterMap<String,String>(); // set count(f|e) to 0 for all e,f Counter<String> totalEnglish = new Counter<String>(); // set total(e) to 0 for all e // E-step: loop over all sentences and update counts for (SentencePair sentencePair : trainingSentencePairs) { List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); Counter<String> sTotalF = new Counter<String>(); // compute normalization constant sTotalF for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); // initialize and compute for English = NULL if (!translations.containsKey(f) && initialize) translations.setCount(f, NULL, initialCount); else if (!translations.containsKey(f)) translations.setCount(f, NULL, thresholdProb); sTotalF.incrementCount(f, translations.getCount(f, NULL)); for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); if (!(translations.getCounter(f)).containsKey(e) && initialize) translations.setCount(f, e, initialCount); else if (!(translations.getCounter(f)).containsKey(e)) translations.setCount(f, e, thresholdProb); sTotalF.incrementCount(f, translations.getCount(f, e)); } } // collect counts in counts and totalEnglish for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); // collect counts for English = NULL double count = translations.getCount(f, NULL) / sTotalF.getCount(f); counts.incrementCount(NULL, f, count); totalEnglish.incrementCount(NULL, count); for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); count = translations.getCount(f, e) / sTotalF.getCount(f); counts.incrementCount(e, f, count); totalEnglish.incrementCount(e, count); } } } // end of E-step System.out.println("Completed E-step"); // M-step: update probabilities with counts from E-step and check for convergence iteration++; for (String e : counts.keySet()) {//englishVocab) { double normalizer = totalEnglish.getCount(e); for (String f : (counts.getCounter(e)).keySet()) {//frenchVocab) { // To speed implementation, we want to update translations only when count / normalizer > threshold double prob = counts.getCount(e, f) / normalizer; if (!initialize) { if (prob > thresholdProb) translations.setCount(f, e, prob); else (translations.getCounter(f)).removeKey(e); } else { translations.setCount(f, e, prob); } } } System.out.println("Completed iteration " + iteration); } // end of M-step System.out.println("Trained!"); return translations; }