public Alignment alignSentencePair(SentencePair sentencePair) { Alignment alignment = new Alignment(); List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); int englishMaxPosition = frenchPosition; if (englishMaxPosition >= numEnglishWords) englishMaxPosition = -1; // map French word to BASELINE if c(f,e) = 0 for all English words double maxDice = 0; for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); double dice = getDiceCoefficient(f,e); if (dice > maxDice) { maxDice = dice; englishMaxPosition = englishPosition; } } alignment.addAlignment(englishMaxPosition, frenchPosition, true); } return alignment; }
/**
 * Scores an aligner against reference alignments and prints precision,
 * recall and alignment error rate (AER) to standard output.
 *
 * Every (English, French) position pair of every test sentence is inspected:
 * a proposed link counts towards precision when the reference marks it
 * possible, and towards recall when the reference marks it sure.
 *
 * @param wordAligner       the aligner under evaluation
 * @param testSentencePairs sentence pairs to align
 * @param testAlignments    reference alignments keyed by sentence ID
 * @param verbose           when true, prints a rendered grid per sentence
 * @throws RuntimeException if a sentence has no reference alignment
 */
private static void test(WordAligner wordAligner, List<SentencePair> testSentencePairs, Map<Integer, Alignment> testAlignments, boolean verbose) {
    int hitsOnSure = 0;      // proposed links that are sure in the reference
    int hitsOnPossible = 0;  // proposed links that are possible in the reference
    int referenceSure = 0;   // sure links in the reference
    int proposedTotal = 0;   // links proposed by the aligner

    for (SentencePair pair : testSentencePairs) {
        Alignment guess = wordAligner.alignSentencePair(pair);
        Alignment gold = testAlignments.get(pair.getSentenceID());
        if (gold == null) {
            throw new RuntimeException("No reference alignment found for sentenceID "+pair.getSentenceID());
        }
        if (verbose) {
            System.out.println("Alignment:\n"+Alignment.render(gold,guess,pair));
        }

        for (int f = 0; f < pair.getFrenchWords().size(); f++) {
            for (int e = 0; e < pair.getEnglishWords().size(); e++) {
                boolean isProposed = guess.containsSureAlignment(e, f);
                boolean isSure = gold.containsSureAlignment(e, f);
                boolean isPossible = gold.containsPossibleAlignment(e, f);

                if (isProposed) {
                    proposedTotal++;
                    if (isSure) {
                        hitsOnSure++;
                    }
                    if (isPossible) {
                        hitsOnPossible++;
                    }
                }
                if (isSure) {
                    referenceSure++;
                }
            }
        }
    }

    double precision = hitsOnPossible / (double) proposedTotal;
    double recall = hitsOnSure / (double) referenceSure;
    double aer = 1.0 - (hitsOnSure + hitsOnPossible) / (double) (referenceSure + proposedTotal);

    System.out.println("Precision: "+precision);
    System.out.println("Recall: "+recall);
    System.out.println("AER: "+aer);
}
public Alignment alignSentencePair(SentencePair sentencePair) { Alignment alignment = new Alignment(); List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); // Model 1 assumes all alignments are equally likely // So we can just take the argMax of t(f|e) to get the englishMaxPosition for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); int englishMaxPosition = -1; double maxTranslationProb = translationProbs.getCount(f, NULL); for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); double translationProb = translationProbs.getCount(f, e); if (translationProb > maxTranslationProb) { maxTranslationProb = translationProb; englishMaxPosition = englishPosition; } } alignment.addAlignment(englishMaxPosition, frenchPosition, true); } return alignment; }
public Alignment alignSentencePair(SentencePair sentencePair) { Alignment alignment = new Alignment(); List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); int englishMaxPosition = frenchPosition; if (englishMaxPosition >= numEnglishWords) englishMaxPosition = -1; // map French word to BASELINE if c(f,e) = 0 for all English words double maxConditionalProb = 0; for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); double conditionalGivenEnglish = collocationCounts.getCount(f, e) / (eCounts.getCount(e)); if (conditionalGivenEnglish > maxConditionalProb) { maxConditionalProb = conditionalGivenEnglish; englishMaxPosition = englishPosition; } } alignment.addAlignment(englishMaxPosition, frenchPosition, true); } return alignment; }
/**
 * Baseline aligner: links French position i to English position i.
 *
 * French positions beyond the end of the English sentence are linked to -1.
 *
 * @param sentencePair the French/English sentence pair to align
 * @return an alignment holding exactly one entry per French position
 */
public Alignment alignSentencePair(SentencePair sentencePair) {
    Alignment diagonal = new Alignment();
    final int frenchLength = sentencePair.getFrenchWords().size();
    final int englishLength = sentencePair.getEnglishWords().size();

    int fIdx = 0;
    while (fIdx < frenchLength) {
        int eIdx = (fIdx < englishLength) ? fIdx : -1;
        diagonal.addAlignment(eIdx, fIdx, true);
        fIdx++;
    }
    return diagonal;
}
/**
 * Aligns every test sentence pair and writes the proposed links to a file.
 *
 * One output line is written per sentence pair. Each proposed link is written
 * as "frenchPosition-englishPosition" followed by a space, in French-major
 * order.
 *
 * @param wordAligner       the aligner used to propose alignments
 * @param testSentencePairs sentence pairs to align
 * @param path              destination file; created or overwritten
 * @throws IOException if the file cannot be opened, written or closed
 */
private static void predict(WordAligner wordAligner, List<SentencePair> testSentencePairs, String path) throws IOException {
    BufferedWriter writer = new BufferedWriter(new FileWriter(path));
    // try/finally guarantees the file handle is released even when aligning
    // or writing fails part-way through; previously the writer leaked.
    try {
        for (SentencePair sentencePair : testSentencePairs) {
            Alignment proposedAlignment = wordAligner.alignSentencePair(sentencePair);
            int numFrenchWords = sentencePair.getFrenchWords().size();
            int numEnglishWords = sentencePair.getEnglishWords().size();
            for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
                for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
                    if (proposedAlignment.containsSureAlignment(englishPosition, frenchPosition)) {
                        writer.write(frenchPosition + "-" + englishPosition + " ");
                    }
                }
            }
            writer.write("\n");
        }
    } finally {
        writer.close();
    }
}
private void trainCounters() { for (SentencePair sentencePair : trainingSentencePairs) { List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); //fCounts.incrementAll(frenchWords, 1.0); // won't affect the argMax eCounts.incrementAll(englishWords, 1.0); for (String f: frenchWords) { for (String e: englishWords) collocationCounts.incrementCount(f, e, 1.0); } } System.out.println("Trained!"); }
/**
 * Collects sentence-level counts from the training sentence pairs.
 *
 * Words are de-duplicated within each sentence first, so every counter
 * records the number of sentence pairs in which a word (or a French/English
 * word combination) occurs, not the number of tokens.
 */
private void trainCounters() {
    for (SentencePair pair : trainingSentencePairs) {
        // Distinct word types of this sentence pair.
        Set<String> frenchTypes = new HashSet<String>(pair.getFrenchWords());
        Set<String> englishTypes = new HashSet<String>(pair.getEnglishWords());

        fCountSentences.incrementAll(frenchTypes, 1.0);
        eCountSentences.incrementAll(englishTypes, 1.0);

        for (String frenchWord : frenchTypes) {
            for (String englishWord : englishTypes) {
                collocationCountSentences.incrementCount(frenchWord, englishWord, 1.0);
            }
        }
    }
    System.out.println("Trained!");
}
/**
 * Renders a reference alignment and a proposed alignment as a text grid.
 *
 * There is one row per French word and one three-character cell per English
 * word. A cell is drawn as "[ ]" when the reference marks the link sure,
 * "( )" when it marks it possible, and blank otherwise; the middle character
 * is '#' when the proposed alignment contains the link. Each row ends with
 * "| " and the French word. Below a separator rule, the English words are
 * printed vertically, one letter per row, under their columns.
 *
 * @param reference    the reference alignment
 * @param proposed     the proposed alignment to overlay
 * @param sentencePair the sentence pair both alignments refer to
 * @return the rendered multi-line grid
 */
public static String render(Alignment reference, Alignment proposed, SentencePair sentencePair) {
    List<String> frenchWords = sentencePair.getFrenchWords();
    List<String> englishWords = sentencePair.getEnglishWords();
    int numFrenchWords = frenchWords.size();
    int numEnglishWords = englishWords.size();
    StringBuilder sb = new StringBuilder();

    // Grid body: one row per French word.
    for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
        for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
            boolean sure = reference.containsSureAlignment(englishPosition, frenchPosition);
            boolean possible = reference.containsPossibleAlignment(englishPosition, frenchPosition);
            char proposedChar = proposed.containsSureAlignment(englishPosition, frenchPosition) ? '#' : ' ';
            if (sure) {
                sb.append('[').append(proposedChar).append(']');
            } else if (possible) {
                sb.append('(').append(proposedChar).append(')');
            } else {
                sb.append(' ').append(proposedChar).append(' ');
            }
        }
        sb.append("| ");
        sb.append(frenchWords.get(frenchPosition));
        sb.append('\n');
    }

    // Separator rule, three dashes per English column.
    for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
        sb.append("---");
    }
    sb.append("'\n");

    // English words written top-to-bottom; stop at the first row in which no
    // word has a letter left.
    boolean printed = true;
    int index = 0;
    while (printed) {
        printed = false;
        StringBuilder lineSB = new StringBuilder();
        for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
            String englishWord = englishWords.get(englishPosition);
            if (englishWord.length() > index) {
                printed = true;
                lineSB.append(' ');
                lineSB.append(englishWord.charAt(index));
                lineSB.append(' ');
            } else {
                // Pad a full three-character cell so that letters of longer
                // words stay underneath their own grid column.
                lineSB.append("   ");
            }
        }
        index += 1;
        if (printed) {
            sb.append(lineSB);
            sb.append('\n');
        }
    }
    return sb.toString();
}
private CounterMap<String,String> trainEM(int maxIterations) { Set<String> englishVocab = new HashSet<String>(); Set<String> frenchVocab = new HashSet<String>(); CounterMap<String,String> translations = new CounterMap<String,String>(); englishVocab.add(NULL); int iteration = 0; final double thresholdProb = 0.0001; for (SentencePair sentencePair : trainingSentencePairs) { List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); // add words from list to vocabulary sets englishVocab.addAll(englishWords); frenchVocab.addAll(frenchWords); } System.out.println("Ready"); // We need to initialize translations.getCount(f,e) uniformly // t(f|e) summed over all e in {E + NULL} = 1 final double initialCount = 1.0 / englishVocab.size(); while(iteration < maxIterations) { CounterMap<String,String> counts = new CounterMap<String,String>(); // set count(f|e) to 0 for all e,f Counter<String> totalEnglish = new Counter<String>(); // set total(e) to 0 for all e // E-step: loop over all sentences and update counts for (SentencePair sentencePair : trainingSentencePairs) { List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); Counter<String> sTotalF = new Counter<String>(); // compute normalization constant sTotalF for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); // initialize and compute for English = NULL if (!translations.containsKey(f) && initialize) translations.setCount(f, NULL, initialCount); else if (!translations.containsKey(f)) translations.setCount(f, NULL, thresholdProb); sTotalF.incrementCount(f, translations.getCount(f, NULL)); for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); if 
(!(translations.getCounter(f)).containsKey(e) && initialize) translations.setCount(f, e, initialCount); else if (!(translations.getCounter(f)).containsKey(e)) translations.setCount(f, e, thresholdProb); sTotalF.incrementCount(f, translations.getCount(f, e)); } } // collect counts in counts and totalEnglish for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); // collect counts for English = NULL double count = translations.getCount(f, NULL) / sTotalF.getCount(f); counts.incrementCount(NULL, f, count); totalEnglish.incrementCount(NULL, count); for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); count = translations.getCount(f, e) / sTotalF.getCount(f); counts.incrementCount(e, f, count); totalEnglish.incrementCount(e, count); } } } // end of E-step System.out.println("Completed E-step"); // M-step: update probabilities with counts from E-step and check for convergence iteration++; for (String e : counts.keySet()) {//englishVocab) { double normalizer = totalEnglish.getCount(e); for (String f : (counts.getCounter(e)).keySet()) {//frenchVocab) { // To speed implementation, we want to update translations only when count / normalizer > threshold double prob = counts.getCount(e, f) / normalizer; if (!initialize) { if (prob > thresholdProb) translations.setCount(f, e, prob); else (translations.getCounter(f)).removeKey(e); } else { translations.setCount(f, e, prob); } } } System.out.println("Completed iteration " + iteration); } // end of M-step System.out.println("Trained!"); return translations; }