public Alignment alignSentencePair(SentencePair sentencePair) { Alignment alignment = new Alignment(); List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); // Model 1 assumes all alignments are equally likely // So we can just take the argMax of t(f|e) to get the englishMaxPosition for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); int englishMaxPosition = -1; double maxTranslationProb = translationProbs.getCount(f, NULL); for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); double translationProb = translationProbs.getCount(f, e); if (translationProb > maxTranslationProb) { maxTranslationProb = translationProb; englishMaxPosition = englishPosition; } } alignment.addAlignment(englishMaxPosition, frenchPosition, true); } return alignment; }
public Alignment alignSentencePair(SentencePair sentencePair) { Alignment alignment = new Alignment(); List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); int englishMaxPosition = frenchPosition; if (englishMaxPosition >= numEnglishWords) englishMaxPosition = -1; // map French word to BASELINE if c(f,e) = 0 for all English words double maxDice = 0; for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); double dice = getDiceCoefficient(f,e); if (dice > maxDice) { maxDice = dice; englishMaxPosition = englishPosition; } } alignment.addAlignment(englishMaxPosition, frenchPosition, true); } return alignment; }
public Alignment alignSentencePair(SentencePair sentencePair) { Alignment alignment = new Alignment(); List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); int englishMaxPosition = frenchPosition; if (englishMaxPosition >= numEnglishWords) englishMaxPosition = -1; // map French word to BASELINE if c(f,e) = 0 for all English words double maxConditionalProb = 0; for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); double conditionalGivenEnglish = collocationCounts.getCount(f, e) / (eCounts.getCount(e)); if (conditionalGivenEnglish > maxConditionalProb) { maxConditionalProb = conditionalGivenEnglish; englishMaxPosition = englishPosition; } } alignment.addAlignment(englishMaxPosition, frenchPosition, true); } return alignment; }
/**
 * Renders the sentence as {@code word_tag} tokens separated by single spaces.
 *
 * A space is placed between consecutive tokens (and not after the last one)
 * so that token boundaries stay visible in the output.
 *
 * @return the space-separated {@code word_tag} sequence; empty for no words
 */
public String toString() {
  StringBuilder sb = new StringBuilder();
  for (int position = 0; position < words.size(); position++) {
    if (position > 0) {
      sb.append(" ");
    }
    sb.append(words.get(position));
    sb.append("_");
    sb.append(tags.get(position));
  }
  return sb.toString();
}
/**
 * Tags every test sentence and writes the result to {@code path}.
 *
 * Each word is written on its own line as {@code word<TAB>tag}, and every
 * sentence is followed by an empty line. The writer is closed in a
 * {@code finally} block so the file handle is released even if tagging or
 * writing throws.
 *
 * @param posTagger     tagger used to produce the guessed tags
 * @param testSentences sentences whose words are tagged
 * @param path          destination file, created or overwritten
 * @throws Exception if the file cannot be opened or written, or tagging fails
 */
private static void labelTestSet(
    POSTagger posTagger, List<TaggedSentence> testSentences, String path) throws Exception {
  BufferedWriter writer = new BufferedWriter(new FileWriter(path));
  try {
    for (TaggedSentence sentence : testSentences) {
      List<String> words = sentence.getWords();
      List<String> guessedTags = posTagger.tag(words);
      for (int i = 0; i < words.size(); i++) {
        writer.write(words.get(i) + "\t" + guessedTags.get(i) + "\n");
      }
      writer.write("\n");
    }
  } finally {
    writer.close();
  }
}
/**
 * Builds one labeled trigram context per position of the sentence, running
 * from position 0 through {@code taggedSentence.size() + 1} inclusive.
 *
 * Words and tags are wrapped in {@code BoundedList}s built with the
 * START/STOP markers, and tag lookups reach two positions to the left.
 * NOTE(review): this presumably relies on BoundedList returning the boundary
 * markers for out-of-range indices - confirm in BoundedList.
 *
 * @param taggedSentence the sentence to decompose
 * @return the contexts in increasing position order
 */
private List<LabeledLocalTrigramContext> extractLabeledLocalTrigramContexts(
    TaggedSentence taggedSentence) {
  List<String> paddedWords =
      new BoundedList<String>(taggedSentence.getWords(), START_WORD, STOP_WORD);
  List<String> paddedTags =
      new BoundedList<String>(taggedSentence.getTags(), START_TAG, STOP_TAG);
  List<LabeledLocalTrigramContext> contexts = new ArrayList<LabeledLocalTrigramContext>();

  final int lastPosition = taggedSentence.size() + 1;
  for (int i = 0; i <= lastPosition; i++) {
    String tagTwoBack = paddedTags.get(i - 2);
    String tagOneBack = paddedTags.get(i - 1);
    String tagHere = paddedTags.get(i);
    contexts.add(
        new LabeledLocalTrigramContext(paddedWords, i, tagTwoBack, tagOneBack, tagHere));
  }
  return contexts;
}
/**
 * Tags every sentence, compares the guesses with the gold tags and prints a
 * one-line summary.
 *
 * The summary reports overall tag accuracy, accuracy on words missing from
 * {@code trainingVocabulary}, and the number of sentences where the gold
 * tagging scores strictly higher than the guessed tagging under
 * {@code posTagger.scoreTagging}.
 *
 * Every position of every sentence is scored, including the final token.
 * If no tags (or no unknown words) are seen, the matching ratio is 0.0 / 0.0
 * and is printed as NaN.
 *
 * @param posTagger          tagger under evaluation
 * @param taggedSentences    gold-standard sentences
 * @param trainingVocabulary words seen in training; any other word is "unknown"
 * @param verbose            when true, prints per-sentence warnings and aligned taggings
 */
private static void evaluateTagger(
    POSTagger posTagger,
    List<TaggedSentence> taggedSentences,
    Set<String> trainingVocabulary,
    boolean verbose) {
  double numTags = 0.0;
  double numTagsCorrect = 0.0;
  double numUnknownWords = 0.0;
  double numUnknownWordsCorrect = 0.0;
  int numDecodingInversions = 0;
  for (TaggedSentence taggedSentence : taggedSentences) {
    List<String> words = taggedSentence.getWords();
    List<String> goldTags = taggedSentence.getTags();
    List<String> guessedTags = posTagger.tag(words);
    // Iterate over the full sentence so the last token is scored as well.
    for (int position = 0; position < words.size(); position++) {
      String word = words.get(position);
      String goldTag = goldTags.get(position);
      String guessedTag = guessedTags.get(position);
      boolean correct = guessedTag.equals(goldTag);
      if (correct) numTagsCorrect += 1.0;
      numTags += 1.0;
      if (!trainingVocabulary.contains(word)) {
        if (correct) numUnknownWordsCorrect += 1.0;
        numUnknownWords += 1.0;
      }
    }
    // The decoder should never return a tagging that the model scores below
    // the gold tagging; count the sentences where it does.
    double scoreOfGoldTagging = posTagger.scoreTagging(taggedSentence);
    double scoreOfGuessedTagging = posTagger.scoreTagging(new TaggedSentence(words, guessedTags));
    if (scoreOfGoldTagging > scoreOfGuessedTagging) {
      numDecodingInversions++;
      if (verbose)
        System.out.println(
            "WARNING: Decoder suboptimality detected. Gold tagging has higher score than guessed tagging.");
    }
    if (verbose) System.out.println(alignedTaggings(words, goldTags, guessedTags, true) + "\n");
  }
  System.out.println(
      "Tag Accuracy: "
          + (numTagsCorrect / numTags)
          + " (Unknown Accuracy: "
          + (numUnknownWordsCorrect / numUnknownWords)
          + ") Decoder Suboptimalities Detected: "
          + numDecodingInversions);
}
/**
 * Converts a state sequence into a tag sequence.
 *
 * For a non-empty input the result starts with the first state's
 * previous-previous tag, followed by the previous tag of every state in
 * order, giving {@code states.size() + 1} tags.
 *
 * @param states the state sequence
 * @return the tags read off the states; empty when {@code states} is empty
 */
public static List<String> toTagList(List<State> states) {
  List<String> tags = new ArrayList<String>();
  if (states.isEmpty()) {
    return tags;
  }
  State first = states.get(0);
  tags.add(first.getPreviousPreviousTag());
  for (int index = 0; index < states.size(); index++) {
    tags.add(states.get(index).getPreviousTag());
  }
  return tags;
}
/**
 * Renders the pair as two lines: the English words, then the French words.
 *
 * Each word is written as {@code index:word} followed by a space, and each of
 * the two lines ends with a newline.
 *
 * @return the two-line, index-annotated rendering of the sentence pair
 */
public String toString() {
  StringBuilder out = new StringBuilder();

  int englishIndex = 0;
  for (String englishWord : englishWords) {
    out.append(englishIndex).append(":").append(englishWord).append(" ");
    englishIndex++;
  }
  out.append("\n");

  int frenchIndex = 0;
  for (String frenchWord : frenchWords) {
    out.append(frenchIndex).append(":").append(frenchWord).append(" ");
    frenchIndex++;
  }
  out.append("\n");

  return out.toString();
}
// pretty-print a pair of taggings for a sentence, possibly suppressing the tags which correctly // match private static String alignedTaggings( List<String> words, List<String> goldTags, List<String> guessedTags, boolean suppressCorrectTags) { StringBuilder goldSB = new StringBuilder("Gold Tags: "); StringBuilder guessedSB = new StringBuilder("Guessed Tags: "); StringBuilder wordSB = new StringBuilder("Words: "); for (int position = 0; position < words.size(); position++) { equalizeLengths(wordSB, goldSB, guessedSB); String word = words.get(position); String gold = goldTags.get(position); String guessed = guessedTags.get(position); wordSB.append(word); if (position < words.size() - 1) wordSB.append(' '); boolean correct = (gold.equals(guessed)); if (correct && suppressCorrectTags) continue; guessedSB.append(guessed); goldSB.append(gold); } return goldSB + "\n" + guessedSB + "\n" + wordSB; }
/**
 * Returns the element of {@code words} at index {@code position}.
 *
 * @return the word at the current position
 */
public String getCurrentWord() {
  final String currentWord = words.get(position);
  return currentWord;
}
private CounterMap<String,String> trainEM(int maxIterations) { Set<String> englishVocab = new HashSet<String>(); Set<String> frenchVocab = new HashSet<String>(); CounterMap<String,String> translations = new CounterMap<String,String>(); englishVocab.add(NULL); int iteration = 0; final double thresholdProb = 0.0001; for (SentencePair sentencePair : trainingSentencePairs) { List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); // add words from list to vocabulary sets englishVocab.addAll(englishWords); frenchVocab.addAll(frenchWords); } System.out.println("Ready"); // We need to initialize translations.getCount(f,e) uniformly // t(f|e) summed over all e in {E + NULL} = 1 final double initialCount = 1.0 / englishVocab.size(); while(iteration < maxIterations) { CounterMap<String,String> counts = new CounterMap<String,String>(); // set count(f|e) to 0 for all e,f Counter<String> totalEnglish = new Counter<String>(); // set total(e) to 0 for all e // E-step: loop over all sentences and update counts for (SentencePair sentencePair : trainingSentencePairs) { List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); Counter<String> sTotalF = new Counter<String>(); // compute normalization constant sTotalF for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); // initialize and compute for English = NULL if (!translations.containsKey(f) && initialize) translations.setCount(f, NULL, initialCount); else if (!translations.containsKey(f)) translations.setCount(f, NULL, thresholdProb); sTotalF.incrementCount(f, translations.getCount(f, NULL)); for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); if 
(!(translations.getCounter(f)).containsKey(e) && initialize) translations.setCount(f, e, initialCount); else if (!(translations.getCounter(f)).containsKey(e)) translations.setCount(f, e, thresholdProb); sTotalF.incrementCount(f, translations.getCount(f, e)); } } // collect counts in counts and totalEnglish for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); // collect counts for English = NULL double count = translations.getCount(f, NULL) / sTotalF.getCount(f); counts.incrementCount(NULL, f, count); totalEnglish.incrementCount(NULL, count); for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); count = translations.getCount(f, e) / sTotalF.getCount(f); counts.incrementCount(e, f, count); totalEnglish.incrementCount(e, count); } } } // end of E-step System.out.println("Completed E-step"); // M-step: update probabilities with counts from E-step and check for convergence iteration++; for (String e : counts.keySet()) {//englishVocab) { double normalizer = totalEnglish.getCount(e); for (String f : (counts.getCounter(e)).keySet()) {//frenchVocab) { // To speed implementation, we want to update translations only when count / normalizer > threshold double prob = counts.getCount(e, f) / normalizer; if (!initialize) { if (prob > thresholdProb) translations.setCount(f, e, prob); else (translations.getCounter(f)).removeKey(e); } else { translations.setCount(f, e, prob); } } } System.out.println("Completed iteration " + iteration); } // end of M-step System.out.println("Trained!"); return translations; }