public Alignment alignSentencePair(SentencePair sentencePair) {
		  Alignment alignment = new Alignment();
		  List<String> frenchWords = sentencePair.getFrenchWords();
	      List<String> englishWords = sentencePair.getEnglishWords();     
	      int numFrenchWords = frenchWords.size();
	      int numEnglishWords = englishWords.size();
	      
		  // Model 1 assumes all alignments are equally likely
	      // So we can just take the argMax of t(f|e) to get the englishMaxPosition
	      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
	    	  String f = frenchWords.get(frenchPosition);
	    	  int englishMaxPosition = -1;
	    	  double maxTranslationProb = translationProbs.getCount(f, NULL);
	    	  for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
	    		  String e = englishWords.get(englishPosition);
	    		  double translationProb = translationProbs.getCount(f, e);
	    		  if (translationProb > maxTranslationProb) {
	    			  maxTranslationProb = translationProb;
	    			  englishMaxPosition = englishPosition;
	    		  }
	    	  }
	    	  alignment.addAlignment(englishMaxPosition, frenchPosition, true);
	      }
		  return alignment;
	  }
Beispiel #2
0
 /**
  * Builds a Trellis over a sentence, by starting at the state State, and advancing through all
  * legal extensions of each state already in the trellis. You should not have to modify this
  * code (or even read it, really).
  */
 private Trellis<State> buildTrellis(List<String> sentence) {
   Trellis<State> trellis = new Trellis<State>();
   trellis.setStartState(State.getStartState());
   State stopState = State.getStopState(sentence.size() + 2);
   trellis.setStopState(stopState);
   Set<State> states = Collections.singleton(State.getStartState());
   for (int position = 0; position <= sentence.size() + 1; position++) {
     Set<State> nextStates = new HashSet<State>();
     for (State state : states) {
       if (state.equals(stopState)) continue;
       LocalTrigramContext localTrigramContext =
           new LocalTrigramContext(
               sentence, position, state.getPreviousPreviousTag(), state.getPreviousTag());
       Counter<String> tagScores = localTrigramScorer.getLogScoreCounter(localTrigramContext);
       for (String tag : tagScores.keySet()) {
         double score = tagScores.getCount(tag);
         State nextState = state.getNextState(tag);
         trellis.setTransitionCount(state, nextState, score);
         nextStates.add(nextState);
       }
     }
     //        System.out.println("States: "+nextStates);
     states = nextStates;
   }
   return trellis;
 }
	  public Alignment alignSentencePair(SentencePair sentencePair) {
		  Alignment alignment = new Alignment();
	      List<String> frenchWords = sentencePair.getFrenchWords();
	      List<String> englishWords = sentencePair.getEnglishWords();
	      int numFrenchWords = frenchWords.size();
	      int numEnglishWords = englishWords.size();
	      
	      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
	    	  String f = frenchWords.get(frenchPosition);
	    	  int englishMaxPosition = frenchPosition;
	    	  if (englishMaxPosition >= numEnglishWords)
	    		  englishMaxPosition = -1; // map French word to BASELINE if c(f,e) = 0 for all English words
	    	  double maxDice = 0;
	    	  for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
	    		  String e = englishWords.get(englishPosition);
	    		  double dice = getDiceCoefficient(f,e);
	    		  if (dice > maxDice) {
	    			  maxDice = dice;
	    			  englishMaxPosition = englishPosition;
	    		  }
	    	  }	
	    	  alignment.addAlignment(englishMaxPosition, frenchPosition, true);
	      }
		  return alignment;
	  }
	  public Alignment alignSentencePair(SentencePair sentencePair) {
		  Alignment alignment = new Alignment();
	      List<String> frenchWords = sentencePair.getFrenchWords();
	      List<String> englishWords = sentencePair.getEnglishWords();     
	      int numFrenchWords = frenchWords.size();
	      int numEnglishWords = englishWords.size();
	      
	      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
	    	  String f = frenchWords.get(frenchPosition);
	    	  int englishMaxPosition = frenchPosition;
	    	  if (englishMaxPosition >= numEnglishWords)
	    		  englishMaxPosition = -1; // map French word to BASELINE if c(f,e) = 0 for all English words
	    	  double maxConditionalProb = 0;
	    	  for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
	    		  String e = englishWords.get(englishPosition);
	    		  double conditionalGivenEnglish = collocationCounts.getCount(f, e) / (eCounts.getCount(e));
	    		  if (conditionalGivenEnglish > maxConditionalProb) {
	    			  maxConditionalProb = conditionalGivenEnglish;
	    			  englishMaxPosition = englishPosition;
	    		  }
	    	  }	
	    	  alignment.addAlignment(englishMaxPosition, frenchPosition, true);
	      }
		  return alignment;
	  }
Beispiel #5
0
 public static List<String> toTagList(List<State> states) {
   List<String> tags = new ArrayList<String>();
   if (states.size() > 0) {
     tags.add(states.get(0).getPreviousPreviousTag());
     for (State state : states) {
       tags.add(state.getPreviousTag());
     }
   }
   return tags;
 }
 private static List<SentencePair> readSentencePairs(String path, int maxSentencePairs) {
   List<SentencePair> sentencePairs = new ArrayList<SentencePair>();
   List<String> baseFileNames = getBaseFileNames(path);
   for (String baseFileName : baseFileNames) {
     if (sentencePairs.size() >= maxSentencePairs)
       continue;
     sentencePairs.addAll(readSentencePairs(baseFileName));
   }
   return sentencePairs;
 }
Beispiel #7
0
 public String toString() {
   StringBuilder sb = new StringBuilder();
   for (int position = 0; position < words.size(); position++) {
     String word = words.get(position);
     String tag = tags.get(position);
     sb.append(word);
     sb.append("_");
     sb.append(tag);
   }
   return sb.toString();
 }
 public String toString() {
   StringBuilder sb = new StringBuilder();
   for (int englishPosition = 0; englishPosition < englishWords.size(); englishPosition++) {
     String englishWord = englishWords.get(englishPosition);
     sb.append(englishPosition);
     sb.append(":");
     sb.append(englishWord);
     sb.append(" ");
   }
   sb.append("\n");
   for (int frenchPosition = 0; frenchPosition < frenchWords.size(); frenchPosition++) {
     String frenchWord = frenchWords.get(frenchPosition);
     sb.append(frenchPosition);
     sb.append(":");
     sb.append(frenchWord);
     sb.append(" ");
   }
   sb.append("\n");
   return sb.toString();
 }
Beispiel #9
0
 // pretty-print a pair of taggings for a sentence, possibly suppressing the tags which correctly
 // match
 private static String alignedTaggings(
     List<String> words,
     List<String> goldTags,
     List<String> guessedTags,
     boolean suppressCorrectTags) {
   StringBuilder goldSB = new StringBuilder("Gold Tags: ");
   StringBuilder guessedSB = new StringBuilder("Guessed Tags: ");
   StringBuilder wordSB = new StringBuilder("Words: ");
   for (int position = 0; position < words.size(); position++) {
     equalizeLengths(wordSB, goldSB, guessedSB);
     String word = words.get(position);
     String gold = goldTags.get(position);
     String guessed = guessedTags.get(position);
     wordSB.append(word);
     if (position < words.size() - 1) wordSB.append(' ');
     boolean correct = (gold.equals(guessed));
     if (correct && suppressCorrectTags) continue;
     guessedSB.append(guessed);
     goldSB.append(gold);
   }
   return goldSB + "\n" + guessedSB + "\n" + wordSB;
 }
Beispiel #10
0
 private static void labelTestSet(
     POSTagger posTagger, List<TaggedSentence> testSentences, String path) throws Exception {
   BufferedWriter writer = new BufferedWriter(new FileWriter(path));
   for (TaggedSentence sentence : testSentences) {
     List<String> words = sentence.getWords();
     List<String> guessedTags = posTagger.tag(words);
     for (int i = 0; i < words.size(); i++) {
       writer.write(words.get(i) + "\t" + guessedTags.get(i) + "\n");
     }
     writer.write("\n");
   }
   writer.close();
 }
Beispiel #11
0
 private static void evaluateTagger(
     POSTagger posTagger,
     List<TaggedSentence> taggedSentences,
     Set<String> trainingVocabulary,
     boolean verbose) {
   double numTags = 0.0;
   double numTagsCorrect = 0.0;
   double numUnknownWords = 0.0;
   double numUnknownWordsCorrect = 0.0;
   int numDecodingInversions = 0;
   for (TaggedSentence taggedSentence : taggedSentences) {
     List<String> words = taggedSentence.getWords();
     List<String> goldTags = taggedSentence.getTags();
     List<String> guessedTags = posTagger.tag(words);
     for (int position = 0; position < words.size() - 1; position++) {
       String word = words.get(position);
       String goldTag = goldTags.get(position);
       String guessedTag = guessedTags.get(position);
       if (guessedTag.equals(goldTag)) numTagsCorrect += 1.0;
       numTags += 1.0;
       if (!trainingVocabulary.contains(word)) {
         if (guessedTag.equals(goldTag)) numUnknownWordsCorrect += 1.0;
         numUnknownWords += 1.0;
       }
     }
     double scoreOfGoldTagging = posTagger.scoreTagging(taggedSentence);
     double scoreOfGuessedTagging = posTagger.scoreTagging(new TaggedSentence(words, guessedTags));
     if (scoreOfGoldTagging > scoreOfGuessedTagging) {
       numDecodingInversions++;
       if (verbose)
         System.out.println(
             "WARNING: Decoder suboptimality detected.  Gold tagging has higher score than guessed tagging.");
     }
     if (verbose) System.out.println(alignedTaggings(words, goldTags, guessedTags, true) + "\n");
   }
   System.out.println(
       "Tag Accuracy: "
           + (numTagsCorrect / numTags)
           + " (Unknown Accuracy: "
           + (numUnknownWordsCorrect / numUnknownWords)
           + ")  Decoder Suboptimalities Detected: "
           + numDecodingInversions);
 }
Beispiel #12
0
 private static List<TaggedSentence> readTaggedSentences(String path, boolean hasTags)
     throws Exception {
   List<TaggedSentence> taggedSentences = new ArrayList<TaggedSentence>();
   BufferedReader reader = new BufferedReader(new FileReader(path));
   String line = "";
   List<String> words = new LinkedList<String>();
   List<String> tags = new LinkedList<String>();
   while ((line = reader.readLine()) != null) {
     if (line.equals("")) {
       taggedSentences.add(
           new TaggedSentence(
               new BoundedList<String>(words, START_WORD, STOP_WORD),
               new BoundedList<String>(tags, START_WORD, STOP_WORD)));
       words = new LinkedList<String>();
       tags = new LinkedList<String>();
     } else {
       String[] fields = line.split("\\s+");
       words.add(fields[0]);
       tags.add(hasTags ? fields[1] : "");
     }
   }
   System.out.println("Read " + taggedSentences.size() + " sentences.");
   return taggedSentences;
 }
Beispiel #13
0
 private List<String> stripBoundaryTags(List<String> tags) {
   return tags.subList(2, tags.size() - 2);
 }
Beispiel #14
0
 public int size() {
   return words.size();
 }
	  private CounterMap<String,String> trainEM(int maxIterations) {
		  Set<String> englishVocab = new HashSet<String>();
		  Set<String> frenchVocab = new HashSet<String>();
		  
		  CounterMap<String,String> translations = new CounterMap<String,String>();
		  englishVocab.add(NULL);
		  int iteration = 0;
		  final double thresholdProb = 0.0001;
		  
		  for (SentencePair sentencePair : trainingSentencePairs) {
			  List<String> frenchWords = sentencePair.getFrenchWords();
			  List<String> englishWords = sentencePair.getEnglishWords();
			  // add words from list to vocabulary sets
			  englishVocab.addAll(englishWords);
			  frenchVocab.addAll(frenchWords);
		  }
		  System.out.println("Ready");
		  
		  // We need to initialize translations.getCount(f,e) uniformly
		  // t(f|e) summed over all e in {E + NULL} = 1
		  final double initialCount = 1.0 / englishVocab.size();
		  
		  while(iteration < maxIterations) {
			  CounterMap<String,String> counts = new CounterMap<String,String>(); // set count(f|e) to 0 for all e,f
			  Counter<String> totalEnglish = new Counter<String>(); // set total(e) to 0 for all e
			  
			  // E-step: loop over all sentences and update counts
			  for (SentencePair sentencePair : trainingSentencePairs) {
				  List<String> frenchWords = sentencePair.getFrenchWords();
				  List<String> englishWords = sentencePair.getEnglishWords();
				  
			      int numFrenchWords = frenchWords.size();
			      int numEnglishWords = englishWords.size();
			      Counter<String> sTotalF = new Counter<String>(); 
			      
			      // compute normalization constant sTotalF
			      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
			    	  String f = frenchWords.get(frenchPosition);
			    	  // initialize and compute for English = NULL
			    	  if (!translations.containsKey(f) && initialize)
			    		  translations.setCount(f, NULL, initialCount);
			    	  else if (!translations.containsKey(f))
			    		  translations.setCount(f, NULL, thresholdProb);
			    	  sTotalF.incrementCount(f, translations.getCount(f, NULL)); 
			    	  for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
			    		  String e = englishWords.get(englishPosition);
			    		  if (!(translations.getCounter(f)).containsKey(e) && initialize)
			    			  translations.setCount(f, e, initialCount);
			    		  else if (!(translations.getCounter(f)).containsKey(e))
			    			  translations.setCount(f, e, thresholdProb);
			    		  sTotalF.incrementCount(f, translations.getCount(f, e));
			    	  }
			      }
			      
			      // collect counts in counts and totalEnglish
			      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
			    	  String f = frenchWords.get(frenchPosition);
			    	  
			    	  // collect counts for English = NULL
			    	  double count = translations.getCount(f, NULL) / sTotalF.getCount(f);
			    	  counts.incrementCount(NULL, f, count);
			    	  totalEnglish.incrementCount(NULL, count);
			    	  for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
			    		  String e = englishWords.get(englishPosition);
			    		  count = translations.getCount(f, e) / sTotalF.getCount(f);
			    		  counts.incrementCount(e, f, count);
			    		  totalEnglish.incrementCount(e, count);
			    	  }
			      }
			  } // end of E-step
			  System.out.println("Completed E-step");
			  
			  // M-step: update probabilities with counts from E-step and check for convergence
			  iteration++;
			  for (String e : counts.keySet()) {//englishVocab) {
				  double normalizer = totalEnglish.getCount(e);
				  for (String f : (counts.getCounter(e)).keySet()) {//frenchVocab) {
					  
					  // To speed implementation, we want to update translations only when count / normalizer > threshold
					  double prob = counts.getCount(e, f) / normalizer;
					  if (!initialize) {					  
						  if (prob > thresholdProb)
							  translations.setCount(f, e, prob);
						  else
							  (translations.getCounter(f)).removeKey(e);
					  }
					  else {
						  translations.setCount(f, e, prob);
					  }
				  }
			  }
			  System.out.println("Completed iteration " + iteration);
		  } // end of M-step
		  
		  System.out.println("Trained!");
		  return translations;
	  }