コード例 #1
0
	  /**
	   * Proposes one sure alignment per French word, choosing the English position
	   * whose word has the highest Dice coefficient with it.
	   *
	   * When no English word scores above zero, the French word keeps its fallback:
	   * the same index in the English sentence, or -1 if that index lies past the
	   * end of the English sentence.
	   *
	   * @param sentencePair the French/English sentence pair to align
	   * @return an alignment holding exactly one entry per French position
	   */
	  public Alignment alignSentencePair(SentencePair sentencePair) {
		  Alignment result = new Alignment();
		  List<String> french = sentencePair.getFrenchWords();
		  List<String> english = sentencePair.getEnglishWords();
		  int englishLength = english.size();

		  int fIdx = 0;
		  for (String frenchWord : french) {
			  // Fallback used when every Dice score is zero: diagonal position, or -1 if out of range.
			  int bestIdx = (fIdx < englishLength) ? fIdx : -1;
			  double bestScore = 0;

			  for (int eIdx = 0; eIdx < englishLength; eIdx++) {
				  double score = getDiceCoefficient(frenchWord, english.get(eIdx));
				  // Strict comparison: on ties the earliest English position wins.
				  if (score > bestScore) {
					  bestScore = score;
					  bestIdx = eIdx;
				  }
			  }

			  result.addAlignment(bestIdx, fIdx, true);
			  fIdx++;
		  }
		  return result;
	  }
コード例 #2
0
 /**
  * Scores an aligner against reference alignments and prints precision, recall and AER.
  *
  * Every (English, French) position pair of every test sentence is checked against
  * the proposed alignment and the reference's sure and possible alignments.
  *
  * @param wordAligner       aligner under evaluation
  * @param testSentencePairs sentence pairs to align
  * @param testAlignments    reference alignments keyed by sentence ID
  * @param verbose           when true, prints a rendered grid for each sentence
  * @throws RuntimeException if a sentence has no reference alignment
  */
 private static void test(WordAligner wordAligner, List<SentencePair> testSentencePairs, Map<Integer, Alignment> testAlignments, boolean verbose) {
   int hitsOnSure = 0;
   int hitsOnPossible = 0;
   int totalSure = 0;
   int totalProposed = 0;

   for (SentencePair pair : testSentencePairs) {
     Alignment guess = wordAligner.alignSentencePair(pair);
     Alignment gold = testAlignments.get(pair.getSentenceID());
     if (gold == null) {
       throw new RuntimeException("No reference alignment found for sentenceID "+pair.getSentenceID());
     }
     if (verbose) {
       System.out.println("Alignment:\n"+Alignment.render(gold,guess,pair));
     }

     int frenchLength = pair.getFrenchWords().size();
     int englishLength = pair.getEnglishWords().size();
     for (int f = 0; f < frenchLength; f++) {
       for (int e = 0; e < englishLength; e++) {
         boolean isProposed = guess.containsSureAlignment(e, f);
         boolean isSure = gold.containsSureAlignment(e, f);
         boolean isPossible = gold.containsPossibleAlignment(e, f);

         if (isSure) {
           totalSure++;
         }
         if (isProposed) {
           totalProposed++;
           if (isSure) {
             hitsOnSure++;
           }
           if (isPossible) {
             hitsOnPossible++;
           }
         }
       }
     }
   }

   // Precision is measured against possible links, recall against sure links.
   double precision = hitsOnPossible/(double)totalProposed;
   double recall = hitsOnSure/(double)totalSure;
   double aer = 1.0-(hitsOnSure+hitsOnPossible)/(double)(totalSure+totalProposed);
   System.out.println("Precision: "+precision);
   System.out.println("Recall: "+recall);
   System.out.println("AER: "+aer);
 }
コード例 #3
0
	  /**
	   * Proposes one sure alignment per French word using the learned translation scores.
	   *
	   * Model 1 assumes all alignments are equally likely, so each French word is simply
	   * paired with the English position whose word maximises the translation score.
	   * The score against NULL is the starting point to beat; if no English word exceeds
	   * it, position -1 is proposed.
	   *
	   * @param sentencePair the French/English sentence pair to align
	   * @return an alignment holding exactly one entry per French position
	   */
	  public Alignment alignSentencePair(SentencePair sentencePair) {
		  Alignment result = new Alignment();
		  List<String> french = sentencePair.getFrenchWords();
		  List<String> english = sentencePair.getEnglishWords();
		  int englishLength = english.size();

		  int fIdx = 0;
		  for (String frenchWord : french) {
			  int bestIdx = -1;
			  double bestProb = translationProbs.getCount(frenchWord, NULL);

			  for (int eIdx = 0; eIdx < englishLength; eIdx++) {
				  double prob = translationProbs.getCount(frenchWord, english.get(eIdx));
				  // Strict comparison: ties keep the earlier candidate (NULL first).
				  if (prob > bestProb) {
					  bestProb = prob;
					  bestIdx = eIdx;
				  }
			  }

			  result.addAlignment(bestIdx, fIdx, true);
			  fIdx++;
		  }
		  return result;
	  }
コード例 #4
0
	  /**
	   * Proposes one sure alignment per French word, choosing the English position that
	   * maximises collocationCount(f, e) / count(e).
	   *
	   * When no English word yields a ratio above zero, the French word keeps its
	   * fallback: the same index in the English sentence, or -1 if that index lies past
	   * the end of the English sentence.
	   *
	   * @param sentencePair the French/English sentence pair to align
	   * @return an alignment holding exactly one entry per French position
	   */
	  public Alignment alignSentencePair(SentencePair sentencePair) {
		  Alignment result = new Alignment();
		  List<String> french = sentencePair.getFrenchWords();
		  List<String> english = sentencePair.getEnglishWords();
		  int frenchLength = french.size();
		  int englishLength = english.size();

		  for (int fIdx = 0; fIdx < frenchLength; fIdx++) {
			  String frenchWord = french.get(fIdx);

			  // Fallback used when every ratio is zero: diagonal position, or -1 if out of range.
			  int bestIdx = -1;
			  if (fIdx < englishLength) {
				  bestIdx = fIdx;
			  }

			  double bestRatio = 0;
			  int eIdx = 0;
			  while (eIdx < englishLength) {
				  String englishWord = english.get(eIdx);
				  double jointCount = collocationCounts.getCount(frenchWord, englishWord);
				  double englishCount = eCounts.getCount(englishWord);
				  double ratio = jointCount / englishCount;
				  if (ratio > bestRatio) {
					  bestRatio = ratio;
					  bestIdx = eIdx;
				  }
				  eIdx++;
			  }

			  result.addAlignment(bestIdx, fIdx, true);
		  }
		  return result;
	  }
コード例 #5
0
 /**
  * Baseline aligner: pairs each French position with the same English position.
  *
  * French positions beyond the end of the English sentence are paired with -1.
  *
  * @param sentencePair the French/English sentence pair to align
  * @return an alignment holding exactly one sure entry per French position
  */
 public Alignment alignSentencePair(SentencePair sentencePair) {
   Alignment diagonal = new Alignment();
   int frenchLength = sentencePair.getFrenchWords().size();
   int englishLength = sentencePair.getEnglishWords().size();

   int fIdx = 0;
   while (fIdx < frenchLength) {
     int eIdx = (fIdx < englishLength) ? fIdx : -1;
     diagonal.addAlignment(eIdx, fIdx, true);
     fIdx++;
   }
   return diagonal;
 }
コード例 #6
0
  /**
   * Aligns every test sentence pair and writes the proposed alignments to a file.
   *
   * One line is written per sentence pair. Each proposed sure alignment appears as
   * {@code frenchPosition-englishPosition} followed by a space, ordered by French
   * position and then by English position.
   *
   * The writer is closed in a {@code finally} block so the file handle is released
   * even when alignment or a write fails part-way through.
   *
   * @param wordAligner       aligner used to produce the alignments
   * @param testSentencePairs sentence pairs to align
   * @param path              destination file; created or overwritten
   * @throws IOException if the file cannot be opened, written, or closed
   */
  private static void predict(WordAligner wordAligner, List<SentencePair> testSentencePairs, String path) throws IOException {
    BufferedWriter writer = new BufferedWriter(new FileWriter(path));
    try {
      for (SentencePair sentencePair : testSentencePairs) {
        Alignment proposedAlignment = wordAligner.alignSentencePair(sentencePair);
        int numFrenchWords = sentencePair.getFrenchWords().size();
        int numEnglishWords = sentencePair.getEnglishWords().size();
        for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
          for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
            if (proposedAlignment.containsSureAlignment(englishPosition, frenchPosition)) {
              writer.write(frenchPosition + "-" + englishPosition + " ");
            }
          }
        }
        writer.write("\n");
      }
    } finally {
      // Previously skipped whenever an exception escaped the loop, leaking the file handle.
      writer.close();
    }
  }
コード例 #7
0
	  /**
	   * Collects token-level counts from the training sentence pairs.
	   *
	   * For each pair, every English token increments eCounts, and every
	   * (French token, English token) combination within the pair increments
	   * collocationCounts. French marginal counts are deliberately not collected
	   * because they do not affect the argmax taken at alignment time.
	   */
	  private void trainCounters() {
		  for (SentencePair pair : trainingSentencePairs) {
			  List<String> french = pair.getFrenchWords();
			  List<String> english = pair.getEnglishWords();

			  eCounts.incrementAll(english, 1.0);

			  for (String frenchWord : french) {
				  for (String englishWord : english) {
					  collocationCounts.incrementCount(frenchWord, englishWord, 1.0);
				  }
			  }
		  }
		  System.out.println("Trained!");
	  }
コード例 #8
0
	  /**
	   * Collects sentence-level counts from the training sentence pairs.
	   *
	   * Words are de-duplicated within each sentence first, so each counter records the
	   * number of sentence pairs containing a word (or a French/English word pair)
	   * rather than the number of tokens.
	   */
	  private void trainCounters() {
		  for (SentencePair pair : trainingSentencePairs) {
			  // Sets collapse repeated words so each sentence contributes at most once per word.
			  Set<String> distinctFrench = new HashSet<String>(pair.getFrenchWords());
			  Set<String> distinctEnglish = new HashSet<String>(pair.getEnglishWords());

			  fCountSentences.incrementAll(distinctFrench, 1.0);
			  eCountSentences.incrementAll(distinctEnglish, 1.0);

			  for (String frenchWord : distinctFrench) {
				  for (String englishWord : distinctEnglish) {
					  collocationCountSentences.incrementCount(frenchWord, englishWord, 1.0);
				  }
			  }
		  }
		  System.out.println("Trained!");
	  }
コード例 #9
0
 /**
  * Draws a text grid comparing a proposed alignment with a reference alignment.
  *
  * Each row is a French word and each three-character column an English word.
  * A cell is wrapped in {@code [ ]} for a sure reference link, {@code ( )} for a
  * possible reference link, and spaces otherwise; its middle character is
  * {@code #} when the proposed alignment contains that link. The French word is
  * printed at the end of its row, and the English words are printed vertically,
  * one character per line, beneath a dashed rule.
  *
  * @param reference    reference alignment supplying sure/possible links
  * @param proposed     alignment being visualised
  * @param sentencePair sentence pair supplying the words
  * @return the rendered multi-line grid
  */
 public static String render(Alignment reference, Alignment proposed, SentencePair sentencePair) {
   int frenchCount = sentencePair.getFrenchWords().size();
   int englishCount = sentencePair.getEnglishWords().size();
   StringBuilder out = new StringBuilder();

   // Grid body: one row per French word.
   for (int f = 0; f < frenchCount; f++) {
     for (int e = 0; e < englishCount; e++) {
       boolean isSure = reference.containsSureAlignment(e, f);
       boolean isPossible = reference.containsPossibleAlignment(e, f);
       char mark = proposed.containsSureAlignment(e, f) ? '#' : ' ';

       char open;
       char close;
       if (isSure) {
         open = '[';
         close = ']';
       } else if (isPossible) {
         open = '(';
         close = ')';
       } else {
         open = ' ';
         close = ' ';
       }
       out.append(open).append(mark).append(close);
     }
     out.append("| ").append(sentencePair.getFrenchWords().get(f)).append('\n');
   }

   // Rule separating the grid from the vertical English labels.
   for (int e = 0; e < englishCount; e++) {
     out.append("---");
   }
   out.append("'\n");

   // English words written top-to-bottom; stop at the first row with no characters left.
   for (int row = 0; ; row++) {
     boolean wroteChar = false;
     StringBuilder line = new StringBuilder();
     for (int e = 0; e < englishCount; e++) {
       String word = sentencePair.getEnglishWords().get(e);
       if (row < word.length()) {
         wroteChar = true;
         line.append(' ').append(word.charAt(row)).append(' ');
       } else {
         line.append("   ");
       }
     }
     if (!wroteChar) {
       break;
     }
     out.append(line);
     out.append('\n');
   }
   return out.toString();
 }
コード例 #10
0
	  /**
	   * Estimates translation scores t(f|e) with EM over the training sentence pairs.
	   *
	   * The loop runs for exactly maxIterations iterations; no convergence test is
	   * performed. Entries are seeded lazily the first time a (French, English) or
	   * (French, NULL) pair is seen in the E-step: with the uniform value
	   * 1 / |English vocabulary + NULL| when the `initialize` flag is true, and with
	   * thresholdProb otherwise. When `initialize` is false, the M-step also removes
	   * entries whose new probability does not exceed thresholdProb.
	   *
	   * NOTE(review): `initialize` is declared outside this method (presumably a field);
	   * confirm where it is set.
	   *
	   * @param maxIterations number of EM iterations to run
	   * @return the translation table, keyed first by French word and then by English word
	   */
	  private CounterMap<String,String> trainEM(int maxIterations) {
		  Set<String> englishVocab = new HashSet<String>();
		  // NOTE: frenchVocab is populated below but not read again within this method.
		  Set<String> frenchVocab = new HashSet<String>();
		  
		  // translations is indexed (french, english); counts below is indexed (english, french).
		  CounterMap<String,String> translations = new CounterMap<String,String>();
		  englishVocab.add(NULL);
		  int iteration = 0;
		  final double thresholdProb = 0.0001;
		  
		  for (SentencePair sentencePair : trainingSentencePairs) {
			  List<String> frenchWords = sentencePair.getFrenchWords();
			  List<String> englishWords = sentencePair.getEnglishWords();
			  // add words from list to vocabulary sets
			  englishVocab.addAll(englishWords);
			  frenchVocab.addAll(frenchWords);
		  }
		  System.out.println("Ready");
		  
		  // We need to initialize translations.getCount(f,e) uniformly
		  // t(f|e) summed over all e in {E + NULL} = 1
		  final double initialCount = 1.0 / englishVocab.size();
		  
		  while(iteration < maxIterations) {
			  CounterMap<String,String> counts = new CounterMap<String,String>(); // set count(f|e) to 0 for all e,f
			  Counter<String> totalEnglish = new Counter<String>(); // set total(e) to 0 for all e
			  
			  // E-step: loop over all sentences and update counts
			  for (SentencePair sentencePair : trainingSentencePairs) {
				  List<String> frenchWords = sentencePair.getFrenchWords();
				  List<String> englishWords = sentencePair.getEnglishWords();
				  
			      int numFrenchWords = frenchWords.size();
			      int numEnglishWords = englishWords.size();
			      // Per-sentence normalizer, keyed by French word string.
			      // NOTE(review): because it is keyed by word rather than position, a French word
			      // repeated within one sentence accumulates into a single shared total - confirm intended.
			      Counter<String> sTotalF = new Counter<String>(); 
			      
			      // compute normalization constant sTotalF
			      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
			    	  String f = frenchWords.get(frenchPosition);
			    	  // initialize and compute for English = NULL
			    	  // (seeding happens only when f has no entry at all in translations)
			    	  if (!translations.containsKey(f) && initialize)
			    		  translations.setCount(f, NULL, initialCount);
			    	  else if (!translations.containsKey(f))
			    		  translations.setCount(f, NULL, thresholdProb);
			    	  sTotalF.incrementCount(f, translations.getCount(f, NULL)); 
			    	  for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
			    		  String e = englishWords.get(englishPosition);
			    		  // Seed a missing (f, e) entry: uniform value when initialize is set, else the threshold floor.
			    		  if (!(translations.getCounter(f)).containsKey(e) && initialize)
			    			  translations.setCount(f, e, initialCount);
			    		  else if (!(translations.getCounter(f)).containsKey(e))
			    			  translations.setCount(f, e, thresholdProb);
			    		  sTotalF.incrementCount(f, translations.getCount(f, e));
			    	  }
			      }
			      
			      // collect counts in counts and totalEnglish
			      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
			    	  String f = frenchWords.get(frenchPosition);
			    	  
			    	  // collect counts for English = NULL
			    	  double count = translations.getCount(f, NULL) / sTotalF.getCount(f);
			    	  counts.incrementCount(NULL, f, count);
			    	  totalEnglish.incrementCount(NULL, count);
			    	  for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
			    		  String e = englishWords.get(englishPosition);
			    		  // Fractional count: this pair's share of the normalizer for f.
			    		  count = translations.getCount(f, e) / sTotalF.getCount(f);
			    		  counts.incrementCount(e, f, count);
			    		  totalEnglish.incrementCount(e, count);
			    	  }
			      }
			  } // end of E-step
			  System.out.println("Completed E-step");
			  
			  // M-step: update probabilities with counts from E-step
			  // (no convergence check is performed; the loop is bounded only by maxIterations)
			  iteration++;
			  // Only pairs that received counts in this E-step are visited, not the full vocabularies.
			  for (String e : counts.keySet()) {//englishVocab) {
				  double normalizer = totalEnglish.getCount(e);
				  for (String f : (counts.getCounter(e)).keySet()) {//frenchVocab) {
					  
					  // To speed implementation, we want to update translations only when count / normalizer > threshold
					  double prob = counts.getCount(e, f) / normalizer;
					  if (!initialize) {					  
						  if (prob > thresholdProb)
							  translations.setCount(f, e, prob);
						  else
							  // Drop negligible entries; the next E-step re-seeds them at thresholdProb if the pair recurs.
							  (translations.getCounter(f)).removeKey(e);
					  }
					  else {
						  translations.setCount(f, e, prob);
					  }
				  }
			  }
			  System.out.println("Completed iteration " + iteration);
		  } // end of EM iteration loop
		  
		  System.out.println("Trained!");
		  return translations;
	  }