private static Map<Integer, Alignment> readAlignments(String fileName) {
   Map<Integer,Alignment> alignments = new HashMap<Integer, Alignment>();
   try {
     BufferedReader in = new BufferedReader(new FileReader(fileName));
     while (in.ready()) {
       String line = in.readLine();
       String[] words = line.split("\\s+");
       if (words.length != 4)
         throw new RuntimeException("Bad alignment file "+fileName+", bad line was "+line);
       Integer sentenceID = Integer.parseInt(words[0]);
       Integer englishPosition = Integer.parseInt(words[1])-1;
       Integer frenchPosition = Integer.parseInt(words[2])-1;
       String type = words[3];
       Alignment alignment = alignments.get(sentenceID);
       if (alignment == null) {
         alignment = new Alignment();
         alignments.put(sentenceID, alignment);
       }
       alignment.addAlignment(englishPosition, frenchPosition, type.equals("S"));
     }
   } catch (IOException e) {
     throw new RuntimeException(e);
   }
   return alignments;
 }
Пример #2
0
  static Alignment getFeatureContaining(List<Alignment> features, int right) {

    int leftBounds = 0;
    int rightBounds = features.size() - 1;
    int idx = features.size() / 2;
    int lastIdx = -1;

    while (idx != lastIdx) {
      lastIdx = idx;
      Alignment f = features.get(idx);
      if (f.contains(right)) {
        return f;
      }

      if (f.getStart() > right) {
        rightBounds = idx;
        idx = (leftBounds + idx) / 2;
      } else {
        leftBounds = idx;
        idx = (rightBounds + idx) / 2;
      }
    }
    // Check the extremes
    if (features.get(0).contains(right)) {
      return features.get(0);
    }

    if (features.get(rightBounds).contains(right)) {
      return features.get(rightBounds);
    }

    return null;
  }
	  public Alignment alignSentencePair(SentencePair sentencePair) {
		  Alignment alignment = new Alignment();
		  List<String> frenchWords = sentencePair.getFrenchWords();
	      List<String> englishWords = sentencePair.getEnglishWords();     
	      int numFrenchWords = frenchWords.size();
	      int numEnglishWords = englishWords.size();
	      
		  // Model 1 assumes all alignments are equally likely
	      // So we can just take the argMax of t(f|e) to get the englishMaxPosition
	      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
	    	  String f = frenchWords.get(frenchPosition);
	    	  int englishMaxPosition = -1;
	    	  double maxTranslationProb = translationProbs.getCount(f, NULL);
	    	  for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
	    		  String e = englishWords.get(englishPosition);
	    		  double translationProb = translationProbs.getCount(f, e);
	    		  if (translationProb > maxTranslationProb) {
	    			  maxTranslationProb = translationProb;
	    			  englishMaxPosition = englishPosition;
	    		  }
	    	  }
	    	  alignment.addAlignment(englishMaxPosition, frenchPosition, true);
	      }
		  return alignment;
	  }
	  public Alignment alignSentencePair(SentencePair sentencePair) {
		  Alignment alignment = new Alignment();
	      List<String> frenchWords = sentencePair.getFrenchWords();
	      List<String> englishWords = sentencePair.getEnglishWords();
	      int numFrenchWords = frenchWords.size();
	      int numEnglishWords = englishWords.size();
	      
	      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
	    	  String f = frenchWords.get(frenchPosition);
	    	  int englishMaxPosition = frenchPosition;
	    	  if (englishMaxPosition >= numEnglishWords)
	    		  englishMaxPosition = -1; // map French word to BASELINE if c(f,e) = 0 for all English words
	    	  double maxDice = 0;
	    	  for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
	    		  String e = englishWords.get(englishPosition);
	    		  double dice = getDiceCoefficient(f,e);
	    		  if (dice > maxDice) {
	    			  maxDice = dice;
	    			  englishMaxPosition = englishPosition;
	    		  }
	    	  }	
	    	  alignment.addAlignment(englishMaxPosition, frenchPosition, true);
	      }
		  return alignment;
	  }
	  public Alignment alignSentencePair(SentencePair sentencePair) {
		  Alignment alignment = new Alignment();
	      List<String> frenchWords = sentencePair.getFrenchWords();
	      List<String> englishWords = sentencePair.getEnglishWords();     
	      int numFrenchWords = frenchWords.size();
	      int numEnglishWords = englishWords.size();
	      
	      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
	    	  String f = frenchWords.get(frenchPosition);
	    	  int englishMaxPosition = frenchPosition;
	    	  if (englishMaxPosition >= numEnglishWords)
	    		  englishMaxPosition = -1; // map French word to BASELINE if c(f,e) = 0 for all English words
	    	  double maxConditionalProb = 0;
	    	  for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
	    		  String e = englishWords.get(englishPosition);
	    		  double conditionalGivenEnglish = collocationCounts.getCount(f, e) / (eCounts.getCount(e));
	    		  if (conditionalGivenEnglish > maxConditionalProb) {
	    			  maxConditionalProb = conditionalGivenEnglish;
	    			  englishMaxPosition = englishPosition;
	    		  }
	    	  }	
	    	  alignment.addAlignment(englishMaxPosition, frenchPosition, true);
	      }
		  return alignment;
	  }
 public static String render(Alignment reference, Alignment proposed, SentencePair sentencePair) {
   StringBuilder sb = new StringBuilder();
   for (int frenchPosition = 0; frenchPosition < sentencePair.getFrenchWords().size(); frenchPosition++) {
     for (int englishPosition = 0; englishPosition < sentencePair.getEnglishWords().size(); englishPosition++) {
       boolean sure = reference.containsSureAlignment(englishPosition, frenchPosition);
       boolean possible = reference.containsPossibleAlignment(englishPosition, frenchPosition);
       char proposedChar = ' ';
       if (proposed.containsSureAlignment(englishPosition, frenchPosition))
         proposedChar = '#';
       if (sure) {
         sb.append('[');
         sb.append(proposedChar);
         sb.append(']');
       } else {
         if (possible) {
           sb.append('(');
           sb.append(proposedChar);
           sb.append(')');
         } else {
           sb.append(' ');
           sb.append(proposedChar);
           sb.append(' ');
         }
       }
     }
     sb.append("| ");
     sb.append(sentencePair.getFrenchWords().get(frenchPosition));
     sb.append('\n');
   }
   for (int englishPosition = 0; englishPosition < sentencePair.getEnglishWords().size(); englishPosition++) {
     sb.append("---");
   }
   sb.append("'\n");
   boolean printed = true;
   int index = 0;
   while (printed) {
     printed = false;
     StringBuilder lineSB = new StringBuilder();
     for (int englishPosition = 0; englishPosition < sentencePair.getEnglishWords().size(); englishPosition++) {
       String englishWord = sentencePair.getEnglishWords().get(englishPosition);
       if (englishWord.length() > index) {
         printed = true;
         lineSB.append(' ');
         lineSB.append(englishWord.charAt(index));
         lineSB.append(' ');
       } else {
         lineSB.append("   ");
       }
     }
     index += 1;
     if (printed) {
       sb.append(lineSB);
       sb.append('\n');
     }
   }
   return sb.toString();
 }
 public Alignment alignSentencePair(SentencePair sentencePair) {
   Alignment alignment = new Alignment();
   int numFrenchWords = sentencePair.getFrenchWords().size();
   int numEnglishWords = sentencePair.getEnglishWords().size();
   for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
     int englishPosition = frenchPosition;
     if (englishPosition >= numEnglishWords)
       englishPosition = -1;
     alignment.addAlignment(englishPosition, frenchPosition, true);
   }
   return alignment;
 }
  private static void predict(WordAligner wordAligner, List<SentencePair> testSentencePairs, String path) throws IOException {
	BufferedWriter writer = new BufferedWriter(new FileWriter(path));
    for (SentencePair sentencePair : testSentencePairs) {
      Alignment proposedAlignment = wordAligner.alignSentencePair(sentencePair);
      for (int frenchPosition = 0; frenchPosition < sentencePair.getFrenchWords().size(); frenchPosition++) {
        for (int englishPosition = 0; englishPosition < sentencePair.getEnglishWords().size(); englishPosition++) {
          if (proposedAlignment.containsSureAlignment(englishPosition, frenchPosition)) {
        	writer.write(frenchPosition + "-" + englishPosition + " ");
          }
        }
      }
      writer.write("\n");
    }
    writer.close();
  }
Пример #9
0
 /**
  * Creates a MultipleAlignment from the given Alignment object. Alignment is roughly analogous to
  * MultipleAlignment, but it's an interface and it corresponds to a multiple alignment that has
  * been read from a data store somewhere (presumably a database, but perhaps also a file or other
  * source). Creating a MultipleAlignment object "around" it basically makes it useable --
  * MultipleAlignment will read the Alignment object and its corresponding AlignBlock pieces, and
  * builds data structures so that the underlying multiple alignment can be accessed easily.
  *
  * @param a
  */
 public MultipleAlignment(Alignment a) {
   this();
   Collection<AlignBlock> blocks = a.getAlignBlocks();
   for (AlignBlock block : blocks) {
     GappedAlignmentString gaps = new GappedAlignmentString(block.getBitString());
     Genome g = block.getGenome();
     String chrom = block.getChrom();
     String name = String.format("%s_%s", g.getVersion(), chrom);
     addGappedAlignment(name, gaps);
   }
 }
 private static void test(WordAligner wordAligner, List<SentencePair> testSentencePairs, Map<Integer, Alignment> testAlignments, boolean verbose) {
   int proposedSureCount = 0;
   int proposedPossibleCount = 0;
   int sureCount = 0;
   int proposedCount = 0;
   for (SentencePair sentencePair : testSentencePairs) {
     Alignment proposedAlignment = wordAligner.alignSentencePair(sentencePair);
     Alignment referenceAlignment = testAlignments.get(sentencePair.getSentenceID());
     if (referenceAlignment == null)
       throw new RuntimeException("No reference alignment found for sentenceID "+sentencePair.getSentenceID());
     if (verbose) System.out.println("Alignment:\n"+Alignment.render(referenceAlignment,proposedAlignment,sentencePair));
     for (int frenchPosition = 0; frenchPosition < sentencePair.getFrenchWords().size(); frenchPosition++) {
       for (int englishPosition = 0; englishPosition < sentencePair.getEnglishWords().size(); englishPosition++) {
         boolean proposed = proposedAlignment.containsSureAlignment(englishPosition, frenchPosition);
         boolean sure = referenceAlignment.containsSureAlignment(englishPosition, frenchPosition);
         boolean possible = referenceAlignment.containsPossibleAlignment(englishPosition, frenchPosition);
         if (proposed && sure) proposedSureCount += 1;
         if (proposed && possible) proposedPossibleCount += 1;
         if (proposed) proposedCount += 1;
         if (sure) sureCount += 1;
       }
     }
   }
   System.out.println("Precision: "+proposedPossibleCount/(double)proposedCount);
   System.out.println("Recall: "+proposedSureCount/(double)sureCount);
   System.out.println("AER: "+(1.0-(proposedSureCount+proposedPossibleCount)/(double)(sureCount+proposedCount)));
 }