/**
 * Reads a reference alignment file and groups its entries by sentence ID.
 *
 * Each line must consist of exactly four whitespace-separated fields:
 * sentence ID, English position, French position and a type string.
 * Both positions are decremented by one before being stored, and an entry
 * is recorded as "sure" exactly when the type field equals "S".
 *
 * @param fileName path of the alignment file to read
 * @return a map from sentence ID to the Alignment accumulated for that sentence
 * @throws RuntimeException if a line does not have exactly four fields, or
 *         (wrapping the IOException) if the file cannot be read
 */
private static Map<Integer, Alignment> readAlignments(String fileName) {
    Map<Integer, Alignment> alignments = new HashMap<Integer, Alignment>();
    try {
        BufferedReader in = new BufferedReader(new FileReader(fileName));
        try {
            String line;
            // readLine() returning null is the reliable end-of-file signal;
            // ready() only reports whether a read would block.
            while ((line = in.readLine()) != null) {
                String[] words = line.split("\\s+");
                if (words.length != 4) {
                    throw new RuntimeException("Bad alignment file "+fileName+", bad line was "+line);
                }
                Integer sentenceID = Integer.parseInt(words[0]);
                // Positions are shifted down by one before being stored.
                Integer englishPosition = Integer.parseInt(words[1]) - 1;
                Integer frenchPosition = Integer.parseInt(words[2]) - 1;
                String type = words[3];

                Alignment alignment = alignments.get(sentenceID);
                if (alignment == null) {
                    alignment = new Alignment();
                    alignments.put(sentenceID, alignment);
                }
                alignment.addAlignment(englishPosition, frenchPosition, type.equals("S"));
            }
        } finally {
            // Always release the file handle, including when a bad line aborts parsing.
            in.close();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return alignments;
}
static Alignment getFeatureContaining(List<Alignment> features, int right) { int leftBounds = 0; int rightBounds = features.size() - 1; int idx = features.size() / 2; int lastIdx = -1; while (idx != lastIdx) { lastIdx = idx; Alignment f = features.get(idx); if (f.contains(right)) { return f; } if (f.getStart() > right) { rightBounds = idx; idx = (leftBounds + idx) / 2; } else { leftBounds = idx; idx = (rightBounds + idx) / 2; } } // Check the extremes if (features.get(0).contains(right)) { return features.get(0); } if (features.get(rightBounds).contains(right)) { return features.get(rightBounds); } return null; }
public Alignment alignSentencePair(SentencePair sentencePair) { Alignment alignment = new Alignment(); List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); // Model 1 assumes all alignments are equally likely // So we can just take the argMax of t(f|e) to get the englishMaxPosition for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); int englishMaxPosition = -1; double maxTranslationProb = translationProbs.getCount(f, NULL); for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); double translationProb = translationProbs.getCount(f, e); if (translationProb > maxTranslationProb) { maxTranslationProb = translationProb; englishMaxPosition = englishPosition; } } alignment.addAlignment(englishMaxPosition, frenchPosition, true); } return alignment; }
public Alignment alignSentencePair(SentencePair sentencePair) { Alignment alignment = new Alignment(); List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); int englishMaxPosition = frenchPosition; if (englishMaxPosition >= numEnglishWords) englishMaxPosition = -1; // map French word to BASELINE if c(f,e) = 0 for all English words double maxDice = 0; for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); double dice = getDiceCoefficient(f,e); if (dice > maxDice) { maxDice = dice; englishMaxPosition = englishPosition; } } alignment.addAlignment(englishMaxPosition, frenchPosition, true); } return alignment; }
public Alignment alignSentencePair(SentencePair sentencePair) { Alignment alignment = new Alignment(); List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); int englishMaxPosition = frenchPosition; if (englishMaxPosition >= numEnglishWords) englishMaxPosition = -1; // map French word to BASELINE if c(f,e) = 0 for all English words double maxConditionalProb = 0; for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); double conditionalGivenEnglish = collocationCounts.getCount(f, e) / (eCounts.getCount(e)); if (conditionalGivenEnglish > maxConditionalProb) { maxConditionalProb = conditionalGivenEnglish; englishMaxPosition = englishPosition; } } alignment.addAlignment(englishMaxPosition, frenchPosition, true); } return alignment; }
/**
 * Renders a reference alignment and a proposed alignment as a text grid.
 *
 * The grid has one row per French word and one three-character cell per
 * English word. A cell is drawn as {@code [ ]} when the reference contains
 * a sure alignment for that pair, as {@code ( )} when it contains only a
 * possible alignment, and as blanks otherwise; the middle character is
 * {@code #} when the proposed alignment contains a sure alignment for the
 * pair. Each row ends with {@code "| "} and the French word. Below the grid
 * comes a rule of {@code ---} per English word, followed by the English
 * words written top-to-bottom, one letter per line, under their columns.
 *
 * @param reference    the reference alignment
 * @param proposed     the proposed alignment to compare against it
 * @param sentencePair the sentence pair both alignments refer to
 * @return the rendered multi-line grid
 */
public static String render(Alignment reference, Alignment proposed, SentencePair sentencePair) {
    StringBuilder sb = new StringBuilder();
    int numFrenchWords = sentencePair.getFrenchWords().size();
    int numEnglishWords = sentencePair.getEnglishWords().size();

    // Grid body: one row per French word.
    for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
        for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
            boolean sure = reference.containsSureAlignment(englishPosition, frenchPosition);
            boolean possible = reference.containsPossibleAlignment(englishPosition, frenchPosition);
            char proposedChar = ' ';
            if (proposed.containsSureAlignment(englishPosition, frenchPosition)) {
                proposedChar = '#';
            }
            if (sure) {
                sb.append('[').append(proposedChar).append(']');
            } else if (possible) {
                sb.append('(').append(proposedChar).append(')');
            } else {
                sb.append(' ').append(proposedChar).append(' ');
            }
        }
        sb.append("| ");
        sb.append(sentencePair.getFrenchWords().get(frenchPosition));
        sb.append('\n');
    }

    // Horizontal rule under the grid.
    for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
        sb.append("---");
    }
    sb.append("'\n");

    // English words printed vertically: line `index` holds the index-th
    // letter of every word that is long enough.
    boolean printed = true;
    int index = 0;
    while (printed) {
        printed = false;
        StringBuilder lineSB = new StringBuilder();
        for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
            String englishWord = sentencePair.getEnglishWords().get(englishPosition);
            if (englishWord.length() > index) {
                printed = true;
                lineSB.append(' ');
                lineSB.append(englishWord.charAt(index));
                lineSB.append(' ');
            } else {
                // Pad with a full three-character cell so that the letters of
                // longer words further right stay under their own column.
                lineSB.append("   ");
            }
        }
        index += 1;
        if (printed) {
            sb.append(lineSB);
            sb.append('\n');
        }
    }
    return sb.toString();
}
/**
 * Baseline aligner: links each French word to the English word at the
 * same index, marking every link as sure.
 *
 * French positions beyond the end of the English sentence are linked to
 * English position -1.
 *
 * @param sentencePair the sentence pair to align
 * @return an Alignment holding one sure link per French word
 */
public Alignment alignSentencePair(SentencePair sentencePair) {
    Alignment diagonal = new Alignment();
    int frenchCount = sentencePair.getFrenchWords().size();
    int englishCount = sentencePair.getEnglishWords().size();

    for (int f = 0; f < frenchCount; f++) {
        int e = (f < englishCount) ? f : -1;
        diagonal.addAlignment(e, f, true);
    }
    return diagonal;
}
/**
 * Aligns every test sentence pair and writes the proposed links to a file.
 *
 * One output line is written per sentence pair. Each proposed sure
 * alignment appears on it as {@code frenchPosition-englishPosition}
 * followed by a space, ordered by French position and then English
 * position.
 *
 * @param wordAligner       the aligner used to produce the alignments
 * @param testSentencePairs the sentence pairs to align
 * @param path              the output file to create or overwrite
 * @throws IOException if the file cannot be opened or written
 */
private static void predict(WordAligner wordAligner, List<SentencePair> testSentencePairs, String path) throws IOException {
    BufferedWriter writer = new BufferedWriter(new FileWriter(path));
    try {
        for (SentencePair sentencePair : testSentencePairs) {
            Alignment proposedAlignment = wordAligner.alignSentencePair(sentencePair);
            int numFrenchWords = sentencePair.getFrenchWords().size();
            int numEnglishWords = sentencePair.getEnglishWords().size();
            for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
                for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
                    if (proposedAlignment.containsSureAlignment(englishPosition, frenchPosition)) {
                        writer.write(frenchPosition + "-" + englishPosition + " ");
                    }
                }
            }
            writer.write("\n");
        }
    } finally {
        // Close (and flush) the file even if aligning or writing fails part-way.
        writer.close();
    }
}
/**
 * Builds a MultipleAlignment from the blocks of the given Alignment.
 *
 * An Alignment is an interface describing a multiple alignment that has
 * been read from some data store (presumably a database, but possibly a
 * file or another source). Wrapping it in a MultipleAlignment makes it
 * usable: every AlignBlock of the Alignment is read and registered as a
 * gapped alignment, so the underlying multiple alignment can be accessed
 * easily.
 *
 * Each block is registered under the name {@code <genome version>_<chrom>}.
 *
 * @param a the alignment whose AlignBlocks are loaded into this object
 */
public MultipleAlignment(Alignment a) {
    this();
    for (AlignBlock block : a.getAlignBlocks()) {
        GappedAlignmentString gapped = new GappedAlignmentString(block.getBitString());
        Genome genome = block.getGenome();
        String chromosome = block.getChrom();
        addGappedAlignment(String.format("%s_%s", genome.getVersion(), chromosome), gapped);
    }
}
/**
 * Scores an aligner against reference alignments and prints precision,
 * recall and AER to standard output.
 *
 * For every (French, English) position pair of every test sentence the
 * proposed sure alignment is compared with the reference:
 * precision = |proposed and possible| / |proposed|,
 * recall    = |proposed and sure| / |sure|,
 * AER       = 1 - (|proposed and sure| + |proposed and possible|)
 *                 / (|sure| + |proposed|).
 *
 * @param wordAligner       the aligner under evaluation
 * @param testSentencePairs the sentence pairs to align
 * @param testAlignments    reference alignments keyed by sentence ID
 * @param verbose           if true, print a rendered grid for each sentence
 * @throws RuntimeException if a sentence pair has no reference alignment
 */
private static void test(WordAligner wordAligner, List<SentencePair> testSentencePairs, Map<Integer, Alignment> testAlignments, boolean verbose) {
    int hitsSure = 0;
    int hitsPossible = 0;
    int totalSure = 0;
    int totalProposed = 0;

    for (SentencePair pair : testSentencePairs) {
        Alignment guess = wordAligner.alignSentencePair(pair);
        Alignment gold = testAlignments.get(pair.getSentenceID());
        if (gold == null) {
            throw new RuntimeException("No reference alignment found for sentenceID "+pair.getSentenceID());
        }
        if (verbose) {
            System.out.println("Alignment:\n"+Alignment.render(gold, guess, pair));
        }

        int frenchCount = pair.getFrenchWords().size();
        for (int f = 0; f < frenchCount; f++) {
            int englishCount = pair.getEnglishWords().size();
            for (int e = 0; e < englishCount; e++) {
                boolean isProposed = guess.containsSureAlignment(e, f);
                boolean isSure = gold.containsSureAlignment(e, f);
                boolean isPossible = gold.containsPossibleAlignment(e, f);

                if (isProposed) {
                    totalProposed++;
                    if (isSure) {
                        hitsSure++;
                    }
                    if (isPossible) {
                        hitsPossible++;
                    }
                }
                if (isSure) {
                    totalSure++;
                }
            }
        }
    }

    double precision = hitsPossible / (double) totalProposed;
    double recall = hitsSure / (double) totalSure;
    double aer = 1.0 - (hitsSure + hitsPossible) / (double) (totalSure + totalProposed);
    System.out.println("Precision: "+precision);
    System.out.println("Recall: "+recall);
    System.out.println("AER: "+aer);
}