/**
 * Reads word alignments from a whitespace-separated text file.
 *
 * <p>Every line must consist of exactly four fields: sentence ID, English position, French
 * position and a type flag. Both positions are decremented by one before being stored (the file
 * is 1-based, the in-memory positions are 0-based). The type field is compared with {@code "S"}
 * and the result is handed to {@code Alignment.addAlignment} as its third argument (presumably
 * the "sure" flag -- confirm against {@code Alignment}).
 *
 * @param fileName path of the alignment file to read
 * @return map from sentence ID to the alignment accumulated from all lines with that ID
 * @throws RuntimeException if a line does not have exactly four fields, or wrapping the
 *     {@link IOException} if the file cannot be opened or read
 * @throws NumberFormatException if one of the first three fields is not an integer
 */
private static Map<Integer, Alignment> readAlignments(String fileName) {
  Map<Integer, Alignment> alignments = new HashMap<Integer, Alignment>();
  try {
    BufferedReader in = new BufferedReader(new FileReader(fileName));
    try {
      String line;
      // readLine() returning null is the reliable end-of-file signal; ready() only says
      // whether the next read is guaranteed not to block.
      while ((line = in.readLine()) != null) {
        String[] words = line.split("\\s+");
        if (words.length != 4) {
          throw new RuntimeException("Bad alignment file "+fileName+", bad line was "+line);
        }
        Integer sentenceID = Integer.parseInt(words[0]);
        int englishPosition = Integer.parseInt(words[1]) - 1;
        int frenchPosition = Integer.parseInt(words[2]) - 1;
        String type = words[3];

        // Lazily create one Alignment per sentence ID.
        Alignment alignment = alignments.get(sentenceID);
        if (alignment == null) {
          alignment = new Alignment();
          alignments.put(sentenceID, alignment);
        }
        alignment.addAlignment(englishPosition, frenchPosition, type.equals("S"));
      }
    } finally {
      // Release the file handle even when a malformed line aborts the loop.
      in.close();
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  return alignments;
}
public Alignment alignSentencePair(SentencePair sentencePair) { Alignment alignment = new Alignment(); List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); // Model 1 assumes all alignments are equally likely // So we can just take the argMax of t(f|e) to get the englishMaxPosition for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); int englishMaxPosition = -1; double maxTranslationProb = translationProbs.getCount(f, NULL); for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); double translationProb = translationProbs.getCount(f, e); if (translationProb > maxTranslationProb) { maxTranslationProb = translationProb; englishMaxPosition = englishPosition; } } alignment.addAlignment(englishMaxPosition, frenchPosition, true); } return alignment; }
public Alignment alignSentencePair(SentencePair sentencePair) { Alignment alignment = new Alignment(); List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); int englishMaxPosition = frenchPosition; if (englishMaxPosition >= numEnglishWords) englishMaxPosition = -1; // map French word to BASELINE if c(f,e) = 0 for all English words double maxDice = 0; for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); double dice = getDiceCoefficient(f,e); if (dice > maxDice) { maxDice = dice; englishMaxPosition = englishPosition; } } alignment.addAlignment(englishMaxPosition, frenchPosition, true); } return alignment; }
public Alignment alignSentencePair(SentencePair sentencePair) { Alignment alignment = new Alignment(); List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); int englishMaxPosition = frenchPosition; if (englishMaxPosition >= numEnglishWords) englishMaxPosition = -1; // map French word to BASELINE if c(f,e) = 0 for all English words double maxConditionalProb = 0; for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); double conditionalGivenEnglish = collocationCounts.getCount(f, e) / (eCounts.getCount(e)); if (conditionalGivenEnglish > maxConditionalProb) { maxConditionalProb = conditionalGivenEnglish; englishMaxPosition = englishPosition; } } alignment.addAlignment(englishMaxPosition, frenchPosition, true); } return alignment; }
/**
 * Renders a reference alignment and a proposed alignment as an ASCII grid.
 *
 * <p>The grid has one row per French word and one three-character cell per English word. A
 * cell is drawn as {@code [x]} when the reference contains a sure alignment for that position
 * pair, as {@code (x)} when it only contains a possible alignment, and as {@code " x "}
 * otherwise; {@code x} is {@code '#'} when the proposed alignment contains a sure alignment
 * there and a blank otherwise. Each row ends with {@code "| "} and the French word. Below the
 * grid comes a rule of dashes, then the English words written vertically, one character per
 * line, each in its own three-character column.
 *
 * @param reference the reference alignment
 * @param proposed the proposed alignment to compare against the reference
 * @param sentencePair the sentence pair both alignments refer to
 * @return the multi-line rendering
 */
public static String render(Alignment reference, Alignment proposed, SentencePair sentencePair) {
  int numFrenchWords = sentencePair.getFrenchWords().size();
  int numEnglishWords = sentencePair.getEnglishWords().size();
  StringBuilder sb = new StringBuilder();

  // Grid: one row per French word, one 3-character cell per English word.
  for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
    for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
      boolean sure = reference.containsSureAlignment(englishPosition, frenchPosition);
      boolean possible = reference.containsPossibleAlignment(englishPosition, frenchPosition);
      char proposedChar = ' ';
      if (proposed.containsSureAlignment(englishPosition, frenchPosition)) {
        proposedChar = '#';
      }
      if (sure) {
        sb.append('[').append(proposedChar).append(']');
      } else if (possible) {
        sb.append('(').append(proposedChar).append(')');
      } else {
        sb.append(' ').append(proposedChar).append(' ');
      }
    }
    sb.append("| ");
    sb.append(sentencePair.getFrenchWords().get(frenchPosition));
    sb.append('\n');
  }

  // Horizontal rule under the grid; the apostrophe closes the corner under the '|' column.
  for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
    sb.append("---");
  }
  sb.append("'\n");

  // English words written top-to-bottom, one character row at a time, until a row prints
  // nothing.
  boolean printed = true;
  int index = 0;
  while (printed) {
    printed = false;
    StringBuilder lineSB = new StringBuilder();
    for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
      String englishWord = sentencePair.getEnglishWords().get(englishPosition);
      if (englishWord.length() > index) {
        printed = true;
        lineSB.append(' ');
        lineSB.append(englishWord.charAt(index));
        lineSB.append(' ');
      } else {
        // Pad with a full 3-character cell so later words stay under their own column.
        lineSB.append("   ");
      }
    }
    index += 1;
    if (printed) {
      sb.append(lineSB);
      sb.append('\n');
    }
  }
  return sb.toString();
}
/**
 * Baseline aligner: aligns each French word to the English word at the same index.
 *
 * <p>French positions beyond the end of the English sentence are aligned to English position
 * {@code -1}.
 *
 * @param sentencePair the sentence pair to align
 * @return an alignment with one entry per French word, each added with the flag {@code true}
 */
public Alignment alignSentencePair(SentencePair sentencePair) {
  Alignment diagonal = new Alignment();
  int frenchLength = sentencePair.getFrenchWords().size();
  int englishLength = sentencePair.getEnglishWords().size();
  for (int fi = 0; fi < frenchLength; fi++) {
    int ei = (fi < englishLength) ? fi : -1;
    diagonal.addAlignment(ei, fi, true);
  }
  return diagonal;
}
public static void main(String[] args) { try { String sp1 = "/home/caoc/alterAligner/SCORE_DROP/all_mtx/" + args[0] + ".mtx"; String sp2 = "/home/caoc/alterAligner/SCORE_DROP/all_mtx/" + args[1] + ".mtx"; System.out.println(args[0]); // String sp1 = // "/home/caoc/alterAligner/SCORE_DROP/all_fasta/" + args[0] ; // String sp2 = "/home/caoc/alterAligner/SCORE_DROP/all_fasta/" // + args[1] ; Class parserClass = Class.forName("alterAligner.sequenceProfileParsers.PSSMParser"); Class scoreFunctionClass = Class.forName("alterAligner.scoreFunctionStrategies.WeightedSumStrategy"); SequenceProfileParser aParser = (SequenceProfileParser) parserClass.newInstance(); ScoreFunctionStrategy aStrategy = (ScoreFunctionStrategy) scoreFunctionClass.newInstance(); Aligner aligner = new Aligner(aStrategy, (float) 10, (float) 0.5); SequenceProfile seqPro1 = aParser.parse(new File(sp1)); SequenceProfile seqPro2 = aParser.parse(new File(sp2)); Alignment alignment = aligner.smithWatermanAlign(seqPro1, seqPro2); BufferedWriter writer = new BufferedWriter( new FileWriter( "/home/caoc/alterAligner/SCORE_DROP/all_ws_alignment/" + args[0] + "__" + args[1] + ".ws.fasta")); writer.append( ">" + alignment.getName1() + "\t" + alignment.getStart1() + "\t" + alignment.getEnd1() + "\n"); writer.append(alignment.getSequence1AsString() + "\n"); writer.append( ">" + alignment.getName2() + "\t" + alignment.getStart2() + "\t" + alignment.getEnd2() + "\n"); writer.append(alignment.getSequence2AsString() + "\n"); writer.close(); } catch (FileNotFoundException e) { System.out.println("Failed to open file! " + e.getMessage()); } catch (IOException e) { System.out.println("Failed to read the file! " + e.getMessage()); } catch (ClassNotFoundException e) { System.out.println("Failed to find the class! " + e.getMessage()); } catch (Exception e) { System.out.println("Failed to due to do alignment! " + e.getMessage()); e.printStackTrace(); } }
/**
 * Aligns every test sentence pair and writes the proposed alignments to a text file.
 *
 * <p>One line is written per sentence pair. For each (French, English) position pair for which
 * the proposed alignment contains a sure alignment, the token
 * {@code <frenchPosition>-<englishPosition>} followed by a space is written, iterating French
 * positions in the outer loop and English positions in the inner loop.
 *
 * @param wordAligner the aligner used to produce the proposed alignments
 * @param testSentencePairs the sentence pairs to align
 * @param path path of the output file, which is created or overwritten
 * @throws IOException if the file cannot be opened or written
 */
private static void predict(WordAligner wordAligner, List<SentencePair> testSentencePairs, String path) throws IOException {
  BufferedWriter writer = new BufferedWriter(new FileWriter(path));
  try {
    for (SentencePair sentencePair : testSentencePairs) {
      Alignment proposedAlignment = wordAligner.alignSentencePair(sentencePair);
      int numFrenchWords = sentencePair.getFrenchWords().size();
      int numEnglishWords = sentencePair.getEnglishWords().size();
      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
        for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
          if (proposedAlignment.containsSureAlignment(englishPosition, frenchPosition)) {
            writer.write(frenchPosition + "-" + englishPosition + " ");
          }
        }
      }
      writer.write("\n");
    }
  } finally {
    // Close (and flush) the file even if the aligner or a write throws part-way through.
    writer.close();
  }
}
/**
 * Builds a MultipleAlignment out of the AlignBlock pieces of the given Alignment.
 *
 * <p>An Alignment is the stored form of a multiple alignment: an interface over data read from
 * some data store (presumably a database, but possibly a file or another source). Wrapping it
 * in a MultipleAlignment makes it usable: every AlignBlock of the Alignment is converted to a
 * GappedAlignmentString built from the block's bit string and registered under the name
 * {@code <genome version>_<chromosome>}, so the underlying multiple alignment can be accessed
 * easily.
 *
 * @param a the stored alignment whose blocks are loaded into this object
 */
public MultipleAlignment(Alignment a) {
  this();
  Collection<AlignBlock> pieces = a.getAlignBlocks();
  for (AlignBlock piece : pieces) {
    GappedAlignmentString gapped = new GappedAlignmentString(piece.getBitString());
    Genome genome = piece.getGenome();
    String chromosome = piece.getChrom();
    // Key each gapped sequence by genome version plus chromosome.
    addGappedAlignment(String.format("%s_%s", genome.getVersion(), chromosome), gapped);
  }
}
/**
 * Scores an aligner against reference alignments and prints precision, recall and AER.
 *
 * <p>For every sentence pair the proposed alignment is compared cell by cell with the
 * reference. With A the proposed links, S the reference sure links and P the reference
 * possible links, the printed figures are:
 * precision = |A and P| / |A|, recall = |A and S| / |S|, and
 * AER = 1 - (|A and S| + |A and P|) / (|S| + |A|).
 *
 * @param wordAligner the aligner under evaluation
 * @param testSentencePairs the sentence pairs to align
 * @param testAlignments reference alignments keyed by sentence ID
 * @param verbose whether to print a rendering of each reference/proposed alignment pair
 * @throws RuntimeException if a sentence pair has no reference alignment
 */
private static void test(WordAligner wordAligner, List<SentencePair> testSentencePairs, Map<Integer, Alignment> testAlignments, boolean verbose) {
  int hitsOnSure = 0;
  int hitsOnPossible = 0;
  int referenceSureTotal = 0;
  int proposedTotal = 0;

  for (SentencePair pair : testSentencePairs) {
    Alignment guess = wordAligner.alignSentencePair(pair);
    Alignment gold = testAlignments.get(pair.getSentenceID());
    if (gold == null) {
      throw new RuntimeException("No reference alignment found for sentenceID "+pair.getSentenceID());
    }
    if (verbose) {
      System.out.println("Alignment:\n"+Alignment.render(gold,guess,pair));
    }

    for (int fi = 0; fi < pair.getFrenchWords().size(); fi++) {
      for (int ei = 0; ei < pair.getEnglishWords().size(); ei++) {
        boolean isProposed = guess.containsSureAlignment(ei, fi);
        boolean isSure = gold.containsSureAlignment(ei, fi);
        boolean isPossible = gold.containsPossibleAlignment(ei, fi);

        if (isProposed) {
          proposedTotal++;
          if (isSure) {
            hitsOnSure++;
          }
          if (isPossible) {
            hitsOnPossible++;
          }
        }
        if (isSure) {
          referenceSureTotal++;
        }
      }
    }
  }

  System.out.println("Precision: "+hitsOnPossible/(double)proposedTotal);
  System.out.println("Recall: "+hitsOnSure/(double)referenceSureTotal);
  System.out.println("AER: "+(1.0-(hitsOnSure+hitsOnPossible)/(double)(referenceSureTotal+proposedTotal)));
}