/**
 * Reads an alignment file and groups its entries by sentence ID.
 *
 * <p>Each line must consist of exactly four whitespace-separated fields:
 * sentence ID, English position, French position and an alignment type.
 * Both positions are stored as the value found in the file minus one
 * (presumably converting 1-based file positions to 0-based indices -- confirm
 * against the data format). The third argument handed to
 * {@code Alignment.addAlignment} is {@code true} exactly when the type field
 * equals {@code "S"}.
 *
 * @param fileName path of the alignment file to read
 * @return a map from sentence ID to the {@code Alignment} accumulated for that sentence
 * @throws RuntimeException if a line does not have exactly four fields, or wrapping
 *         any {@link IOException} raised while reading the file
 * @throws NumberFormatException if one of the first three fields is not an integer
 */
private static Map<Integer, Alignment> readAlignments(String fileName) {
  Map<Integer, Alignment> alignments = new HashMap<Integer, Alignment>();
  // try-with-resources closes the reader on every exit path, including the
  // exceptions thrown for malformed lines below.
  try (BufferedReader in = new BufferedReader(new FileReader(fileName))) {
    String line;
    // readLine() returning null is the reliable end-of-stream signal; ready()
    // only reports whether a read would block.
    while ((line = in.readLine()) != null) {
      String[] words = line.split("\\s+");
      if (words.length != 4) {
        throw new RuntimeException("Bad alignment file "+fileName+", bad line was "+line);
      }
      Integer sentenceID = Integer.parseInt(words[0]);
      Integer englishPosition = Integer.parseInt(words[1]) - 1;
      Integer frenchPosition = Integer.parseInt(words[2]) - 1;
      String type = words[3];

      // Create the per-sentence Alignment lazily the first time its ID is seen.
      Alignment alignment = alignments.get(sentenceID);
      if (alignment == null) {
        alignment = new Alignment();
        alignments.put(sentenceID, alignment);
      }
      alignment.addAlignment(englishPosition, frenchPosition, type.equals("S"));
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  return alignments;
}
private static List<SentencePair> readSentencePairs(String baseFileName) { List<SentencePair> sentencePairs = new ArrayList<SentencePair>(); String englishFileName = baseFileName + "." + ENGLISH_EXTENSION; String frenchFileName = baseFileName + "." + FRENCH_EXTENSION; try { BufferedReader englishIn = new BufferedReader(new FileReader(englishFileName)); //BufferedReader frenchIn = new BufferedReader(new FileReader(frenchFileName)); BufferedReader frenchIn = new BufferedReader(new InputStreamReader( new FileInputStream(frenchFileName), StandardCharsets.ISO_8859_1)); while (englishIn.ready() && frenchIn.ready()) { String englishLine = englishIn.readLine(); String frenchLine = frenchIn.readLine(); Pair<Integer,List<String>> englishSentenceAndID = readSentence(englishLine); Pair<Integer,List<String>> frenchSentenceAndID = readSentence(frenchLine); if (! englishSentenceAndID.getFirst().equals(frenchSentenceAndID.getFirst())) throw new RuntimeException("Sentence ID confusion in file "+baseFileName+", lines were:\n\t"+englishLine+"\n\t"+frenchLine); sentencePairs.add(new SentencePair(englishSentenceAndID.getFirst(), baseFileName, englishSentenceAndID.getSecond(), frenchSentenceAndID.getSecond())); } } catch (IOException e) { throw new RuntimeException(e); } return sentencePairs; }