/**
 * Flattens a sequence of states into the tag sequence it encodes.
 *
 * <p>For a non-empty input the result starts with the first state's
 * previous-previous tag, followed by the previous tag of every state in
 * order, so it holds {@code states.size() + 1} entries. An empty input
 * yields an empty list.
 *
 * @param states the states to convert
 * @return a newly allocated list of tags
 */
public static List<String> toTagList(List<State> states) {
  List<String> result = new ArrayList<String>();
  if (states.size() <= 0) {
    return result;
  }
  State first = states.get(0);
  result.add(first.getPreviousPreviousTag());
  for (State each : states) {
    String previousTag = each.getPreviousTag();
    result.add(previousTag);
  }
  return result;
}
/**
 * Builds a path through the trellis by repeatedly following the
 * {@code argMax} of the forward-transition counter of the current state.
 *
 * <p>The walk begins at the trellis start state and stops as soon as the
 * current state equals the trellis end state. Both endpoints are included
 * in the returned list.
 *
 * @param trellis the trellis to walk
 * @return the visited states, from start state to end state
 */
public List<S> getBestPath(Trellis<S> trellis) {
  List<S> path = new ArrayList<S>();
  S here = trellis.getStartState();
  path.add(here);
  while (true) {
    if (here.equals(trellis.getEndState())) {
      return path;
    }
    // Step to the successor chosen by argMax over this state's outgoing transitions.
    here = trellis.getForwardTransitions(here).argMax();
    path.add(here);
  }
}
/**
 * Reads sentences from a one-token-per-line file.
 *
 * <p>Every non-empty line is split on whitespace: the first field is the
 * word and, when {@code hasTags} is true, the second field is its tag
 * (otherwise the tag is the empty string). An empty line terminates the
 * current sentence. A final sentence that is not followed by an empty line
 * is also returned rather than being dropped. The number of sentences read
 * is printed to standard output.
 *
 * @param path    path of the file to read
 * @param hasTags whether each line carries a tag in its second field
 * @return the sentences in file order
 * @throws Exception if the file cannot be opened or read, or if
 *                   {@code hasTags} is true and a line has no second field
 */
private static List<TaggedSentence> readTaggedSentences(String path, boolean hasTags) throws Exception {
  List<TaggedSentence> taggedSentences = new ArrayList<TaggedSentence>();
  List<String> words = new LinkedList<String>();
  List<String> tags = new LinkedList<String>();
  // try-with-resources guarantees the reader is closed, even when parsing fails.
  try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
    String line;
    while ((line = reader.readLine()) != null) {
      if (line.equals("")) {
        // NOTE(review): the tag list is bounded with the word sentinels
        // (START_WORD/STOP_WORD), kept as in the original -- confirm intended.
        taggedSentences.add(
            new TaggedSentence(
                new BoundedList<String>(words, START_WORD, STOP_WORD),
                new BoundedList<String>(tags, START_WORD, STOP_WORD)));
        words = new LinkedList<String>();
        tags = new LinkedList<String>();
      } else {
        String[] fields = line.split("\\s+");
        words.add(fields[0]);
        tags.add(hasTags ? fields[1] : "");
      }
    }
  }
  // Flush a trailing sentence when the file does not end with a blank line.
  if (!words.isEmpty()) {
    taggedSentences.add(
        new TaggedSentence(
            new BoundedList<String>(words, START_WORD, STOP_WORD),
            new BoundedList<String>(tags, START_WORD, STOP_WORD)));
  }
  System.out.println("Read " + taggedSentences.size() + " sentences.");
  return taggedSentences;
}
/**
 * Lists the base names of the English files found under {@code path}.
 *
 * <p>A file qualifies when its name ends with {@code "." + ENGLISH_EXTENSION}.
 * The base name is the file's absolute path passed through {@code chop}
 * with that same dotted suffix. Requiring the dot in the filter keeps it
 * consistent with the suffix handed to {@code chop}; previously any name
 * that merely ended in the extension's characters was accepted.
 *
 * @param path directory (or file) to search
 * @return one base file name per matching English file
 */
private static List<String> getBaseFileNames(String path) {
  final String englishSuffix = "." + ENGLISH_EXTENSION;
  List<File> englishFiles = IOUtils.getFilesUnder(path, new FileFilter() {
    @Override
    public boolean accept(File pathname) {
      // Directories are always accepted -- presumably so the search can
      // descend into them; confirm against IOUtils.getFilesUnder.
      if (pathname.isDirectory()) {
        return true;
      }
      return pathname.getName().endsWith(englishSuffix);
    }
  });
  List<String> baseFileNames = new ArrayList<String>();
  for (File englishFile : englishFiles) {
    baseFileNames.add(chop(englishFile.getAbsolutePath(), englishSuffix));
  }
  return baseFileNames;
}
/**
 * Parses one sentence line into its numeric id and its words.
 *
 * <p>The line is split on whitespace. The tokens {@code "<s"} and
 * {@code "</s>"} are discarded. A token starting with {@code "snum="}
 * supplies the id: the text after the prefix, minus the token's last
 * character, is parsed as an integer. Every other non-empty token is
 * interned and kept as a word. Empty tokens, which {@code split} produces
 * for a blank line or for leading whitespace, are skipped so they never
 * show up as words.
 *
 * @param line the raw line to parse
 * @return a pair of the sentence id ({@code -1} when the line has no
 *         {@code snum=} token) and the list of words
 * @throws NumberFormatException if the {@code snum=} value is not an integer
 */
private static Pair<Integer, List<String>> readSentence(String line) {
  int id = -1;
  List<String> words = new ArrayList<String>();
  for (String token : line.split("\\s+")) {
    if (token.isEmpty()) {
      // Artifact of split() on leading whitespace or an empty line.
      continue;
    }
    if (token.equals("<s") || token.equals("</s>")) {
      continue;
    }
    if (token.startsWith("snum=")) {
      // Strip the 5-character "snum=" prefix and the final character
      // (presumably the closing '>' of the sentence tag).
      String idString = token.substring(5, token.length() - 1);
      id = Integer.parseInt(idString);
      continue;
    }
    words.add(token.intern());
  }
  return new Pair<Integer, List<String>>(id, words);
}
/**
 * Produces one labeled trigram context per position of a tagged sentence.
 *
 * <p>Positions run from {@code 0} through {@code taggedSentence.size() + 1}
 * inclusive. Each context receives the bounded word list, the position, and
 * the tags at {@code position - 2}, {@code position - 1} and
 * {@code position}. Indices outside the sentence are passed straight to
 * {@code BoundedList.get}; presumably it answers with the START/STOP
 * sentinels there -- confirm against BoundedList.
 *
 * @param taggedSentence the sentence to extract contexts from
 * @return the contexts in position order
 */
private List<LabeledLocalTrigramContext> extractLabeledLocalTrigramContexts(
    TaggedSentence taggedSentence) {
  List<LabeledLocalTrigramContext> contexts = new ArrayList<LabeledLocalTrigramContext>();
  List<String> boundedWords =
      new BoundedList<String>(taggedSentence.getWords(), START_WORD, STOP_WORD);
  List<String> boundedTags =
      new BoundedList<String>(taggedSentence.getTags(), START_TAG, STOP_TAG);
  int index = 0;
  while (index <= taggedSentence.size() + 1) {
    String tagTwoBack = boundedTags.get(index - 2);
    String tagOneBack = boundedTags.get(index - 1);
    String tagHere = boundedTags.get(index);
    LabeledLocalTrigramContext context =
        new LabeledLocalTrigramContext(boundedWords, index, tagTwoBack, tagOneBack, tagHere);
    contexts.add(context);
    index++;
  }
  return contexts;
}
private static List<SentencePair> readSentencePairs(String baseFileName) { List<SentencePair> sentencePairs = new ArrayList<SentencePair>(); String englishFileName = baseFileName + "." + ENGLISH_EXTENSION; String frenchFileName = baseFileName + "." + FRENCH_EXTENSION; try { BufferedReader englishIn = new BufferedReader(new FileReader(englishFileName)); //BufferedReader frenchIn = new BufferedReader(new FileReader(frenchFileName)); BufferedReader frenchIn = new BufferedReader(new InputStreamReader( new FileInputStream(frenchFileName), StandardCharsets.ISO_8859_1)); while (englishIn.ready() && frenchIn.ready()) { String englishLine = englishIn.readLine(); String frenchLine = frenchIn.readLine(); Pair<Integer,List<String>> englishSentenceAndID = readSentence(englishLine); Pair<Integer,List<String>> frenchSentenceAndID = readSentence(frenchLine); if (! englishSentenceAndID.getFirst().equals(frenchSentenceAndID.getFirst())) throw new RuntimeException("Sentence ID confusion in file "+baseFileName+", lines were:\n\t"+englishLine+"\n\t"+frenchLine); sentencePairs.add(new SentencePair(englishSentenceAndID.getFirst(), baseFileName, englishSentenceAndID.getSecond(), frenchSentenceAndID.getSecond())); } } catch (IOException e) { throw new RuntimeException(e); } return sentencePairs; }