예제 #1
0
 /**
  * Converts a sequence of states into the sequence of tags it encodes.
  *
  * <p>The result starts with the previous-previous tag of the first state and
  * is followed by the previous tag of every state, in order. An empty state
  * list yields an empty tag list.
  *
  * @param states the states to read tags from
  * @return a new, mutable list of tags
  */
 public static List<String> toTagList(List<State> states) {
   List<String> result = new ArrayList<String>();
   if (states.isEmpty()) {
     return result;
   }
   // Only the first state contributes its previous-previous tag; every state
   // (the first included) then contributes its previous tag.
   State first = states.get(0);
   result.add(first.getPreviousPreviousTag());
   for (State each : states) {
     String previousTag = each.getPreviousTag();
     result.add(previousTag);
   }
   return result;
 }
예제 #2
0
 /**
  * Walks the trellis from its start state to its end state, at each step
  * following the forward transition chosen by {@code argMax()}.
  *
  * <p>NOTE(review): the walk only terminates once a state equal to the trellis
  * end state is reached; behavior on a state without forward transitions
  * depends on {@code Counter.argMax()} and is not visible here.
  *
  * @param trellis the trellis to traverse
  * @return the visited states, beginning with the start state and ending with
  *     the end state
  */
 public List<S> getBestPath(Trellis<S> trellis) {
   List<S> path = new ArrayList<S>();
   S origin = trellis.getStartState();
   path.add(origin);
   for (S cursor = origin; !cursor.equals(trellis.getEndState()); ) {
     // Step to the successor picked by argMax over the outgoing transitions.
     S successor = trellis.getForwardTransitions(cursor).argMax();
     path.add(successor);
     cursor = successor;
   }
   return path;
 }
예제 #3
0
 /**
  * Reads tagged sentences from a text file with one token per line.
  *
  * <p>Each non-empty line is split on whitespace: the first field is the word
  * and, when {@code hasTags} is true, the second field is its tag (otherwise
  * the tag is the empty string). An empty line terminates the current
  * sentence. A final sentence that is not followed by an empty line is also
  * returned rather than being silently dropped.
  *
  * <p>The reader is always closed, including when an exception is thrown.
  *
  * @param path path of the file to read
  * @param hasTags whether each token line carries a tag in its second field
  * @return the sentences in file order
  * @throws Exception if the file cannot be opened or read, or if a token line
  *     lacks the tag field while {@code hasTags} is true
  */
 private static List<TaggedSentence> readTaggedSentences(String path, boolean hasTags)
     throws Exception {
   List<TaggedSentence> taggedSentences = new ArrayList<TaggedSentence>();
   // try-with-resources guarantees the file handle is released on every path.
   try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
     List<String> words = new LinkedList<String>();
     List<String> tags = new LinkedList<String>();
     String line;
     while ((line = reader.readLine()) != null) {
       if (line.equals("")) {
         // Blank line: close off the sentence collected so far.
         taggedSentences.add(
             new TaggedSentence(
                 new BoundedList<String>(words, START_WORD, STOP_WORD),
                 new BoundedList<String>(tags, START_WORD, STOP_WORD)));
         words = new LinkedList<String>();
         tags = new LinkedList<String>();
       } else {
         String[] fields = line.split("\\s+");
         words.add(fields[0]);
         tags.add(hasTags ? fields[1] : "");
       }
     }
     // The file may end without a trailing blank line; keep that last sentence.
     if (!words.isEmpty()) {
       taggedSentences.add(
           new TaggedSentence(
               new BoundedList<String>(words, START_WORD, STOP_WORD),
               new BoundedList<String>(tags, START_WORD, STOP_WORD)));
     }
   }
   System.out.println("Read " + taggedSentences.size() + " sentences.");
   return taggedSentences;
 }
 /**
  * Collects the base names (absolute path with the English extension chopped
  * off) of the English files found by {@code IOUtils.getFilesUnder(path, ...)}.
  *
  * <p>A file counts as an English file only when its name ends with
  * {@code "." + ENGLISH_EXTENSION}. This is the same suffix that is chopped
  * off below, so a name that merely happens to end with the extension's
  * characters (without the dot) is no longer picked up.
  *
  * @param path the location handed to {@code IOUtils.getFilesUnder}
  * @return one base file name per English file found
  */
 private static List<String> getBaseFileNames(String path) {
   final String englishSuffix = "." + ENGLISH_EXTENSION;
   List<File> englishFiles = IOUtils.getFilesUnder(path, new FileFilter() {
     @Override
     public boolean accept(File pathname) {
       // Directories are always accepted -- presumably so the search can
       // descend into them; confirm against IOUtils.getFilesUnder.
       if (pathname.isDirectory()) {
         return true;
       }
       return pathname.getName().endsWith(englishSuffix);
     }
   });
   List<String> baseFileNames = new ArrayList<String>();
   for (File englishFile : englishFiles) {
     baseFileNames.add(chop(englishFile.getAbsolutePath(), englishSuffix));
   }
   return baseFileNames;
 }
 /**
  * Parses one marked-up sentence line into its numeric ID and its words.
  *
  * <p>The line is split on whitespace. The tokens {@code <s} and {@code </s>}
  * are skipped. A token starting with {@code snum=} supplies the ID: the text
  * after the prefix, minus the token's final character, is parsed as an
  * integer. Every other token is interned and kept as a word.
  *
  * @param line the raw sentence line
  * @return a pair of the sentence ID ({@code -1} if no {@code snum=} token was
  *     seen) and the list of words
  * @throws NumberFormatException if the {@code snum=} value is not an integer
  */
 private static Pair<Integer, List<String>> readSentence(String line) {
   List<String> sentence = new ArrayList<String>();
   int sentenceId = -1;
   for (String piece : line.split("\\s+")) {
     boolean isMarkup = piece.equals("<s") || piece.equals("</s>");
     if (isMarkup) {
       continue;
     }
     if (!piece.startsWith("snum=")) {
       sentence.add(piece.intern());
     } else {
       // Drop the 5-character "snum=" prefix and the token's last character.
       String digits = piece.substring(5, piece.length() - 1);
       sentenceId = Integer.parseInt(digits);
     }
   }
   return new Pair<Integer, List<String>>(sentenceId, sentence);
 }
예제 #6
0
 /**
  * Builds one labeled trigram context for every position of the sentence,
  * from position 0 through {@code taggedSentence.size() + 1} inclusive.
  *
  * <p>Words and tags are wrapped in {@code BoundedList}s constructed with the
  * start/stop markers; the tag lookups at {@code position - 2} and
  * {@code position - 1} go through that wrapper, so indices outside the
  * sentence are presumably answered with the markers (confirm against
  * {@code BoundedList}).
  *
  * @param taggedSentence the sentence to extract contexts from
  * @return the contexts in position order
  */
 private List<LabeledLocalTrigramContext> extractLabeledLocalTrigramContexts(
     TaggedSentence taggedSentence) {
   List<String> paddedWords =
       new BoundedList<String>(taggedSentence.getWords(), START_WORD, STOP_WORD);
   List<String> paddedTags =
       new BoundedList<String>(taggedSentence.getTags(), START_TAG, STOP_TAG);
   List<LabeledLocalTrigramContext> contexts = new ArrayList<LabeledLocalTrigramContext>();
   for (int i = 0; i < taggedSentence.size() + 2; i++) {
     String twoBack = paddedTags.get(i - 2);
     String oneBack = paddedTags.get(i - 1);
     String current = paddedTags.get(i);
     LabeledLocalTrigramContext context =
         new LabeledLocalTrigramContext(paddedWords, i, twoBack, oneBack, current);
     contexts.add(context);
   }
   return contexts;
 }
 /**
  * Reads the aligned English and French files that share {@code baseFileName}
  * and pairs their sentences up line by line.
  *
  * <p>Reading stops as soon as either file runs out of lines. Both readers are
  * closed on every exit path, including when an exception is thrown.
  *
  * @param baseFileName file path without the language extension
  * @return the sentence pairs in file order
  * @throws RuntimeException if the two lines at the same position carry
  *     different sentence IDs, or wrapping any {@link IOException} raised while
  *     opening, reading or closing the files
  */
 private static List<SentencePair> readSentencePairs(String baseFileName) {
   List<SentencePair> sentencePairs = new ArrayList<SentencePair>();
   String englishFileName = baseFileName + "." + ENGLISH_EXTENSION;
   String frenchFileName = baseFileName + "." + FRENCH_EXTENSION;
   // The French side is decoded explicitly as ISO-8859-1; the English side
   // uses FileReader's default charset.
   try (BufferedReader englishIn = new BufferedReader(new FileReader(englishFileName));
        BufferedReader frenchIn = new BufferedReader(new InputStreamReader(
            new FileInputStream(frenchFileName), StandardCharsets.ISO_8859_1))) {
     String englishLine;
     String frenchLine;
     // readLine() returning null is the documented end-of-stream signal;
     // ready() only reports whether a read would block.
     while ((englishLine = englishIn.readLine()) != null
         && (frenchLine = frenchIn.readLine()) != null) {
       Pair<Integer,List<String>> englishSentenceAndID = readSentence(englishLine);
       Pair<Integer,List<String>> frenchSentenceAndID = readSentence(frenchLine);
       if (! englishSentenceAndID.getFirst().equals(frenchSentenceAndID.getFirst()))
         throw new RuntimeException("Sentence ID confusion in file "+baseFileName+", lines were:\n\t"+englishLine+"\n\t"+frenchLine);
       sentencePairs.add(new SentencePair(englishSentenceAndID.getFirst(), baseFileName, englishSentenceAndID.getSecond(), frenchSentenceAndID.getSecond()));
     }
   } catch (IOException e) {
     throw new RuntimeException(e);
   }
   return sentencePairs;
 }