/**
 * Flattens the labeled local trigram contexts of an entire training corpus:
 * extracts the contexts of each tagged sentence (via the single-sentence
 * overload) and concatenates them into one list, in corpus order.
 *
 * @param taggedSentences the training sentences to extract contexts from
 * @return all labeled local trigram contexts, sentence by sentence
 */
private List<LabeledLocalTrigramContext> extractLabeledLocalTrigramContexts(
    List<TaggedSentence> taggedSentences) {
  List<LabeledLocalTrigramContext> allContexts =
      new ArrayList<LabeledLocalTrigramContext>();
  for (int i = 0; i < taggedSentences.size(); i++) {
    allContexts.addAll(extractLabeledLocalTrigramContexts(taggedSentences.get(i)));
  }
  return allContexts;
}
/**
 * Reads sentence pairs from the base files found under {@code path}, stopping
 * once at least {@code maxSentencePairs} pairs have been collected.
 *
 * <p>NOTE: the cap is a soft limit — whole files are appended at a time, so
 * the returned list may contain somewhat more than {@code maxSentencePairs}
 * entries (this matches the original behavior callers rely on).
 *
 * @param path directory whose base files are enumerated via {@code getBaseFileNames}
 * @param maxSentencePairs soft upper bound on the number of pairs to read
 * @return the collected sentence pairs, in file order
 */
private static List<SentencePair> readSentencePairs(String path, int maxSentencePairs) {
  List<SentencePair> sentencePairs = new ArrayList<SentencePair>();
  for (String baseFileName : getBaseFileNames(path)) {
    // FIX: was `continue`, which kept looping over every remaining file just
    // to re-check the cap. Once the cap is reached no later file can be used,
    // so exit the loop outright.
    if (sentencePairs.size() >= maxSentencePairs) {
      break;
    }
    sentencePairs.addAll(readSentencePairs(baseFileName));
  }
  return sentencePairs;
}
public static void main(String[] args) throws IOException { // Parse command line flags and arguments Map<String,String> argMap = CommandLineUtils.simpleCommandLineParser(args); // Set up default parameters and settings String basePath = "."; int maxTrainingSentences = 0; int maxIterations = 20; boolean verbose = false; boolean initialize = false; String dataset = "mini"; String model = "baseline"; // Update defaults using command line specifications if (argMap.containsKey("-path")) { basePath = argMap.get("-path"); System.out.println("Using base path: "+basePath); } if (argMap.containsKey("-sentences")) { maxTrainingSentences = Integer.parseInt(argMap.get("-sentences")); System.out.println("Using an additional "+maxTrainingSentences+" training sentences."); } if (argMap.containsKey("-data")) { dataset = argMap.get("-data"); System.out.println("Running with data: "+dataset); } else { System.out.println("No data set specified. Use -data [miniTest, validate]."); } if (argMap.containsKey("-model")) { model = argMap.get("-model"); System.out.println("Running with model: "+model); } else { System.out.println("No model specified. Use -model modelname."); } if (argMap.containsKey("-verbose")) { verbose = true; } if (argMap.containsKey("-iterations")) { maxIterations = Integer.parseInt(argMap.get("-iterations")); } if (argMap.containsKey("-initialize")) { initialize = true; } // Read appropriate training and testing sets. List<SentencePair> trainingSentencePairs = new ArrayList<SentencePair>(); if (! 
(dataset.equals("miniTest") || dataset.equals("mini")) && maxTrainingSentences > 0) trainingSentencePairs = readSentencePairs(basePath+"/training", maxTrainingSentences); List<SentencePair> testSentencePairs = new ArrayList<SentencePair>(); Map<Integer,Alignment> testAlignments = new HashMap<Integer, Alignment>(); if (dataset.equalsIgnoreCase("validate")) { testSentencePairs = readSentencePairs(basePath+"/trial", Integer.MAX_VALUE); testAlignments = readAlignments(basePath+"/trial/trial.wa"); } else if (dataset.equals("miniTest") || dataset.equals("mini")) { testSentencePairs = readSentencePairs(basePath+"/mini", Integer.MAX_VALUE); testAlignments = readAlignments(basePath+"/mini/mini.wa"); } else { throw new RuntimeException("Bad data set mode: "+ dataset+", use validate or miniTest."); } trainingSentencePairs.addAll(testSentencePairs); // Build model WordAligner wordAligner = null; if (model.equalsIgnoreCase("baseline")) { wordAligner = new BaselineWordAligner(); } // TODO : build other alignment models else if (model.equalsIgnoreCase("heuristic")) { wordAligner = new HeuristicWordAligner(trainingSentencePairs); } else if (model.equalsIgnoreCase("dice")) { wordAligner = new DiceWordAligner(trainingSentencePairs); } else if (model.equalsIgnoreCase("ibm1") || model.equalsIgnoreCase("ibmModel1")) { wordAligner = new IBMmodel1WordAligner(trainingSentencePairs, maxIterations, initialize); } else if (model.equalsIgnoreCase("ibm2") || model.equalsIgnoreCase("ibmModel2")) { wordAligner = new IBMmodel2WordAligner(trainingSentencePairs, maxIterations, initialize); } // Test model test(wordAligner, testSentencePairs, testAlignments, verbose); // Generate file for submission //can comment out if not ready for submission testSentencePairs = readSentencePairs(basePath+"/test", Integer.MAX_VALUE); predict(wordAligner, testSentencePairs, basePath+"/"+model+".out"); }