예제 #1
0
 private List<LabeledLocalTrigramContext> extractLabeledLocalTrigramContexts(
     List<TaggedSentence> taggedSentences) {
   List<LabeledLocalTrigramContext> localTrigramContexts =
       new ArrayList<LabeledLocalTrigramContext>();
   for (TaggedSentence taggedSentence : taggedSentences) {
     localTrigramContexts.addAll(extractLabeledLocalTrigramContexts(taggedSentence));
   }
   return localTrigramContexts;
 }
 private static List<SentencePair> readSentencePairs(String path, int maxSentencePairs) {
   List<SentencePair> sentencePairs = new ArrayList<SentencePair>();
   List<String> baseFileNames = getBaseFileNames(path);
   for (String baseFileName : baseFileNames) {
     if (sentencePairs.size() >= maxSentencePairs)
       continue;
     sentencePairs.addAll(readSentencePairs(baseFileName));
   }
   return sentencePairs;
 }
  public static void main(String[] args) throws IOException {
    // Parse command line flags and arguments
    Map<String,String> argMap = CommandLineUtils.simpleCommandLineParser(args);

    // Set up default parameters and settings
    String basePath = ".";
    int maxTrainingSentences = 0;
    int maxIterations = 20;
    boolean verbose = false;
    boolean initialize = false;
    String dataset = "mini";
    String model = "baseline";

    // Update defaults using command line specifications
    if (argMap.containsKey("-path")) {
      basePath = argMap.get("-path");
      System.out.println("Using base path: "+basePath);
    }
    if (argMap.containsKey("-sentences")) {
      maxTrainingSentences = Integer.parseInt(argMap.get("-sentences"));
      System.out.println("Using an additional "+maxTrainingSentences+" training sentences.");
    }
    if (argMap.containsKey("-data")) {
      dataset = argMap.get("-data");
      System.out.println("Running with data: "+dataset);
    } else {
      System.out.println("No data set specified.  Use -data [miniTest, validate].");
    }
    if (argMap.containsKey("-model")) {
      model = argMap.get("-model");
      System.out.println("Running with model: "+model);
    } else {
      System.out.println("No model specified.  Use -model modelname.");
    }
    if (argMap.containsKey("-verbose")) {
      verbose = true;
    }
    if (argMap.containsKey("-iterations")) {
    	maxIterations = Integer.parseInt(argMap.get("-iterations"));
    }
    if (argMap.containsKey("-initialize")) {
    	initialize = true;
    }

    // Read appropriate training and testing sets.
    List<SentencePair> trainingSentencePairs = new ArrayList<SentencePair>();
    if (! (dataset.equals("miniTest") || dataset.equals("mini")) && maxTrainingSentences > 0)
      trainingSentencePairs = readSentencePairs(basePath+"/training", maxTrainingSentences);
    List<SentencePair> testSentencePairs = new ArrayList<SentencePair>();
    Map<Integer,Alignment> testAlignments = new HashMap<Integer, Alignment>();
    if (dataset.equalsIgnoreCase("validate")) {
      testSentencePairs = readSentencePairs(basePath+"/trial", Integer.MAX_VALUE);
      testAlignments = readAlignments(basePath+"/trial/trial.wa");
    } else if (dataset.equals("miniTest") || dataset.equals("mini")) {
      testSentencePairs = readSentencePairs(basePath+"/mini", Integer.MAX_VALUE);
      testAlignments = readAlignments(basePath+"/mini/mini.wa");
    } else {
      throw new RuntimeException("Bad data set mode: "+ dataset+", use validate or miniTest.");
    }
    trainingSentencePairs.addAll(testSentencePairs);

    // Build model
    WordAligner wordAligner = null;
    if (model.equalsIgnoreCase("baseline")) {
      wordAligner = new BaselineWordAligner();
    }
    // TODO : build other alignment models
    else if (model.equalsIgnoreCase("heuristic")) {
    	wordAligner = new HeuristicWordAligner(trainingSentencePairs);
    }
    else if (model.equalsIgnoreCase("dice")) {
    	wordAligner = new DiceWordAligner(trainingSentencePairs);
    }
    else if (model.equalsIgnoreCase("ibm1") || model.equalsIgnoreCase("ibmModel1")) {
    	wordAligner = new IBMmodel1WordAligner(trainingSentencePairs, maxIterations, initialize);
    }
    else if (model.equalsIgnoreCase("ibm2") || model.equalsIgnoreCase("ibmModel2")) {
    	wordAligner = new IBMmodel2WordAligner(trainingSentencePairs, maxIterations, initialize);
    }

    // Test model
    test(wordAligner, testSentencePairs, testAlignments, verbose);
    
    // Generate file for submission //can comment out if not ready for submission
    testSentencePairs = readSentencePairs(basePath+"/test", Integer.MAX_VALUE);
    predict(wordAligner, testSentencePairs, basePath+"/"+model+".out");
  }