Пример #1
0
  /**
   * Command-line driver: loads the tagged corpora, trains a {@code POSTagger}, evaluates it on the
   * two dev sets, and writes predictions for the blind test set.
   *
   * <p>Recognized flags: {@code -path <dir>} selects the directory holding the data files
   * (defaults to {@code "."}); the presence of {@code -verbose} is forwarded to
   * {@code evaluateTagger}.
   *
   * @param args raw command-line arguments, handed to
   *     {@code CommandLineUtils.simpleCommandLineParser}
   * @throws Exception propagated unchanged from any of the helpers called here
   */
  public static void main(String[] args) throws Exception {
    // Turn the raw argument array into a flag -> value lookup.
    Map<String, String> flags = CommandLineUtils.simpleCommandLineParser(args);

    // "-path" overrides the data directory; otherwise the current directory is used.
    String dataDir = flags.containsKey("-path") ? flags.get("-path") : ".";
    System.out.println("Using base path: " + dataDir);

    // Only the presence of "-verbose" matters; any value attached to it is ignored.
    boolean printDetails = flags.containsKey("-verbose");

    // Load the four corpora. The boolean passed to readTaggedSentences is true for every file
    // except the blind test set -- presumably it signals whether gold tags are present; confirm
    // against readTaggedSentences.
    System.out.print("Loading training sentences...");
    List<TaggedSentence> trainingData = readTaggedSentences(dataDir + "/en-wsj-train.pos", true);
    // The vocabulary is taken from the training data only and is later passed to the evaluator.
    Set<String> knownWords = extractVocabulary(trainingData);
    System.out.println("done.");

    System.out.print("Loading in-domain dev sentences...");
    List<TaggedSentence> inDomainDev = readTaggedSentences(dataDir + "/en-wsj-dev.pos", true);
    System.out.println("done.");

    System.out.print("Loading out-of-domain dev sentences...");
    List<TaggedSentence> outOfDomainDev =
        readTaggedSentences(dataDir + "/en-web-weblogs-dev.pos", true);
    System.out.println("done.");

    System.out.print("Loading out-of-domain blind test sentences...");
    List<TaggedSentence> blindTest = readTaggedSentences(dataDir + "/en-web-test.blind", false);
    System.out.println("done.");

    // Assemble the tagger from its two pluggable parts.
    // TODO : improve on the MostFrequentTagScorer
    LocalTrigramScorer scorer = new MostFrequentTagScorer(false);
    // TODO : improve on the GreedyDecoder
    TrellisDecoder<State> decoder = new GreedyDecoder<State>();
    POSTagger tagger = new POSTagger(scorer, decoder);

    // Fit on the training set, then give the tagger the in-domain dev set for validation.
    tagger.train(trainingData);
    tagger.validate(inDomainDev);

    // Report accuracy on both dev sets.
    System.out.println("Evaluating on in-domain data:.");
    evaluateTagger(tagger, inDomainDev, knownWords, printDetails);
    System.out.println("Evaluating on out-of-domain data:.");
    evaluateTagger(tagger, outOfDomainDev, knownWords, printDetails);

    // Write predictions for the blind test set next to the input data.
    labelTestSet(tagger, blindTest, dataDir + "/en-web-test.tagged");
  }
Пример #2
0
 /**
  * Tags every sentence with {@code posTagger}, compares the guesses against the gold tags, and
  * prints a one-line summary to standard output.
  *
  * <p>The summary reports overall per-token accuracy, accuracy restricted to tokens absent from
  * {@code trainingVocabulary}, and the number of sentences for which the tagger scored the gold
  * tagging strictly higher than its own guess. If no tokens (or no unknown tokens) were seen, the
  * corresponding ratio is {@code 0.0 / 0.0} and prints as {@code NaN}.
  *
  * @param posTagger tagger under evaluation
  * @param taggedSentences sentences with gold tags
  * @param trainingVocabulary words counted as known; any other word counts as unknown
  * @param verbose if true, also prints a warning per detected decoder suboptimality and the
  *     aligned gold/guessed taggings of every sentence
  */
 private static void evaluateTagger(
     POSTagger posTagger,
     List<TaggedSentence> taggedSentences,
     Set<String> trainingVocabulary,
     boolean verbose) {
   double numTags = 0.0;
   double numTagsCorrect = 0.0;
   double numUnknownWords = 0.0;
   double numUnknownWordsCorrect = 0.0;
   int numDecodingInversions = 0;
   for (TaggedSentence taggedSentence : taggedSentences) {
     List<String> words = taggedSentence.getWords();
     List<String> goldTags = taggedSentence.getTags();
     List<String> guessedTags = posTagger.tag(words);
     // Score every token. The loop previously stopped at words.size() - 1, which left the last
     // token of each sentence out of all four counters.
     for (int position = 0; position < words.size(); position++) {
       String word = words.get(position);
       boolean correct = guessedTags.get(position).equals(goldTags.get(position));
       numTags += 1.0;
       if (correct) {
         numTagsCorrect += 1.0;
       }
       // Unknown-word accuracy is tracked separately from the overall figure.
       if (!trainingVocabulary.contains(word)) {
         numUnknownWords += 1.0;
         if (correct) {
           numUnknownWordsCorrect += 1.0;
         }
       }
     }
     // If the tagger's own scoring prefers the gold tagging, the decoder failed to find the
     // best-scoring sequence for this sentence.
     double scoreOfGoldTagging = posTagger.scoreTagging(taggedSentence);
     double scoreOfGuessedTagging = posTagger.scoreTagging(new TaggedSentence(words, guessedTags));
     if (scoreOfGoldTagging > scoreOfGuessedTagging) {
       numDecodingInversions++;
       if (verbose) {
         System.out.println(
             "WARNING: Decoder suboptimality detected.  Gold tagging has higher score than guessed tagging.");
       }
     }
     if (verbose) {
       System.out.println(alignedTaggings(words, goldTags, guessedTags, true) + "\n");
     }
   }
   System.out.println(
       "Tag Accuracy: "
           + (numTagsCorrect / numTags)
           + " (Unknown Accuracy: "
           + (numUnknownWordsCorrect / numUnknownWords)
           + ")  Decoder Suboptimalities Detected: "
           + numDecodingInversions);
 }
Пример #3
0
 /**
  * Tags each test sentence and writes the result to {@code path}.
  *
  * <p>Output format: one {@code word<TAB>tag} line per token, followed by one empty line after
  * each sentence. The file is opened with {@code FileWriter}, so an existing file at
  * {@code path} is overwritten.
  *
  * @param posTagger tagger used to produce the tags
  * @param testSentences sentences to label; only their words are read
  * @param path destination file
  * @throws Exception if the file cannot be opened, written, or closed, or if tagging fails
  */
 private static void labelTestSet(
     POSTagger posTagger, List<TaggedSentence> testSentences, String path) throws Exception {
   BufferedWriter writer = new BufferedWriter(new FileWriter(path));
   try {
     for (TaggedSentence sentence : testSentences) {
       List<String> words = sentence.getWords();
       List<String> guessedTags = posTagger.tag(words);
       for (int i = 0; i < words.size(); i++) {
         writer.write(words.get(i) + "\t" + guessedTags.get(i) + "\n");
       }
       // An empty line marks the end of the sentence.
       writer.write("\n");
     }
   } finally {
     // Close even when tagging or writing throws, so the file handle is not leaked.
     writer.close();
   }
 }