/**
 * Command-line entry point for the tagger experiment.
 *
 * <p>Reads the training, in-domain dev, out-of-domain dev and blind test files from the base
 * path, trains a {@code POSTagger}, lets it validate on the in-domain dev set, reports
 * evaluation results for both dev sets, and finally writes predicted tags for the blind test
 * set to {@code en-web-test.tagged} under the base path.
 *
 * <p>Recognised flags:
 * <ul>
 *   <li>{@code -path <dir>}: directory holding the data files (defaults to {@code "."})</li>
 *   <li>{@code -verbose}: forwarded to the evaluation so individual errors are printed</li>
 * </ul>
 *
 * @param args raw command-line arguments
 * @throws Exception propagated from data loading or from writing the tagged test file
 */
public static void main(String[] args) throws Exception {
  // Parse command line flags and arguments.
  Map<String, String> flags = CommandLineUtils.simpleCommandLineParser(args);

  // The path to the assignment data; falls back to the current directory.
  String dataDir = flags.containsKey("-path") ? flags.get("-path") : ".";
  System.out.println("Using base path: " + dataDir);

  // Whether or not to print the individual errors (presence of the flag is enough).
  boolean verbose = flags.containsKey("-verbose");

  // Read in data.
  System.out.print("Loading training sentences...");
  List<TaggedSentence> trainSet = readTaggedSentences(dataDir + "/en-wsj-train.pos", true);
  Set<String> knownWords = extractVocabulary(trainSet);
  System.out.println("done.");

  System.out.print("Loading in-domain dev sentences...");
  List<TaggedSentence> devInSet = readTaggedSentences(dataDir + "/en-wsj-dev.pos", true);
  System.out.println("done.");

  System.out.print("Loading out-of-domain dev sentences...");
  List<TaggedSentence> devOutSet =
      readTaggedSentences(dataDir + "/en-web-weblogs-dev.pos", true);
  System.out.println("done.");

  System.out.print("Loading out-of-domain blind test sentences...");
  List<TaggedSentence> blindTestSet =
      readTaggedSentences(dataDir + "/en-web-test.blind", false);
  System.out.println("done.");

  // Construct tagger components.
  // TODO : improve on the MostFrequentTagScorer
  LocalTrigramScorer scorer = new MostFrequentTagScorer(false);
  // TODO : improve on the GreedyDecoder
  TrellisDecoder<State> decoder = new GreedyDecoder<State>();

  // Train tagger.
  POSTagger tagger = new POSTagger(scorer, decoder);
  tagger.train(trainSet);

  // Optionally tune hyperparameters on dev data.
  tagger.validate(devInSet);

  // Test tagger on both dev sets, then label the blind test set.
  System.out.println("Evaluating on in-domain data:.");
  evaluateTagger(tagger, devInSet, knownWords, verbose);

  System.out.println("Evaluating on out-of-domain data:.");
  evaluateTagger(tagger, devOutSet, knownWords, verbose);

  labelTestSet(tagger, blindTestSet, dataDir + "/en-web-test.tagged");
}
/**
 * Tags each sentence with {@code posTagger}, compares the guesses against the gold tags, and
 * prints a one-line summary to standard output.
 *
 * <p>The summary reports overall tag accuracy, accuracy restricted to words that are absent
 * from {@code trainingVocabulary}, and the number of sentences for which the gold tagging
 * received a strictly higher score from the tagger than the tagging it actually produced.
 * Either accuracy prints as {@code NaN} when its denominator is zero (no tokens, or no unknown
 * words), since the counts are kept as doubles.
 *
 * @param posTagger tagger under evaluation
 * @param taggedSentences gold-tagged sentences to evaluate on
 * @param trainingVocabulary words seen in training; anything else counts as an unknown word
 * @param verbose when true, prints a warning for each sentence where the gold tagging outscores
 *     the guess, and prints the aligned gold/guessed taggings for every sentence
 */
private static void evaluateTagger(
    POSTagger posTagger,
    List<TaggedSentence> taggedSentences,
    Set<String> trainingVocabulary,
    boolean verbose) {
  double numTags = 0.0;
  double numTagsCorrect = 0.0;
  double numUnknownWords = 0.0;
  double numUnknownWordsCorrect = 0.0;
  int numDecodingInversions = 0;

  for (TaggedSentence taggedSentence : taggedSentences) {
    List<String> words = taggedSentence.getWords();
    List<String> goldTags = taggedSentence.getTags();
    List<String> guessedTags = posTagger.tag(words);

    // Visit every token. The earlier bound of words.size() - 1 left the final token of each
    // sentence out of both the overall and the unknown-word counts.
    for (int position = 0; position < words.size(); position++) {
      String word = words.get(position);
      boolean correct = guessedTags.get(position).equals(goldTags.get(position));

      if (correct) {
        numTagsCorrect += 1.0;
      }
      numTags += 1.0;

      if (!trainingVocabulary.contains(word)) {
        if (correct) {
          numUnknownWordsCorrect += 1.0;
        }
        numUnknownWords += 1.0;
      }
    }

    // If the tagger itself scores the gold tagging above its own output, the decoder failed to
    // find the best-scoring sequence for this sentence.
    double scoreOfGoldTagging = posTagger.scoreTagging(taggedSentence);
    double scoreOfGuessedTagging =
        posTagger.scoreTagging(new TaggedSentence(words, guessedTags));
    if (scoreOfGoldTagging > scoreOfGuessedTagging) {
      numDecodingInversions++;
      if (verbose) {
        System.out.println(
            "WARNING: Decoder suboptimality detected. Gold tagging has higher score than guessed tagging.");
      }
    }

    if (verbose) {
      System.out.println(alignedTaggings(words, goldTags, guessedTags, true) + "\n");
    }
  }

  System.out.println(
      "Tag Accuracy: "
          + (numTagsCorrect / numTags)
          + " (Unknown Accuracy: "
          + (numUnknownWordsCorrect / numUnknownWords)
          + ") Decoder Suboptimalities Detected: "
          + numDecodingInversions);
}
/**
 * Tags every test sentence and writes the result to {@code path}.
 *
 * <p>Output format: one {@code word<TAB>tag} pair per line, with an empty line after each
 * sentence.
 *
 * @param posTagger tagger used to produce the tags
 * @param testSentences sentences to label; only their words are read
 * @param path destination file, created or overwritten
 * @throws Exception if the file cannot be opened or written, or if tagging fails
 */
private static void labelTestSet(
    POSTagger posTagger, List<TaggedSentence> testSentences, String path) throws Exception {
  BufferedWriter writer = new BufferedWriter(new FileWriter(path));
  try {
    for (TaggedSentence sentence : testSentences) {
      List<String> words = sentence.getWords();
      List<String> guessedTags = posTagger.tag(words);
      for (int i = 0; i < words.size(); i++) {
        writer.write(words.get(i) + "\t" + guessedTags.get(i) + "\n");
      }
      // Blank line marks the sentence boundary.
      writer.write("\n");
    }
  } finally {
    // Close (and thereby flush) even when tagging or writing throws, so the file handle is
    // never leaked.
    writer.close();
  }
}