/**
 * Builds the set of distinct words that occur anywhere in the given sentences.
 *
 * @param taggedSentences sentences whose words are collected
 * @return a new mutable set containing every word returned by {@code getWords()} of each
 *     sentence, with duplicates collapsed
 */
private static Set<String> extractVocabulary(List<TaggedSentence> taggedSentences) {
  Set<String> seenWords = new HashSet<String>();
  for (TaggedSentence sentence : taggedSentences) {
    for (String word : sentence.getWords()) {
      seenWords.add(word);
    }
  }
  return seenWords;
}
/**
 * Tags every test sentence with the given tagger and writes the result to a file.
 *
 * <p>Output format: one {@code word<TAB>tag} pair per line, with an empty line after each
 * sentence.
 *
 * @param posTagger tagger used to predict a tag sequence for each sentence's words
 * @param testSentences sentences to label; only their words are read
 * @param path path of the file to (over)write
 * @throws Exception if opening, writing, or closing the file fails, or if the tagger throws
 */
private static void labelTestSet(
    POSTagger posTagger, List<TaggedSentence> testSentences, String path) throws Exception {
  BufferedWriter writer = new BufferedWriter(new FileWriter(path));
  try {
    for (TaggedSentence sentence : testSentences) {
      List<String> words = sentence.getWords();
      // Assumes the tagger returns exactly one tag per input word -- TODO confirm.
      List<String> guessedTags = posTagger.tag(words);
      for (int i = 0; i < words.size(); i++) {
        writer.write(words.get(i) + "\t" + guessedTags.get(i) + "\n");
      }
      // Blank line separates consecutive sentences.
      writer.write("\n");
    }
  } finally {
    // Always release the file handle, even when tagging or writing fails part-way through.
    writer.close();
  }
}
/**
 * Produces one labeled local trigram context for each position of the sentence, from position
 * 0 up to and including {@code taggedSentence.size() + 1}.
 *
 * <p>Each context is built from the boundary-wrapped word list, the position, and the tags at
 * {@code position - 2}, {@code position - 1} and {@code position} of the boundary-wrapped tag
 * list.
 *
 * @param taggedSentence sentence supplying the words and tags
 * @return the contexts in increasing position order
 */
private List<LabeledLocalTrigramContext> extractLabeledLocalTrigramContexts(
    TaggedSentence taggedSentence) {
  // NOTE(review): BoundedList presumably yields the START/STOP symbol for indices outside the
  // wrapped list (the loop below reads index -2 and indices past the end) -- confirm.
  List<String> paddedWords =
      new BoundedList<String>(taggedSentence.getWords(), START_WORD, STOP_WORD);
  List<String> paddedTags =
      new BoundedList<String>(taggedSentence.getTags(), START_TAG, STOP_TAG);

  List<LabeledLocalTrigramContext> contexts = new ArrayList<LabeledLocalTrigramContext>();
  for (int i = 0; i <= taggedSentence.size() + 1; i++) {
    String tagTwoBack = paddedTags.get(i - 2);
    String tagOneBack = paddedTags.get(i - 1);
    String tagHere = paddedTags.get(i);
    LabeledLocalTrigramContext context =
        new LabeledLocalTrigramContext(paddedWords, i, tagTwoBack, tagOneBack, tagHere);
    contexts.add(context);
  }
  return contexts;
}
/**
 * Runs the tagger over the given sentences and prints accuracy statistics to standard output.
 *
 * <p>Three figures are reported: the fraction of tokens whose guessed tag equals the gold tag,
 * the same fraction restricted to words absent from {@code trainingVocabulary}, and the number
 * of sentences for which the tagger's own score of the gold tagging exceeds its score of the
 * guessed tagging. Either fraction prints as {@code NaN} when its denominator is zero (no
 * tokens, or no unknown words).
 *
 * @param posTagger tagger under evaluation; used both to tag and to score taggings
 * @param taggedSentences gold-standard sentences to evaluate on
 * @param trainingVocabulary words considered "known"; any other word counts as unknown
 * @param verbose if true, also prints a warning for each sentence where the gold tagging
 *     outscores the guess, and the aligned taggings of every sentence
 */
private static void evaluateTagger(
    POSTagger posTagger,
    List<TaggedSentence> taggedSentences,
    Set<String> trainingVocabulary,
    boolean verbose) {
  double numTags = 0.0;
  double numTagsCorrect = 0.0;
  double numUnknownWords = 0.0;
  double numUnknownWordsCorrect = 0.0;
  int numDecodingInversions = 0;

  for (TaggedSentence taggedSentence : taggedSentences) {
    List<String> words = taggedSentence.getWords();
    List<String> goldTags = taggedSentence.getTags();
    List<String> guessedTags = posTagger.tag(words);

    // Visit every position, including the last, so that the final token of each sentence
    // (and the only token of a one-word sentence) contributes to the statistics.
    for (int position = 0; position < words.size(); position++) {
      String word = words.get(position);
      String goldTag = goldTags.get(position);
      String guessedTag = guessedTags.get(position);
      boolean correct = guessedTag.equals(goldTag);

      if (correct) {
        numTagsCorrect += 1.0;
      }
      numTags += 1.0;

      if (!trainingVocabulary.contains(word)) {
        if (correct) {
          numUnknownWordsCorrect += 1.0;
        }
        numUnknownWords += 1.0;
      }
    }

    // If the model scores the gold tagging above its own output, the decoder failed to find
    // the highest-scoring tagging for this sentence.
    double scoreOfGoldTagging = posTagger.scoreTagging(taggedSentence);
    double scoreOfGuessedTagging =
        posTagger.scoreTagging(new TaggedSentence(words, guessedTags));
    if (scoreOfGoldTagging > scoreOfGuessedTagging) {
      numDecodingInversions++;
      if (verbose) {
        System.out.println(
            "WARNING: Decoder suboptimality detected. Gold tagging has higher score than guessed tagging.");
      }
    }

    if (verbose) {
      System.out.println(alignedTaggings(words, goldTags, guessedTags, true) + "\n");
    }
  }

  System.out.println(
      "Tag Accuracy: "
          + (numTagsCorrect / numTags)
          + " (Unknown Accuracy: "
          + (numUnknownWordsCorrect / numUnknownWords)
          + ") Decoder Suboptimalities Detected: "
          + numDecodingInversions);
}