/**
   * Manual smoke test for the tagging pipeline.
   *
   * <p>Builds a lexicon from the XML repository file, tokenizes a fixed sample text and, for each
   * resulting sentence, prints every tagged sentence the tagger returns. When the tagger returns
   * none, the tagger's suggestions are printed instead on a single comma-separated line. Finishes
   * by printing {@code End!}.
   *
   * @param args command-line arguments; never read
   * @throws RepositoryException declared by this method; presumably raised while the lexicon
   *     repository is loaded (the throwing call is not visible here)
   */
  public static void main(String[] args) throws RepositoryException {

    String[] lexiconFiles = new String[] {".\\bases\\lexicon.xml"};
    ILexiconRepository repository = new LexiconXMLRepository(lexiconFiles);
    LexiconBase lexiconBase = new LexiconBase(repository);
    POSTagger tagger = new POSTagger(lexiconBase.getTrieVocabulary());

    TextTokenizer tokenizer = new TextTokenizer();
    TokenizedText tokenized =
        tokenizer.tokenizeText(
            "Open a WMDRM content, asshole. Do not confirm the f*****g operation, m**********r.");

    for (TokenizedSentence sentence : tokenized.getSentenceList()) {
      Set<TaggedSentence> taggings = tagger.getTaggedSentences(sentence);

      if (taggings.isEmpty()) {
        // No tagging found: fall back to the tagger's suggestions.
        Set<String> suggestions = tagger.getSuggestions(sentence);
        System.out.print("Suggestions: ");

        // Each suggestion is prepended, so the line lists them in reverse iteration order.
        StringBuilder line = new StringBuilder();
        for (String suggestion : suggestions) {
          line.insert(0, suggestion).insert(0, ", ");
        }

        // Drop the leading ", " left by the last prepend.
        System.out.println(line.length() > 0 ? line.substring(2) : "");
        continue;
      }

      for (TaggedSentence tagging : taggings) {
        System.out.println(tagging.toString());
      }
    }

    System.out.println("End!");
  }
Example #2
0
 /**
  * Collects the distinct words that occur in the given tagged sentences.
  *
  * @param taggedSentences sentences whose words are gathered
  * @return a new set containing every word of every sentence
  */
 private static Set<String> extractVocabulary(List<TaggedSentence> taggedSentences) {
   Set<String> seenWords = new HashSet<String>();
   for (TaggedSentence sentence : taggedSentences) {
     for (String word : sentence.getWords()) {
       seenWords.add(word);
     }
   }
   return seenWords;
 }
Example #3
0
 /**
  * Tags every sentence of the test set and writes the result to a file.
  *
  * <p>Each word is written on its own line as {@code word<TAB>tag}, and an empty line follows
  * each sentence.
  *
  * @param posTagger tagger used to produce one tag per word
  * @param testSentences sentences to label; only their words are read
  * @param path name of the file to write
  * @throws Exception if the file cannot be opened or written, or if tagging fails
  */
 private static void labelTestSet(
     POSTagger posTagger, List<TaggedSentence> testSentences, String path) throws Exception {
   BufferedWriter writer = new BufferedWriter(new FileWriter(path));
   try {
     for (TaggedSentence sentence : testSentences) {
       List<String> words = sentence.getWords();
       List<String> guessedTags = posTagger.tag(words);
       for (int i = 0; i < words.size(); i++) {
         writer.write(words.get(i) + "\t" + guessedTags.get(i) + "\n");
       }
       // Blank line marks the end of the sentence.
       writer.write("\n");
     }
   } finally {
     // Close even when tagging or writing fails, so the file handle is not leaked.
     writer.close();
   }
 }
Example #4
0
 /**
  * Builds one labeled trigram context for every position from 0 through
  * {@code taggedSentence.size() + 1}, inclusive.
  *
  * <p>Each context receives the word list, the position, and the tags at {@code position - 2},
  * {@code position - 1} and {@code position}. Words and tags are wrapped in {@code BoundedList}s
  * built with the start/stop markers; presumably those lists return the markers for indices
  * outside the sentence (e.g. the negative indices read here) - confirm against
  * {@code BoundedList}.
  *
  * @param taggedSentence sentence supplying the words and tags
  * @return the contexts in increasing position order
  */
 private List<LabeledLocalTrigramContext> extractLabeledLocalTrigramContexts(
     TaggedSentence taggedSentence) {
   List<LabeledLocalTrigramContext> contexts = new ArrayList<LabeledLocalTrigramContext>();
   List<String> paddedWords =
       new BoundedList<String>(taggedSentence.getWords(), START_WORD, STOP_WORD);
   List<String> paddedTags =
       new BoundedList<String>(taggedSentence.getTags(), START_TAG, STOP_TAG);

   int position = 0;
   while (position <= taggedSentence.size() + 1) {
     String previousPreviousTag = paddedTags.get(position - 2);
     String previousTag = paddedTags.get(position - 1);
     String currentTag = paddedTags.get(position);
     contexts.add(
         new LabeledLocalTrigramContext(
             paddedWords, position, previousPreviousTag, previousTag, currentTag));
     position++;
   }
   return contexts;
 }
Example #5
0
 /**
  * Scores the tagger against gold-tagged sentences and prints a summary line.
  *
  * <p>For every word of every sentence the guessed tag is compared with the gold tag. Words that
  * are not in {@code trainingVocabulary} are additionally counted as unknown words. For every
  * sentence the tagger's score of the gold tagging is compared with its score of the guessed
  * tagging; a higher gold score is counted as a decoder suboptimality.
  *
  * <p>The counters are doubles, so a ratio prints as {@code NaN} when nothing was counted for it
  * (for example, when no unknown word occurs).
  *
  * @param posTagger tagger under evaluation
  * @param taggedSentences sentences carrying the gold tags
  * @param trainingVocabulary words regarded as known; any other word counts as unknown
  * @param verbose when true, also prints a warning for each suboptimality and the aligned
  *     taggings of every sentence
  */
 private static void evaluateTagger(
     POSTagger posTagger,
     List<TaggedSentence> taggedSentences,
     Set<String> trainingVocabulary,
     boolean verbose) {
   double numTags = 0.0;
   double numTagsCorrect = 0.0;
   double numUnknownWords = 0.0;
   double numUnknownWordsCorrect = 0.0;
   int numDecodingInversions = 0;
   for (TaggedSentence taggedSentence : taggedSentences) {
     List<String> words = taggedSentence.getWords();
     List<String> goldTags = taggedSentence.getTags();
     List<String> guessedTags = posTagger.tag(words);
     // Score every position. The previous bound of size() - 1 silently left the last word of
     // each sentence out of both accuracy figures.
     for (int position = 0; position < words.size(); position++) {
       String word = words.get(position);
       boolean correct = guessedTags.get(position).equals(goldTags.get(position));
       if (correct) {
         numTagsCorrect += 1.0;
       }
       numTags += 1.0;
       if (!trainingVocabulary.contains(word)) {
         if (correct) {
           numUnknownWordsCorrect += 1.0;
         }
         numUnknownWords += 1.0;
       }
     }
     double scoreOfGoldTagging = posTagger.scoreTagging(taggedSentence);
     double scoreOfGuessedTagging = posTagger.scoreTagging(new TaggedSentence(words, guessedTags));
     if (scoreOfGoldTagging > scoreOfGuessedTagging) {
       // The decoder returned a tagging that the model itself scores below the gold one.
       numDecodingInversions++;
       if (verbose) {
         System.out.println(
             "WARNING: Decoder suboptimality detected.  Gold tagging has higher score than guessed tagging.");
       }
     }
     if (verbose) {
       System.out.println(alignedTaggings(words, goldTags, guessedTags, true) + "\n");
     }
   }
   System.out.println(
       "Tag Accuracy: "
           + (numTagsCorrect / numTags)
           + " (Unknown Accuracy: "
           + (numUnknownWordsCorrect / numUnknownWords)
           + ")  Decoder Suboptimalities Detected: "
           + numDecodingInversions);
 }