Пример #1
0
 /**
  * Returns a smoothed estimate of P(word|tag).
  *
  * <p>The value is assembled via Bayes' rule from three pieces, evaluated in the order
  * P(tag|word) / P(tag) * P(word). Words counted fewer than 10 times are treated as rare
  * or unknown: their word count is incremented by one and their (word, tag) count is
  * incremented by {@code typeTagCounter.getCount(tag) / totalWordTypes}.
  *
  * <p>NOTE(review): presumably {@code tagCounter} reports zero for a tag that was never
  * observed; in that case the final division is by zero and the result is not finite.
  * Confirm that callers only pass observed tags.
  *
  * @param word the word being scored
  * @param tag the candidate tag for {@code word}
  * @return the smoothed estimate of P(word|tag)
  */
 public double scoreTagging(String word, String tag) {
   double tagPrior = tagCounter.getCount(tag) / totalTokens;
   double wordCount = wordCounter.getCount(word);
   double jointCount = wordToTagCounters.getCount(word, tag);

   final boolean rareOrUnknown = wordCount < 10;
   if (rareOrUnknown) {
     // Smooth both counts: one extra occurrence of the word, plus a fractional
     // (word, tag) occurrence derived from typeTagCounter.
     wordCount = wordCount + 1.0;
     jointCount = jointCount + typeTagCounter.getCount(tag) / totalWordTypes;
   }

   double wordPrior = (1.0 + wordCount) / (totalTokens + totalWordTypes);
   double tagGivenWord = jointCount / wordCount;

   // Keep the left-to-right order (divide, then multiply) so rounding is unchanged.
   return tagGivenWord / tagPrior * wordPrior;
 }
Пример #2
0
 /**
  * Builds a PCFG from the training trees, using the observed counts of unary and binary
  * productions to estimate the probability of each rule.
  *
  * <p>Every tree is passed to {@code tallyTree} together with three counters (symbols,
  * unary rules, binary rules). Each rule found in a rule counter is then scored with its
  * own count divided by the symbol count of its parent, and registered through
  * {@code addUnary} or {@code addBinary}.
  *
  * @param trainTrees the training trees from which the counts are gathered
  */
 public Grammar(List<Tree<String>> trainTrees) {
   Counter<UnaryRule> unaryTallies = new Counter<UnaryRule>();
   Counter<BinaryRule> binaryTallies = new Counter<BinaryRule>();
   Counter<String> symbolTallies = new Counter<String>();

   // Pass 1: accumulate counts over the whole training set.
   for (Tree<String> tree : trainTrees) {
     tallyTree(tree, symbolTallies, unaryTallies, binaryTallies);
   }

   // Pass 2: score each unary rule by its relative frequency and register it.
   for (UnaryRule rule : unaryTallies.keySet()) {
     rule.setScore(unaryTallies.getCount(rule) / symbolTallies.getCount(rule.getParent()));
     addUnary(rule);
   }

   // Pass 3: same treatment for the binary rules.
   for (BinaryRule rule : binaryTallies.keySet()) {
     rule.setScore(binaryTallies.getCount(rule) / symbolTallies.getCount(rule.getParent()));
     addBinary(rule);
   }
 }