/* Returns a smoothed estimate of P(word|tag) */ public double scoreTagging(String word, String tag) { double p_tag = tagCounter.getCount(tag) / totalTokens; double c_word = wordCounter.getCount(word); double c_tag_and_word = wordToTagCounters.getCount(word, tag); if (c_word < 10) { // rare or unknown c_word += 1.0; c_tag_and_word += typeTagCounter.getCount(tag) / totalWordTypes; } double p_word = (1.0 + c_word) / (totalTokens + totalWordTypes); double p_tag_given_word = c_tag_and_word / c_word; return p_tag_given_word / p_tag * p_word; }
/* A builds PCFG using the observed counts of binary and unary * productions in the training trees to estimate the probabilities * for those rules. */ public Grammar(List<Tree<String>> trainTrees) { Counter<UnaryRule> unaryRuleCounter = new Counter<UnaryRule>(); Counter<BinaryRule> binaryRuleCounter = new Counter<BinaryRule>(); Counter<String> symbolCounter = new Counter<String>(); for (Tree<String> trainTree : trainTrees) { tallyTree(trainTree, symbolCounter, unaryRuleCounter, binaryRuleCounter); } for (UnaryRule unaryRule : unaryRuleCounter.keySet()) { double unaryProbability = unaryRuleCounter.getCount(unaryRule) / symbolCounter.getCount(unaryRule.getParent()); unaryRule.setScore(unaryProbability); addUnary(unaryRule); } for (BinaryRule binaryRule : binaryRuleCounter.keySet()) { double binaryProbability = binaryRuleCounter.getCount(binaryRule) / symbolCounter.getCount(binaryRule.getParent()); binaryRule.setScore(binaryProbability); addBinary(binaryRule); } }