/*
 * Recursively walks a tree and records, for every internal node, the rule
 * rooted at that node together with one occurrence of the node's label.
 *
 * Leaves and pre-terminal nodes are skipped entirely (no counts, no
 * recursion). A node with one child contributes a unary rule, a node with
 * two children contributes a binary rule, and any other child count is
 * rejected with a RuntimeException before any of its children are visited.
 */
private void tallyTree(
    Tree<String> tree,
    Counter<String> symbolCounter,
    Counter<UnaryRule> unaryRuleCounter,
    Counter<BinaryRule> binaryRuleCounter) {
  if (tree.isLeaf() || tree.isPreTerminal()) {
    return;
  }

  final int arity = tree.getChildren().size();
  switch (arity) {
    case 1: {
      // Build the rule first so a failure there leaves the counters untouched.
      UnaryRule rule = makeUnaryRule(tree);
      symbolCounter.incrementCount(tree.getLabel(), 1.0);
      unaryRuleCounter.incrementCount(rule, 1.0);
      break;
    }
    case 2: {
      BinaryRule rule = makeBinaryRule(tree);
      symbolCounter.incrementCount(tree.getLabel(), 1.0);
      binaryRuleCounter.incrementCount(rule, 1.0);
      break;
    }
    default:
      throw new RuntimeException(
          "Attempted to construct a Grammar with an illegal tree: " + tree);
  }

  // Descend only after this node has been tallied successfully.
  for (Tree<String> subtree : tree.getChildren()) {
    tallyTree(subtree, symbolCounter, unaryRuleCounter, binaryRuleCounter);
  }
}
/*
 * Records one observation of `word` carrying `tag`.
 *
 * If the word has not been seen before (per isKnown), the word-type total
 * and the per-tag type counter are bumped as well. The token total, tag
 * counter, word counter and word-to-tag counter are always incremented.
 */
private void tallyTagging(String word, String tag) {
  // Must be evaluated before wordCounter is updated below, otherwise every
  // word would already look known.
  final boolean firstSighting = !isKnown(word);
  if (firstSighting) {
    typeTagCounter.incrementCount(tag, 1.0);
    totalWordTypes += 1.0;
  }

  wordToTagCounters.incrementCount(word, tag, 1.0);
  wordCounter.incrementCount(word, 1.0);
  tagCounter.incrementCount(tag, 1.0);
  totalTokens += 1.0;
}
/* Returns a smoothed estimate of P(word|tag) */ public double scoreTagging(String word, String tag) { double p_tag = tagCounter.getCount(tag) / totalTokens; double c_word = wordCounter.getCount(word); double c_tag_and_word = wordToTagCounters.getCount(word, tag); if (c_word < 10) { // rare or unknown c_word += 1.0; c_tag_and_word += typeTagCounter.getCount(tag) / totalWordTypes; } double p_word = (1.0 + c_word) / (totalTokens + totalWordTypes); double p_tag_given_word = c_tag_and_word / c_word; return p_tag_given_word / p_tag * p_word; }
/*
 * Builds a PCFG from the training trees: unary and binary productions are
 * counted across all trees, and each rule is scored with its count divided
 * by the count of its parent symbol before being added to the grammar.
 */
public Grammar(List<Tree<String>> trainTrees) {
  Counter<String> parentCounts = new Counter<String>();
  Counter<UnaryRule> unaryCounts = new Counter<UnaryRule>();
  Counter<BinaryRule> binaryCounts = new Counter<BinaryRule>();

  // Pass 1: collect raw rule and parent-symbol counts from every tree.
  for (Tree<String> tree : trainTrees) {
    tallyTree(tree, parentCounts, unaryCounts, binaryCounts);
  }

  // Pass 2: turn counts into relative frequencies and register the rules.
  for (UnaryRule rule : unaryCounts.keySet()) {
    final double probability =
        unaryCounts.getCount(rule) / parentCounts.getCount(rule.getParent());
    rule.setScore(probability);
    addUnary(rule);
  }

  for (BinaryRule rule : binaryCounts.keySet()) {
    final double probability =
        binaryCounts.getCount(rule) / parentCounts.getCount(rule.getParent());
    rule.setScore(probability);
    addBinary(rule);
  }
}
/*
 * Reports whether `word` is currently a key of the word counter.
 */
public boolean isKnown(String word) {
  final boolean seen = wordCounter.keySet().contains(word);
  return seen;
}
/*
 * Returns the key set of the tag counter, i.e. every tag it holds a count for.
 *
 * NOTE(review): the set is handed out as returned by the counter, not
 * copied -- presumably callers treat it as read-only; confirm.
 */
public Set<String> getAllTags() {
  final Set<String> tags = tagCounter.keySet();
  return tags;
}