private Tree<String> merge(Tree<String> leftTree, Tree<String> rightTree) { int span = leftTree.getYield().size() + rightTree.getYield().size(); String mostFrequentLabel = spanToCategories.getCounter(span).argMax(); List<Tree<String>> children = new ArrayList<Tree<String>>(); children.add(leftTree); children.add(rightTree); return new Tree<String>(mostFrequentLabel, children); }
private void tallyTagging(String word, String tag) { if (!isKnown(word)) { totalWordTypes += 1.0; typeTagCounter.incrementCount(tag, 1.0); } totalTokens += 1.0; tagCounter.incrementCount(tag, 1.0); wordCounter.incrementCount(word, 1.0); wordToTagCounters.incrementCount(word, tag, 1.0); }
private int tallySpans(Tree<String> tree, int start) { if (tree.isLeaf() || tree.isPreTerminal()) return 1; int end = start; for (Tree<String> child : tree.getChildren()) { int childSpan = tallySpans(child, end); end += childSpan; } String category = tree.getLabel(); if (!category.equals("ROOT")) spanToCategories.incrementCount(end - start, category, 1.0); return end - start; }
/* Returns a smoothed estimate of P(word|tag) */ public double scoreTagging(String word, String tag) { double p_tag = tagCounter.getCount(tag) / totalTokens; double c_word = wordCounter.getCount(word); double c_tag_and_word = wordToTagCounters.getCount(word, tag); if (c_word < 10) { // rare or unknown c_word += 1.0; c_tag_and_word += typeTagCounter.getCount(tag) / totalWordTypes; } double p_word = (1.0 + c_word) / (totalTokens + totalWordTypes); double p_tag_given_word = c_tag_and_word / c_word; return p_tag_given_word / p_tag * p_word; }