/**
 * Return the probability (as a real number between 0 and 1) of stopping rather than
 * generating another argument at this position.
 *
 * @param dependency The dependency used as the basis for stopping on. Tags are assumed
 *     to be in the TagProjection space.
 * @return The probability of generating a stop at this position
 */
protected double getStopProb(IntDependency dependency) {
  short binDistance = distanceBin(dependency.distance);
  IntTaggedWord unknownHead = new IntTaggedWord(-1, dependency.head.tag);
  IntTaggedWord anyHead = new IntTaggedWord(ANY_WORD_INT, dependency.head.tag);

  IntDependency temp = new IntDependency(dependency.head, stopTW, dependency.leftHeaded, binDistance);
  double c_stop_hTWds = stopCounter.getCount(temp);
  temp = new IntDependency(unknownHead, stopTW, dependency.leftHeaded, binDistance);
  double c_stop_hTds = stopCounter.getCount(temp);
  temp = new IntDependency(dependency.head, wildTW, dependency.leftHeaded, binDistance);
  double c_hTWds = stopCounter.getCount(temp);
  temp = new IntDependency(anyHead, wildTW, dependency.leftHeaded, binDistance);
  double c_hTds = stopCounter.getCount(temp);

  double p_stop_hTds = (c_hTds > 0.0 ? c_stop_hTds / c_hTds : 1.0);
  double pb_stop_hTWds = (c_stop_hTWds + smooth_stop * p_stop_hTds) / (c_hTWds + smooth_stop);

  if (verbose) {
    System.out.println("  c_stop_hTWds: " + c_stop_hTWds + "; c_hTWds: " + c_hTWds
        + "; c_stop_hTds: " + c_stop_hTds + "; c_hTds: " + c_hTds);
    System.out.println("  Generate STOP prob: " + pb_stop_hTWds);
  }
  return pb_stop_hTWds;
}
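/*
 * Minimal sketch of the smoothing used above (hypothetical counts, not from any real
 * run): the word-specific stop estimate is Bayes-smoothed toward the tag-level backoff,
 * which itself defaults to 1.0 (always stop) when the tag context is unseen. With
 * c_stop_hTWds = 3, c_hTWds = 4, c_stop_hTds = 300, c_hTds = 1000, smooth_stop = 8:
 *   p_stop_hTds   = 300 / 1000 = 0.3
 *   pb_stop_hTWds = (3 + 8 * 0.3) / (4 + 8) = 5.4 / 12 = 0.45
 */
static double smoothedStopProb(double cStopHeadWord, double cHeadWord,
    double cStopHeadTag, double cHeadTag, double smoothStop) {
  // back off to the tag-only MLE; default to 1.0 (always stop) when unseen
  double backoff = (cHeadTag > 0.0) ? cStopHeadTag / cHeadTag : 1.0;
  return (cStopHeadWord + smoothStop * backoff) / (cHeadWord + smoothStop);
}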
private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException {
  stream.defaultReadObject();
  // System.err.println("Before decompression:");
  // System.err.println("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
  // System.err.println("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());

  ClassicCounter<IntDependency> compressedArgC = argCounter;
  argCounter = new ClassicCounter<IntDependency>();
  ClassicCounter<IntDependency> compressedStopC = stopCounter;
  stopCounter = new ClassicCounter<IntDependency>();

  for (IntDependency d : compressedArgC.keySet()) {
    double count = compressedArgC.getCount(d);
    expandArg(d, d.distance, count);
  }
  for (IntDependency d : compressedStopC.keySet()) {
    double count = compressedStopC.getCount(d);
    expandStop(d, d.distance, count, false);
  }

  // System.err.println("After decompression:");
  // System.err.println("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
  // System.err.println("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());
  expandDependencyMap = null;
}
/** Writes out data from this object to the PrintWriter {@code out}, one rule per line. */
@Override
public void writeData(PrintWriter out) throws IOException {
  for (IntDependency dependency : argCounter.keySet()) {
    if (dependency.head != wildTW && dependency.arg != wildTW
        && dependency.head.word != -1 && dependency.arg.word != -1) {
      double count = argCounter.getCount(dependency);
      out.println(dependency.toString(wordIndex, tagIndex) + " " + count);
    }
  }
  out.println("BEGIN_STOP");
  for (IntDependency dependency : stopCounter.keySet()) {
    if (dependency.head.word != -1) {
      double count = stopCounter.getCount(dependency);
      out.println(dependency.toString(wordIndex, tagIndex) + " " + count);
    }
  }
  out.flush();
}
/**
 * Evaluates how many words (= terminals) in a collection of trees are covered by the lexicon.
 * The first argument is the collection of trees; the second through fourth arguments collect
 * the results. Currently unused; this probably only works when training and testing in the
 * same run, so that the {@code tags} and {@code words} variables are initialized.
 */
public double evaluateCoverage(Collection<Tree> trees, Set<String> missingWords,
    Set<String> missingTags, Set<IntTaggedWord> missingTW) {
  List<IntTaggedWord> iTW1 = new ArrayList<IntTaggedWord>();
  for (Tree t : trees) {
    iTW1.addAll(treeToEvents(t));
  }
  int total = 0;
  int unseen = 0;
  for (IntTaggedWord itw : iTW1) {
    total++;
    if (!words.contains(new IntTaggedWord(itw.word(), nullTag))) {
      missingWords.add(wordIndex.get(itw.word()));
    }
    if (!tags.contains(new IntTaggedWord(nullWord, itw.tag()))) {
      missingTags.add(tagIndex.get(itw.tag()));
    }
    // if (!rules.contains(itw)) {
    if (seenCounter.getCount(itw) == 0.0) {
      unseen++;
      missingTW.add(itw);
    }
  }
  return (double) unseen / total;
}
private static <T> void display(ClassicCounter<T> c, PrintWriter pw) {
  List<T> cats = new ArrayList<>(c.keySet());
  Collections.sort(cats, Counters.toComparatorDescending(c));
  for (T ob : cats) {
    pw.println(ob + " " + c.getCount(ob));
  }
}
/**
 * Records how likely it is for a word with one tag to also have another tag.
 * This won't work after serialization/deserialization, but that is how it is
 * currently called.
 */
void buildPT_T() {
  int numTags = tagIndex.size();
  m_TT = new double[numTags][numTags];
  m_T = new double[numTags];
  double[] tmp = new double[numTags];
  for (IntTaggedWord word : words) {
    double tot = 0.0;
    for (int t = 0; t < numTags; t++) {
      IntTaggedWord iTW = new IntTaggedWord(word.word, t);
      tmp[t] = seenCounter.getCount(iTW);
      tot += tmp[t];
    }
    if (tot < 10) {
      // only use words seen often enough to give a reliable tag distribution
      continue;
    }
    for (int t = 0; t < numTags; t++) {
      for (int t2 = 0; t2 < numTags; t2++) {
        if (tmp[t2] > 0.0) {
          double c = tmp[t] / tot;
          m_T[t] += c;
          m_TT[t2][t] += c;
        }
      }
    }
  }
}
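/*
 * Minimal sketch (hypothetical helper, not in the original class): this is the ratio
 * that score() inlines when smartMutation is on. Given the m_TT / m_T arrays built
 * above, it approximates the chance that a word bearing tag `source` also bears tag
 * `target`, i.e. the tag "mutation" rate used to spread probability mass across tags.
 */
double mutationRate(int target, int source) {
  // guard against tags never accumulated in buildPT_T()
  return (m_T[source] > 0.0) ? m_TT[target][source] / m_T[source] : 0.0;
}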
private static <T> void display(ClassicCounter<T> c, int num, PrintWriter pw) {
  List<T> rules = new ArrayList<>(c.keySet());
  Collections.sort(rules, Counters.toComparatorDescending(c));
  int rSize = rules.size();
  if (num > rSize) {
    num = rSize;
  }
  for (int i = 0; i < num; i++) {
    pw.println(rules.get(i) + " " + c.getCount(rules.get(i)));
  }
}
public double countHistory(IntDependency dependency) {
  IntDependency temp = new IntDependency(
      dependency.head.word,
      tagBin(dependency.head.tag),
      wildTW.word,
      wildTW.tag,
      dependency.leftHeaded,
      valenceBin(dependency.distance));
  return argCounter.getCount(temp);
}
/**
 * Combines feature values that share the same name by summing their values,
 * returning the result sorted by feature name.
 *
 * @param <T> The type of the feature name
 */
public static <T> List<FeatureValue<T>> combine(Collection<FeatureValue<T>> featureValues) {
  ClassicCounter<T> counter = new ClassicCounter<T>();
  for (FeatureValue<T> fv : featureValues) {
    counter.incrementCount(fv.name, fv.value);
  }
  Set<T> keys = new TreeSet<T>(counter.keySet());
  List<FeatureValue<T>> featureList = new ArrayList<FeatureValue<T>>(keys.size());
  for (T key : keys) {
    featureList.add(new FeatureValue<T>(key, counter.getCount(key)));
  }
  return featureList;
}
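/*
 * Hypothetical usage sketch for combine(): duplicate feature names are summed and the
 * result comes back sorted by name (TreeSet natural ordering). The FeatureValue(name,
 * value) constructor assumed here mirrors the fields the method reads; it is an
 * assumption, not confirmed API.
 */
static void combineDemo() {
  List<FeatureValue<String>> raw = Arrays.asList(
      new FeatureValue<String>("lm", 0.5),
      new FeatureValue<String>("tm", 1.0),
      new FeatureValue<String>("lm", 0.25));
  List<FeatureValue<String>> merged = combine(raw);
  // merged now holds [lm=0.75, tm=1.0]
  System.out.println(merged);
}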
/**
 * Generate the possible taggings for a word at a sentence position. This may either be based
 * on a strict lexicon or an expanded generous set of possible taggings.
 *
 * <p><i>Implementation note:</i> Expanded sets of possible taggings are calculated dynamically
 * at runtime, so as to reduce the memory used by the lexicon (a space/time tradeoff).
 *
 * @param word The word (as an int)
 * @param loc Its index in the sentence (usually only relevant for unknown words)
 * @param featureSpec Feature specification; not used in this implementation
 * @return A list of possible taggings
 */
public Iterator<IntTaggedWord> ruleIteratorByWord(int word, int loc, String featureSpec) {
  // if (rulesWithWord == null) { // tested in isKnown already
  //   initRulesWithWord();
  // }
  List<IntTaggedWord> wordTaggings;
  if (isKnown(word)) {
    if (!flexiTag) {
      // strict lexical tagging for seen items
      wordTaggings = rulesWithWord[word];
    } else {
      /* Allow all tags with the same basicCategory */
      /* Allow all scored taggings, unless the word is very common */
      IntTaggedWord iW = new IntTaggedWord(word, nullTag);
      if (seenCounter.getCount(iW) > smoothInUnknownsThreshold) {
        return rulesWithWord[word].iterator();
      } else {
        // give it flexible tagging, not just the lexicon's
        wordTaggings = new ArrayList<IntTaggedWord>(40);
        for (IntTaggedWord iTW2 : tags) {
          IntTaggedWord iTW = new IntTaggedWord(word, iTW2.tag);
          if (score(iTW, loc, wordIndex.get(word)) > Float.NEGATIVE_INFINITY) {
            wordTaggings.add(iTW);
          }
        }
      }
    }
  } else {
    // we copy the list so we can insert the correct word in each item
    wordTaggings = new ArrayList<IntTaggedWord>(40);
    for (IntTaggedWord iTW : rulesWithWord[wordIndex.indexOf(UNKNOWN_WORD)]) {
      wordTaggings.add(new IntTaggedWord(word, iTW.tag));
    }
  }
  if (DEBUG_LEXICON) {
    EncodingPrintWriter.err.println(
        "Lexicon: " + wordIndex.get(word)
            + " (" + (isKnown(word) ? "known" : "unknown")
            + ", loc=" + loc
            + ", n=" + (isKnown(word) ? word : wordIndex.indexOf(UNKNOWN_WORD)) + ") "
            + (flexiTag ? "flexi" : "lexicon")
            + " taggings: " + wordTaggings,
        "UTF-8");
  }
  return wordTaggings.iterator();
}
private void writeObject(ObjectOutputStream stream) throws IOException {
  // System.err.println("\nBefore compression:");
  // System.err.println("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
  // System.err.println("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());

  ClassicCounter<IntDependency> fullArgCounter = argCounter;
  argCounter = new ClassicCounter<IntDependency>();
  for (IntDependency dependency : fullArgCounter.keySet()) {
    if (dependency.head != wildTW && dependency.arg != wildTW
        && dependency.head.word != -1 && dependency.arg.word != -1) {
      argCounter.incrementCount(dependency, fullArgCounter.getCount(dependency));
    }
  }

  ClassicCounter<IntDependency> fullStopCounter = stopCounter;
  stopCounter = new ClassicCounter<IntDependency>();
  for (IntDependency dependency : fullStopCounter.keySet()) {
    if (dependency.head.word != -1) {
      stopCounter.incrementCount(dependency, fullStopCounter.getCount(dependency));
    }
  }

  // System.err.println("After compression:");
  // System.err.println("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
  // System.err.println("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());

  stream.defaultWriteObject();

  argCounter = fullArgCounter;
  stopCounter = fullStopCounter;
}
/**
 * Writes out data from this object to the Writer {@code w}. Entries are written one per
 * line, with fields separated by spaces.
 */
public void writeData(Writer w) throws IOException {
  PrintWriter out = new PrintWriter(w);
  for (IntTaggedWord itw : seenCounter.keySet()) {
    out.println(itw.toLexicalEntry(wordIndex, tagIndex) + " SEEN " + seenCounter.getCount(itw));
  }
  for (IntTaggedWord itw : getUnknownWordModel().unSeenCounter().keySet()) {
    out.println(itw.toLexicalEntry(wordIndex, tagIndex) + " UNSEEN "
        + getUnknownWordModel().unSeenCounter().getCount(itw));
  }
  for (int i = 0; i < smooth.length; i++) {
    out.println("smooth[" + i + "] = " + smooth[i]);
  }
  out.flush();
}
@Override
public DependencyGrammar formResult() {
  wordIndex.indexOf(Lexicon.UNKNOWN_WORD, true);
  MLEDependencyGrammar dg = new MLEDependencyGrammar(
      tlpParams, directional, useDistance, useCoarseDistance,
      basicCategoryTagsInDependencyGrammar, op, wordIndex, tagIndex);
  for (IntDependency dependency : dependencyCounter.keySet()) {
    dg.addRule(dependency, dependencyCounter.getCount(dependency));
  }
  return dg;
}
/** Trains this unknown word model (UWM) on a single TaggedWord. */
public void train(TaggedWord tw, int loc, double weight) {
  IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex);
  IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag);
  IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag);
  seenCounter.incrementCount(iW, weight);
  IntTaggedWord i = NULL_ITW;

  if (treesRead > indexToStartUnkCounting) {
    // start doing this once some way through the trees; treesRead is 1-based counting
    if (seenCounter.getCount(iW) < 1.5) {
      // it's an entirely unknown word (this is its first occurrence)
      int s = model.getSignatureIndex(iTW.word, loc, wordIndex.get(iTW.word));
      if (DOCUMENT_UNKNOWNS) {
        String wStr = wordIndex.get(iTW.word);
        String tStr = tagIndex.get(iTW.tag);
        String sStr = wordIndex.get(s);
        EncodingPrintWriter.err.println(
            "Unknown word/tag/sig:\t" + wStr + '\t' + tStr + '\t' + sStr, "UTF-8");
      }
      IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag);
      IntTaggedWord iS = new IntTaggedWord(s, nullTag);
      unSeenCounter.incrementCount(iTS, weight);
      unSeenCounter.incrementCount(iT, weight);
      unSeenCounter.incrementCount(iS, weight);
      unSeenCounter.incrementCount(i, weight);
      // rules.add(iTS);
      // sigs.add(iS);
    }
    // else if (seenCounter.getCount(iTW) < 2) {
    //   it's a new tag for a known word; do nothing for now
    // }
  }
}
/**
 * Checks whether a word is in the lexicon. This version works even while compiling the
 * lexicon with current counters (rather than using the compiled rulesWithWord array).
 *
 * <p>TODO: The previous version would insert rules into the wordNumberer. Is that the
 * desired behavior? Why not test in some way that doesn't affect the index? For example,
 * start by testing wordIndex.contains(word).
 *
 * @param word The word as a String
 * @return Whether the word is in the lexicon
 */
public boolean isKnown(String word) {
  if (!wordIndex.contains(word)) {
    return false;
  }
  IntTaggedWord iW = new IntTaggedWord(wordIndex.indexOf(word), nullTag);
  return seenCounter.getCount(iW) > 0.0;
}
/**
 * Get the score of this word with this tag (as an IntTaggedWord) at this location.
 * (Presumably an estimate of P(word | tag).)
 *
 * <p><i>Implementation documentation:</i>
 * Seen:
 *   c_W = count(W)
 *   c_TW = count(T,W)
 *   c_T = count(T)
 *   c_Tunseen = count(T) among new words in 2nd half
 *   total = count(seen words)
 *   totalUnseen = count("unseen" words)
 *   p_T_U = Pmle(T|"unseen")
 *   pb_T_W = P(T|W). If (c_W > smoothInUnknownsThreshold), = c_TW/c_W;
 *            else (if not smart mutation) pb_T_W = Bayes prior smooth[1] with p_T_U
 *   p_T = Pmle(T)
 *   p_W = Pmle(W)
 *   pb_W_T = log(pb_T_W * p_W / p_T) [Bayes rule]
 * Note that this doesn't really properly reserve mass to unknowns.
 *
 * <p>Unseen:
 *   c_TS = count(T,Sig|Unseen)
 *   c_S = count(Sig)
 *   c_T = count(T|Unseen)
 *   c_U = totalUnseen above
 *   p_T_U = Pmle(T|Unseen)
 *   pb_T_S = Bayes smooth of Pmle(T|S) with P(T|Unseen) [smooth[0]]
 *   pb_W_T = log(P(W|T)) inverted
 *
 * @param iTW An IntTaggedWord pairing a word and POS tag
 * @param loc The position in the sentence. <i>In the default implementation this is used
 *     only for unknown words to change their probability distribution when sentence initial.</i>
 * @return A float score, usually log P(word|tag)
 */
public float score(IntTaggedWord iTW, int loc, String word) {
  // both actual
  double c_TW = seenCounter.getCount(iTW);
  // double x_TW = xferCounter.getCount(iTW);

  IntTaggedWord temp = new IntTaggedWord(iTW.word, nullTag);
  // word counts
  double c_W = seenCounter.getCount(temp);
  // double x_W = xferCounter.getCount(temp);

  // totals
  double total = seenCounter.getCount(NULL_ITW);
  double totalUnseen = uwModel.unSeenCounter().getCount(NULL_ITW);

  temp = new IntTaggedWord(nullWord, iTW.tag);
  // tag counts
  double c_T = seenCounter.getCount(temp);
  double c_Tunseen = uwModel.unSeenCounter().getCount(temp);

  double pb_W_T; // always set below

  if (DEBUG_LEXICON) {
    // dump info about the last word
    if (iTW.word != debugLastWord) {
      if (debugLastWord >= 0 && debugPrefix != null) {
        // the 2nd conjunct in the test above handles older serialized files
        EncodingPrintWriter.err.println(debugPrefix + debugProbs + debugNoProbs, "UTF-8");
      }
    }
  }

  boolean seen = (c_W > 0.0);

  if (seen) {
    // known word model for P(T|W)
    if (DEBUG_LEXICON_SCORE) {
      System.err.println("Lexicon.score " + wordIndex.get(iTW.word) + "/"
          + tagIndex.get(iTW.tag) + " as known word.");
    }

    // c_TW = Math.sqrt(c_TW); [cdm: funny math scaling? dunno who played with this]
    // c_TW += 0.5;

    double p_T_U;
    if (useSignatureForKnownSmoothing) { // only works for English currently
      p_T_U = getUnknownWordModel().scoreProbTagGivenWordSignature(iTW, loc, smooth[0], word);
      if (DEBUG_LEXICON_SCORE) {
        System.err.println("With useSignatureForKnownSmoothing, P(T|U) is " + p_T_U
            + " rather than " + (c_Tunseen / totalUnseen));
      }
    } else {
      p_T_U = c_Tunseen / totalUnseen;
    }
    double pb_T_W; // always set below

    if (DEBUG_LEXICON_SCORE) {
      System.err.println("c_W is " + c_W + " mle = " + (c_TW / c_W)
          + " smoothInUnknownsThresh is " + smoothInUnknownsThreshold
          + " base p_T_U is " + c_Tunseen + "/" + totalUnseen + " = " + p_T_U);
    }
    if (c_W > smoothInUnknownsThreshold && c_TW > 0.0 && c_W > 0.0) {
      // we've seen the word enough times to have confidence in its tagging
      pb_T_W = c_TW / c_W;
    } else {
      // we haven't seen the word enough times to have confidence in its tagging
      if (smartMutation) {
        int numTags = tagIndex.size();
        if (m_TT == null || numTags != m_T.length) {
          buildPT_T();
        }
        p_T_U *= 0.1;
        // System.out.println("Checking " + iTW);
        for (int t = 0; t < numTags; t++) {
          IntTaggedWord iTW2 = new IntTaggedWord(iTW.word, t);
          double p_T_W2 = seenCounter.getCount(iTW2) / c_W;
          if (p_T_W2 > 0) {
            // System.out.println(" Observation of " + tagIndex.get(t) + " ("
            //     + seenCounter.getCount(iTW2) + ") mutated to "
            //     + tagIndex.get(iTW.tag) + " at rate " + (m_TT[tag][t] / m_T[t]));
            p_T_U += p_T_W2 * m_TT[iTW.tag][t] / m_T[t] * 0.9;
          }
        }
      }
      if (DEBUG_LEXICON_SCORE) {
        System.err.println("c_TW = " + c_TW + " c_W = " + c_W + " p_T_U = " + p_T_U);
      }
      // double pb_T_W = (c_TW + smooth[1] * x_TW) / (c_W + smooth[1] * x_W);
      pb_T_W = (c_TW + smooth[1] * p_T_U) / (c_W + smooth[1]);
    }
    double p_T = (c_T / total);
    double p_W = (c_W / total);
    pb_W_T = Math.log(pb_T_W * p_W / p_T);

    if (DEBUG_LEXICON) {
      if (iTW.word != debugLastWord) {
        debugLastWord = iTW.word;
        debugLoc = loc;
        debugProbs = new StringBuilder();
        debugNoProbs = new StringBuilder("impossible: ");
        debugPrefix = "Lexicon: " + wordIndex.get(debugLastWord) + " (known): ";
      }
      if (pb_W_T > Double.NEGATIVE_INFINITY) {
        NumberFormat nf = NumberFormat.getNumberInstance();
        nf.setMaximumFractionDigits(3);
        debugProbs.append(tagIndex.get(iTW.tag) + ": cTW=" + c_TW + " c_T=" + c_T
            + " pb_T_W=" + nf.format(pb_T_W) + " log pb_W_T=" + nf.format(pb_W_T) + ", ");
        // debugProbs.append("\n" + "smartMutation=" + smartMutation
        //     + " smoothInUnknownsThreshold=" + smoothInUnknownsThreshold
        //     + " smooth0=" + smooth[0] + " smooth1=" + smooth[1]
        //     + " p_T_U=" + p_T_U + " c_W=" + c_W);
      } else {
        debugNoProbs.append(tagIndex.get(iTW.tag)).append(' ');
      }
    } // end if (DEBUG_LEXICON)

  } else { // when unseen
    if (loc >= 0) {
      pb_W_T = getUnknownWordModel().score(iTW, loc, c_T, total, smooth[0], word);
    } else {
      // for negative loc we now do a weighted average for the dependency grammar :-)
      double pb_W0_T = getUnknownWordModel().score(iTW, 0, c_T, total, smooth[0], word);
      double pb_W1_T = getUnknownWordModel().score(iTW, 1, c_T, total, smooth[0], word);
      pb_W_T = Math.log((Math.exp(pb_W0_T) + 2 * Math.exp(pb_W1_T)) / 3);
    }
  }

  // categorical cutoff if the score is too low
  if (pb_W_T > -100.0) {
    return (float) pb_W_T;
  }
  return Float.NEGATIVE_INFINITY;
} // end score()
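/*
 * Minimal sketch (hypothetical counts, not from any real run) of the Bayes inversion
 * used above for seen words: P(W|T) = P(T|W) * P(W) / P(T). With c_TW = 8, c_W = 10,
 * c_T = 100, total = 1000, and c_W above the smoothing threshold:
 *   pb_T_W = 8 / 10 = 0.8
 *   p_W    = 10 / 1000 = 0.01
 *   p_T    = 100 / 1000 = 0.1
 *   pb_W_T = log(0.8 * 0.01 / 0.1) = log(0.08) ~= -2.526
 */
static double bayesInvert(double cTW, double cW, double cT, double total) {
  double pTW = cTW / cW;          // P(T|W), MLE when the word is frequent enough
  double pW = cW / total;         // P(W)
  double pT = cT / total;         // P(T)
  return Math.log(pTW * pW / pT); // log P(W|T) by Bayes' rule
}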
/**
 * Calculate the probability of a dependency as a real probability between 0 and 1 inclusive.
 *
 * @param dependency The dependency for which the probability is to be calculated. The tags in
 *     this dependency are in the reduced TagProjection space.
 * @return The probability of the dependency
 */
protected double probTB(IntDependency dependency) {
  if (verbose) {
    // System.out.println("tagIndex: " + tagIndex);
    System.err.println("Generating " + dependency);
  }

  boolean leftHeaded = dependency.leftHeaded && directional;
  int hW = dependency.head.word;
  int aW = dependency.arg.word;
  short hT = dependency.head.tag;
  short aT = dependency.arg.tag;

  IntTaggedWord aTW = dependency.arg;
  IntTaggedWord hTW = dependency.head;

  boolean isRoot = rootTW(dependency.head);
  double pb_stop_hTWds;
  if (isRoot) {
    pb_stop_hTWds = 0.0;
  } else {
    pb_stop_hTWds = getStopProb(dependency);
  }

  if (dependency.arg.word == STOP_WORD_INT) {
    // did we generate stop?
    return pb_stop_hTWds;
  }

  double pb_go_hTWds = 1.0 - pb_stop_hTWds;

  // generate the argument
  short binDistance = valenceBin(dependency.distance);

  // KEY:
  // c_  count of (read as joint count of first and second)
  // p_  MLE prob of (or MAP if useSmoothTagProjection)
  // pb_ MAP prob of (read as prob of first given second thing)
  // a   arg
  // h   head
  // T   tag
  // PT  projected tag
  // W   word
  // d   direction
  // ds  distance (implicit: there when direction is mentioned!)

  IntTaggedWord anyHead = new IntTaggedWord(ANY_WORD_INT, dependency.head.tag);
  IntTaggedWord anyArg = new IntTaggedWord(ANY_WORD_INT, dependency.arg.tag);
  IntTaggedWord anyTagArg = new IntTaggedWord(dependency.arg.word, ANY_TAG_INT);

  IntDependency temp = new IntDependency(dependency.head, dependency.arg, leftHeaded, binDistance);
  double c_aTW_hTWd = argCounter.getCount(temp);
  temp = new IntDependency(dependency.head, anyArg, leftHeaded, binDistance);
  double c_aT_hTWd = argCounter.getCount(temp);
  temp = new IntDependency(dependency.head, wildTW, leftHeaded, binDistance);
  double c_hTWd = argCounter.getCount(temp);

  temp = new IntDependency(anyHead, dependency.arg, leftHeaded, binDistance);
  double c_aTW_hTd = argCounter.getCount(temp);
  temp = new IntDependency(anyHead, anyArg, leftHeaded, binDistance);
  double c_aT_hTd = argCounter.getCount(temp);
  temp = new IntDependency(anyHead, wildTW, leftHeaded, binDistance);
  double c_hTd = argCounter.getCount(temp);

  // for smooth tag projection
  short aPT = Short.MIN_VALUE;
  double c_aPTW_hPTd = Double.NaN;
  double c_aPT_hPTd = Double.NaN;
  double c_hPTd = Double.NaN;
  double c_aPTW_aPT = Double.NaN;
  double c_aPT = Double.NaN;

  if (useSmoothTagProjection) {
    aPT = tagProject(dependency.arg.tag);
    short hPT = tagProject(dependency.head.tag);

    IntTaggedWord projectedArg = new IntTaggedWord(dependency.arg.word, aPT);
    IntTaggedWord projectedAnyHead = new IntTaggedWord(ANY_WORD_INT, hPT);
    IntTaggedWord projectedAnyArg = new IntTaggedWord(ANY_WORD_INT, aPT);

    temp = new IntDependency(projectedAnyHead, projectedArg, leftHeaded, binDistance);
    c_aPTW_hPTd = argCounter.getCount(temp);
    temp = new IntDependency(projectedAnyHead, projectedAnyArg, leftHeaded, binDistance);
    c_aPT_hPTd = argCounter.getCount(temp);
    temp = new IntDependency(projectedAnyHead, wildTW, leftHeaded, binDistance);
    c_hPTd = argCounter.getCount(temp);

    temp = new IntDependency(wildTW, projectedArg, false, ANY_DISTANCE_INT);
    c_aPTW_aPT = argCounter.getCount(temp);
    temp = new IntDependency(wildTW, projectedAnyArg, false, ANY_DISTANCE_INT);
    c_aPT = argCounter.getCount(temp);
  }

  // a wild head is always directionless and doesn't use distance
  temp = new IntDependency(wildTW, dependency.arg, false, ANY_DISTANCE_INT);
  double c_aTW = argCounter.getCount(temp);
  temp = new IntDependency(wildTW, anyArg, false, ANY_DISTANCE_INT);
  double c_aT = argCounter.getCount(temp);
  temp = new IntDependency(wildTW, anyTagArg, false, ANY_DISTANCE_INT);
  double c_aW = argCounter.getCount(temp);

  // do the Bayesian magic
  // MLE probs
  double p_aTW_hTd;
  double p_aT_hTd;
  double p_aTW_aT;
  double p_aW;
  double p_aPTW_aPT;
  double p_aPTW_hPTd;
  double p_aPT_hPTd;

  // the backoffs are either MLE or themselves Bayesian-smoothed, depending on useSmoothTagProjection
  if (useSmoothTagProjection) {
    if (useUnigramWordSmoothing) {
      p_aW = c_aW > 0.0 ? (c_aW / numWordTokens) : 1.0; // NEED this 1.0 for unknown words!!!
      p_aPTW_aPT = (c_aPTW_aPT + smooth_aPTW_aPT * p_aW) / (c_aPT + smooth_aPTW_aPT);
    } else {
      p_aPTW_aPT = c_aPTW_aPT > 0.0 ? (c_aPTW_aPT / c_aPT) : 1.0; // NEED this 1.0 for unknown words!!!
    }
    p_aTW_aT = (c_aTW + smooth_aTW_aT * p_aPTW_aPT) / (c_aT + smooth_aTW_aT);

    p_aPTW_hPTd = c_hPTd > 0.0 ? (c_aPTW_hPTd / c_hPTd) : 0.0;
    p_aTW_hTd = (c_aTW_hTd + smooth_aTW_hTd * p_aPTW_hPTd) / (c_hTd + smooth_aTW_hTd);

    p_aPT_hPTd = c_hPTd > 0.0 ? (c_aPT_hPTd / c_hPTd) : 0.0;
    p_aT_hTd = (c_aT_hTd + smooth_aT_hTd * p_aPT_hPTd) / (c_hTd + smooth_aT_hTd);
  } else {
    // here word generation isn't smoothed - can't get a previously unseen word with a tag. Ugh.
    if (op.testOptions.useLexiconToScoreDependencyPwGt) {
      // We don't know the position. Now -1 means average over 0 and 1.
      p_aTW_aT = dependency.leftHeaded
          ? Math.exp(lex.score(dependency.arg, 1, wordIndex.get(dependency.arg.word)))
          : Math.exp(lex.score(dependency.arg, -1, wordIndex.get(dependency.arg.word)));
      // double oldScore = c_aTW > 0.0 ? (c_aTW / c_aT) : 1.0;
      // if (oldScore == 1.0) {
      //   System.err.println("#### arg=" + dependency.arg + " score=" + p_aTW_aT
      //       + " oldScore=" + oldScore + " c_aTW=" + c_aTW + " c_aW=" + c_aW);
      // }
    } else {
      p_aTW_aT = c_aTW > 0.0 ? (c_aTW / c_aT) : 1.0;
    }

    p_aTW_hTd = c_hTd > 0.0 ? (c_aTW_hTd / c_hTd) : 0.0;
    p_aT_hTd = c_hTd > 0.0 ? (c_aT_hTd / c_hTd) : 0.0;
  }

  double pb_aTW_hTWd = (c_aTW_hTWd + smooth_aTW_hTWd * p_aTW_hTd) / (c_hTWd + smooth_aTW_hTWd);
  double pb_aT_hTWd = (c_aT_hTWd + smooth_aT_hTWd * p_aT_hTd) / (c_hTWd + smooth_aT_hTWd);

  double score = (interp * pb_aTW_hTWd + (1.0 - interp) * p_aTW_aT * pb_aT_hTWd) * pb_go_hTWds;

  if (verbose) {
    NumberFormat nf = NumberFormat.getNumberInstance();
    nf.setMaximumFractionDigits(2);
    if (useSmoothTagProjection) {
      if (useUnigramWordSmoothing) {
        System.err.println("  c_aW=" + c_aW + ", numWordTokens=" + numWordTokens
            + ", p(aW)=" + nf.format(p_aW));
      }
      System.err.println("  c_aPTW_aPT=" + c_aPTW_aPT + ", c_aPT=" + c_aPT
          + ", smooth_aPTW_aPT=" + smooth_aPTW_aPT + ", p(aPTW|aPT)=" + nf.format(p_aPTW_aPT));
    }
    System.err.println("  c_aTW=" + c_aTW + ", c_aT=" + c_aT
        + ", smooth_aTW_aT=" + smooth_aTW_aT + ", ## p(aTW|aT)=" + nf.format(p_aTW_aT));
    if (useSmoothTagProjection) {
      System.err.println("  c_aPTW_hPTd=" + c_aPTW_hPTd + ", c_hPTd=" + c_hPTd
          + ", p(aPTW|hPTd)=" + nf.format(p_aPTW_hPTd));
    }
    System.err.println("  c_aTW_hTd=" + c_aTW_hTd + ", c_hTd=" + c_hTd
        + ", smooth_aTW_hTd=" + smooth_aTW_hTd + ", p(aTW|hTd)=" + nf.format(p_aTW_hTd));
    if (useSmoothTagProjection) {
      System.err.println("  c_aPT_hPTd=" + c_aPT_hPTd + ", c_hPTd=" + c_hPTd
          + ", p(aPT|hPTd)=" + nf.format(p_aPT_hPTd));
    }
    System.err.println("  c_aT_hTd=" + c_aT_hTd + ", c_hTd=" + c_hTd
        + ", smooth_aT_hTd=" + smooth_aT_hTd + ", p(aT|hTd)=" + nf.format(p_aT_hTd));
    System.err.println("  c_aTW_hTWd=" + c_aTW_hTWd + ", c_hTWd=" + c_hTWd
        + ", smooth_aTW_hTWd=" + smooth_aTW_hTWd + ", ## p(aTW|hTWd)=" + nf.format(pb_aTW_hTWd));
    System.err.println("  c_aT_hTWd=" + c_aT_hTWd + ", c_hTWd=" + c_hTWd
        + ", smooth_aT_hTWd=" + smooth_aT_hTWd + ", ## p(aT|hTWd)=" + nf.format(pb_aT_hTWd));
    System.err.println("  interp=" + interp + ", prescore="
        + nf.format(interp * pb_aTW_hTWd + (1.0 - interp) * p_aTW_aT * pb_aT_hTWd)
        + ", P(go|hTWds)=" + nf.format(pb_go_hTWds) + ", score=" + nf.format(score));
  }

  if (op.testOptions.prunePunc && pruneTW(aTW)) {
    return 1.0;
  }

  if (Double.isNaN(score)) {
    score = 0.0;
  }
  // if (op.testOptions.rightBonus && !dependency.leftHeaded) {
  //   score -= 0.2;
  // }

  if (score < MIN_PROBABILITY) {
    score = 0.0;
  }

  return score;
}
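/*
 * Minimal sketch of the final interpolation in probTB (hypothetical values, not a
 * definitive restatement): the dependency probability mixes the fully lexicalized
 * estimate P(aTW|hTWd) with the factored backoff P(aTW|aT) * P(aT|hTWd), then
 * multiplies by the probability of not stopping. With interp = 0.33,
 * pb_aTW_hTWd = 0.02, p_aTW_aT = 0.1, pb_aT_hTWd = 0.15, pb_go_hTWds = 0.55:
 *   score = (0.33 * 0.02 + 0.67 * 0.1 * 0.15) * 0.55 ~= 0.0092
 */
static double interpolatedDependencyScore(double interp, double pbFull,
    double pWordGivenTag, double pbTagBackoff, double pGo) {
  return (interp * pbFull + (1.0 - interp) * pWordGivenTag * pbTagBackoff) * pGo;
}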