private NaiveBayesClassifier<L, F> trainClassifier(int[][] data, int[] labels,
    int numFeatures, int numClasses, Index<L> labelIndex, Index<F> featureIndex) {
  Set<L> labelSet = Generics.newHashSet();
  NBWeights nbWeights = trainWeights(data, labels, numFeatures, numClasses);
  Counter<L> priors = new ClassicCounter<L>();
  double[] pr = nbWeights.priors;
  for (int i = 0; i < pr.length; i++) {
    priors.incrementCount(labelIndex.get(i), pr[i]);
    labelSet.add(labelIndex.get(i));
  }
  Counter<Pair<Pair<L, F>, Number>> weightsCounter =
      new ClassicCounter<Pair<Pair<L, F>, Number>>();
  double[][][] wts = nbWeights.weights;
  for (int c = 0; c < numClasses; c++) {
    L label = labelIndex.get(c);
    for (int f = 0; f < numFeatures; f++) {
      F feature = featureIndex.get(f);
      Pair<L, F> p = new Pair<L, F>(label, feature);
      for (int val = 0; val < wts[c][f].length; val++) {
        Pair<Pair<L, F>, Number> key =
            new Pair<Pair<L, F>, Number>(p, Integer.valueOf(val));
        weightsCounter.incrementCount(key, wts[c][f][val]);
      }
    }
  }
  return new NaiveBayesClassifier<L, F>(weightsCounter, priors, labelSet);
}
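The counter built above stores one weight per ((label, feature), featureValue) triple. A minimal sketch of reading such a weight back, mirroring the key construction in trainClassifier; the variables label, feature, and weightsCounter are illustrative stand-ins for values in scope, not part of the class API:

  // Illustrative only: look up the weight for feature value 1 of a
  // given (label, feature) pair.
  Pair<Pair<L, F>, Number> key =
      new Pair<Pair<L, F>, Number>(new Pair<L, F>(label, feature), Integer.valueOf(1));
  double weight = weightsCounter.getCount(key);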
/**
 * Evaluates how many words (= terminals) in a collection of trees are covered
 * by the lexicon. The first argument is the collection of trees; the second
 * through fourth arguments collect the results. Currently unused; this
 * probably only works if you train and test at the same time, so that the
 * tags and words variables are initialized.
 */
public double evaluateCoverage(Collection<Tree> trees, Set<String> missingWords,
    Set<String> missingTags, Set<IntTaggedWord> missingTW) {
  List<IntTaggedWord> iTW1 = new ArrayList<IntTaggedWord>();
  for (Tree t : trees) {
    iTW1.addAll(treeToEvents(t));
  }
  int total = 0;
  int unseen = 0;
  for (IntTaggedWord itw : iTW1) {
    total++;
    if (!words.contains(new IntTaggedWord(itw.word(), nullTag))) {
      missingWords.add(wordIndex.get(itw.word()));
    }
    if (!tags.contains(new IntTaggedWord(nullWord, itw.tag()))) {
      missingTags.add(tagIndex.get(itw.tag()));
    }
    // if (!rules.contains(itw)) {
    if (seenCounter.getCount(itw) == 0.0) {
      unseen++;
      missingTW.add(itw);
    }
  }
  return (double) unseen / total;
}
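A hedged usage sketch: the caller supplies empty mutable sets, which the method fills as a side effect while returning the fraction of unseen (word, tag) events. Here lex is assumed to be a trained lexicon and testTrees a Collection<Tree> of held-out trees:

  Set<String> missingWords = Generics.newHashSet();
  Set<String> missingTags = Generics.newHashSet();
  Set<IntTaggedWord> missingTW = Generics.newHashSet();
  double unseenRate = lex.evaluateCoverage(testTrees, missingWords, missingTags, missingTW);
  System.err.println("Unseen (word,tag) rate: " + unseenRate
      + "; " + missingWords.size() + " word types missing from lexicon");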
/**
 * Generate the possible taggings for a word at a sentence position. This may
 * either be based on a strict lexicon or an expanded, generous set of
 * possible taggings.
 *
 * <p><i>Implementation note:</i> Expanded sets of possible taggings are
 * calculated dynamically at runtime, so as to reduce the memory used by the
 * lexicon (a space/time tradeoff).
 *
 * @param word The word (as an int)
 * @param loc Its index in the sentence (usually only relevant for unknown words)
 * @return A list of possible taggings
 */
public Iterator<IntTaggedWord> ruleIteratorByWord(int word, int loc, String featureSpec) {
  // if (rulesWithWord == null) { // tested in isKnown already
  //   initRulesWithWord();
  // }
  List<IntTaggedWord> wordTaggings;
  if (isKnown(word)) {
    if (!flexiTag) {
      // strict lexical tagging for seen items
      wordTaggings = rulesWithWord[word];
    } else {
      // allow all tags with the same basicCategory;
      // allow all scored taggings, unless the word is very common
      IntTaggedWord iW = new IntTaggedWord(word, nullTag);
      if (seenCounter.getCount(iW) > smoothInUnknownsThreshold) {
        return rulesWithWord[word].iterator();
      } else {
        // give it flexible tagging, not just the lexicon's
        wordTaggings = new ArrayList<IntTaggedWord>(40);
        for (IntTaggedWord iTW2 : tags) {
          IntTaggedWord iTW = new IntTaggedWord(word, iTW2.tag);
          if (score(iTW, loc, wordIndex.get(word)) > Float.NEGATIVE_INFINITY) {
            wordTaggings.add(iTW);
          }
        }
      }
    }
  } else {
    // we copy the list so we can insert the correct word in each item
    wordTaggings = new ArrayList<IntTaggedWord>(40);
    for (IntTaggedWord iTW : rulesWithWord[wordIndex.indexOf(UNKNOWN_WORD)]) {
      wordTaggings.add(new IntTaggedWord(word, iTW.tag));
    }
  }
  if (DEBUG_LEXICON) {
    EncodingPrintWriter.err.println("Lexicon: " + wordIndex.get(word)
        + " (" + (isKnown(word) ? "known" : "unknown") + ", loc=" + loc
        + ", n=" + (isKnown(word) ? word : wordIndex.indexOf(UNKNOWN_WORD)) + ") "
        + (flexiTag ? "flexi" : "lexicon") + " taggings: " + wordTaggings, "UTF-8");
  }
  return wordTaggings.iterator();
}
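A minimal sketch of driving this iterator, assuming lex, wordIndex, and tagIndex are in scope and the probe word has already been added to the index (the word "walks" is a made-up example):

  int w = wordIndex.indexOf("walks"); // hypothetical known word
  for (Iterator<IntTaggedWord> it = lex.ruleIteratorByWord(w, 0, null); it.hasNext(); ) {
    IntTaggedWord itw = it.next();
    System.out.println(tagIndex.get(itw.tag()) + " scored "
        + lex.score(itw, 0, wordIndex.get(w)));
  }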
public SimpleSequence(int[] intElements, Index<T> index) {
  elements = new Object[intElements.length];
  for (int i = 0; i < intElements.length; i++) {
    elements[i] = index.get(intElements[i]);
  }
  start = 0;
  end = intElements.length;
}
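This constructor decodes an int-encoded sequence back into objects through an Index. A small hedged example; the ids are made up and wordIndex is assumed to be an Index<String> mapping them back to tokens:

  int[] ids = {0, 3, 7}; // hypothetical indexed tokens
  SimpleSequence<String> seq = new SimpleSequence<String>(ids, wordIndex);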
/**
 * Provides some testing and opportunities for exploration of the
 * probabilities of a BaseLexicon. What's here currently probably only works
 * for the English Penn Treebank, as it uses default constructors. Of the
 * words given to test on, the first is treated as sentence initial, and the
 * rest as not sentence initial.
 *
 * @param args The command line arguments:
 *     java BaseLexicon treebankPath fileRange unknownWordModel words*
 */
public static void main(String[] args) {
  if (args.length < 3) {
    System.err.println("java BaseLexicon treebankPath fileRange unknownWordModel words*");
    return;
  }
  System.out.print("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
  Treebank tb = new DiskTreebank();
  tb.loadPath(args[0], new NumberRangesFileFilter(args[1], true));
  // TODO: change this interface so the lexicon creates its own indices?
  Index<String> wordIndex = new HashIndex<String>();
  Index<String> tagIndex = new HashIndex<String>();
  BaseLexicon lex = new BaseLexicon(wordIndex, tagIndex);
  lex.getUnknownWordModel().setUnknownLevel(Integer.parseInt(args[2]));
  lex.train(tb);
  System.out.println("done.");
  System.out.println();
  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(4);
  List<String> impos = new ArrayList<String>();
  for (int i = 3; i < args.length; i++) {
    if (lex.isKnown(args[i])) {
      System.out.println(args[i]
          + " is a known word. Log probabilities [log P(w|t)] for its taggings are:");
      for (Iterator<IntTaggedWord> it =
              lex.ruleIteratorByWord(wordIndex.indexOf(args[i], true), i - 3, null);
          it.hasNext(); ) {
        IntTaggedWord iTW = it.next();
        System.out.println(StringUtils.pad(iTW, 24)
            + nf.format(lex.score(iTW, i - 3, wordIndex.get(iTW.word))));
      }
    } else {
      String sig = lex.getUnknownWordModel().getSignature(args[i], i - 3);
      System.out.println(args[i] + " is an unknown word. Signature with uwm "
          + lex.getUnknownWordModel().getUnknownLevel()
          + ((i == 3) ? " init" : " non-init") + " is: " + sig);
      impos.clear();
      List<String> lis = new ArrayList<String>(tagIndex.objectsList());
      Collections.sort(lis);
      for (String tStr : lis) {
        IntTaggedWord iTW = new IntTaggedWord(args[i], tStr, wordIndex, tagIndex);
        double score = lex.score(iTW, 1, args[i]);
        if (score == Float.NEGATIVE_INFINITY) {
          impos.add(tStr);
        } else {
          System.out.println(StringUtils.pad(iTW, 24) + nf.format(score));
        }
      }
      if (impos.size() > 0) {
        System.out.println(args[i] + " impossible tags: " + impos);
      }
    }
    System.out.println();
  }
}
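A hypothetical invocation matching the usage string above (the treebank path and file range are placeholders): train on the given file range with unknown-word model level 5, then probe one presumably known and one presumably unknown word:

  java BaseLexicon /path/to/treebank 200-270 5 the fooxyz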
private void populateTagsToBaseTags(TreebankLanguagePack tlp) {
  int total = tagIndex.size();
  tagsToBaseTags = new int[total];
  for (int i = 0; i < total; i++) {
    String tag = tagIndex.get(i);
    String baseTag = tlp.basicCategory(tag);
    int j = tagIndex.indexOf(baseTag, true);
    tagsToBaseTags[i] = j;
  }
}
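A hedged illustration of what the mapping does, assuming a Penn Treebank language pack where basicCategory strips functional annotation; note that indexOf(baseTag, true) adds the base tag to the index if it is not already present:

  // Hypothetical example: tlp.basicCategory("NP-TMP") returns "NP", so
  // tagsToBaseTags[tagIndex.indexOf("NP-TMP")] == tagIndex.indexOf("NP")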
private short tagProject(short tag) {
  if (smoothTPIndex == null) {
    smoothTPIndex = new HashIndex<String>(tagIndex);
  }
  if (tag < 0) {
    return tag;
  } else {
    String tagStr = smoothTPIndex.get(tag);
    String binStr = TP_PREFIX + smoothTP.project(tagStr);
    return (short) smoothTPIndex.indexOf(binStr, true);
  }
}
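The projected bin name is interned in the same growable index under a distinguishing prefix, so repeated projections of a tag return the same short index. The concrete strings below are made up; TP_PREFIX and smoothTP are whatever the enclosing class configured:

  // Illustrative only (hypothetical values): if TP_PREFIX is "TP:" and
  // smoothTP.project("VBZ") returns "VB", then tagProject yields the index
  // of "TP:VB" in smoothTPIndex, adding that entry on first use.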
public <F> double score(Classifier<L, F> classifier, GeneralDataset<L, F> data) {
  List<L> guesses = new ArrayList<L>();
  List<L> labels = new ArrayList<L>();
  for (int i = 0; i < data.size(); i++) {
    Datum<L, F> d = data.getRVFDatum(i);
    L guess = classifier.classOf(d);
    guesses.add(guess);
  }
  int[] labelsArr = data.getLabelsArray();
  labelIndex = data.labelIndex;
  for (int i = 0; i < data.size(); i++) {
    labels.add(labelIndex.get(labelsArr[i]));
  }
  // rebuild the label index so it covers both dataset and classifier labels
  labelIndex = new HashIndex<L>();
  labelIndex.addAll(data.labelIndex().objectsList());
  labelIndex.addAll(classifier.labels());
  int numClasses = labelIndex.size();
  tpCount = new int[numClasses];
  fpCount = new int[numClasses];
  fnCount = new int[numClasses];
  negIndex = labelIndex.indexOf(negLabel);
  for (int i = 0; i < guesses.size(); ++i) {
    L guess = guesses.get(i);
    int guessIndex = labelIndex.indexOf(guess);
    L label = labels.get(i);
    int trueIndex = labelIndex.indexOf(label);
    if (guessIndex == trueIndex) {
      if (guessIndex != negIndex) {
        tpCount[guessIndex]++;
      }
    } else {
      if (guessIndex != negIndex) {
        fpCount[guessIndex]++;
      }
      if (trueIndex != negIndex) {
        fnCount[trueIndex]++;
      }
    }
  }
  return getFMeasure();
}
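From the per-class counts accumulated above, the standard definitions apply; getFMeasure() is assumed to aggregate these per-class values (micro or macro averaging, depending on the rest of the class). For class $c$, with the negative label excluded as in the loop:

\[
P_c = \frac{tp_c}{tp_c + fp_c}, \qquad
R_c = \frac{tp_c}{tp_c + fn_c}, \qquad
F_1 = \frac{2\,P_c R_c}{P_c + R_c}
\]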
/** Print some statistics about this lexicon. */
public void printLexStats() {
  System.out.println("BaseLexicon statistics");
  System.out.println("unknownLevel is " + getUnknownWordModel().getUnknownLevel());
  // System.out.println("Rules size: " + rules.size());
  System.out.println("Sum of rulesWithWord: " + numRules());
  System.out.println("Tags size: " + tags.size());
  int wsize = words.size();
  System.out.println("Words size: " + wsize);
  // System.out.println("Unseen Sigs size: " + sigs.size() +
  //     " [number of unknown equivalence classes]");
  System.out.println("rulesWithWord length: " + rulesWithWord.length
      + " [should be sum of words + unknown sigs]");
  int[] lengths = new int[STATS_BINS];
  ArrayList<String>[] wArr = new ArrayList[STATS_BINS];
  for (int j = 0; j < STATS_BINS; j++) {
    wArr[j] = new ArrayList<String>();
  }
  for (int i = 0; i < rulesWithWord.length; i++) {
    int num = rulesWithWord[i].size();
    if (num > STATS_BINS - 1) {
      num = STATS_BINS - 1;
    }
    lengths[num]++;
    if (wsize <= 20 || num >= STATS_BINS / 2) {
      wArr[num].add(wordIndex.get(i));
    }
  }
  System.out.println("Stats on how many taggings for how many words");
  for (int j = 0; j < STATS_BINS; j++) {
    System.out.print(j + " taggings: " + lengths[j] + " words ");
    if (wsize <= 20 || j >= STATS_BINS / 2) {
      System.out.print(wArr[j]);
    }
    System.out.println();
  }
  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(0);
  System.out.println("Unseen counter: " + Counters.toString(uwModel.unSeenCounter(), nf));
  if (wsize < 50 && tags.size() < 10) {
    nf.setMaximumFractionDigits(3);
    StringWriter sw = new StringWriter();
    PrintWriter pw = new PrintWriter(sw);
    pw.println("Tagging probabilities log P(word|tag)");
    for (int t = 0; t < tags.size(); t++) {
      pw.print('\t');
      pw.print(tagIndex.get(t));
    }
    pw.println();
    for (int w = 0; w < wsize; w++) {
      pw.print(wordIndex.get(w));
      pw.print('\t');
      for (int t = 0; t < tags.size(); t++) {
        IntTaggedWord iTW = new IntTaggedWord(w, t);
        pw.print(nf.format(score(iTW, 1, wordIndex.get(w))));
        if (t == tags.size() - 1) {
          pw.println();
        } else {
          pw.print('\t');
        }
      }
    }
    pw.close();
    System.out.println(sw.toString());
  }
}
/**
 * Get the score of this word with this tag (as an IntTaggedWord) at this
 * location. (Presumably an estimate of P(word | tag).)
 *
 * <p><i>Implementation documentation:</i>
 * <pre>
 * Seen:
 *   c_W         = count(W)
 *   c_TW        = count(T,W)
 *   c_T         = count(T)
 *   c_Tunseen   = count(T) among new words in 2nd half
 *   total       = count(seen words)
 *   totalUnseen = count("unseen" words)
 *   p_T_U       = Pmle(T|"unseen")
 *   pb_T_W      = P(T|W); if (c_W > smoothInUnknownsThreshold) = c_TW/c_W,
 *                 else (if not smart mutation) Bayes-smooth with prior p_T_U
 *                 and weight smooth[1]
 *   p_T         = Pmle(T)
 *   p_W         = Pmle(W)
 *   pb_W_T      = log(pb_T_W * p_W / p_T)   [Bayes rule]
 * Note that this doesn't really properly reserve mass to unknowns.
 *
 * Unseen:
 *   c_TS   = count(T,Sig|Unseen)
 *   c_S    = count(Sig)
 *   c_T    = count(T|Unseen)
 *   c_U    = totalUnseen above
 *   p_T_U  = Pmle(T|Unseen)
 *   pb_T_S = Bayes smooth of Pmle(T|S) with P(T|Unseen) [weight smooth[0]]
 *   pb_W_T = log(P(W|T)) inverted
 * </pre>
 *
 * @param iTW An IntTaggedWord pairing a word and POS tag
 * @param loc The position in the sentence. <i>In the default implementation
 *     this is used only for unknown words to change their probability
 *     distribution when sentence initial.</i>
 * @return A float score, usually log P(word|tag)
 */
public float score(IntTaggedWord iTW, int loc, String word) {
  // both actual
  double c_TW = seenCounter.getCount(iTW);
  // double x_TW = xferCounter.getCount(iTW);
  IntTaggedWord temp = new IntTaggedWord(iTW.word, nullTag);
  // word counts
  double c_W = seenCounter.getCount(temp);
  // double x_W = xferCounter.getCount(temp);
  // totals
  double total = seenCounter.getCount(NULL_ITW);
  double totalUnseen = uwModel.unSeenCounter().getCount(NULL_ITW);
  temp = new IntTaggedWord(nullWord, iTW.tag);
  // tag counts
  double c_T = seenCounter.getCount(temp);
  double c_Tunseen = uwModel.unSeenCounter().getCount(temp);
  double pb_W_T; // always set below
  if (DEBUG_LEXICON) {
    // dump info about the last word
    if (iTW.word != debugLastWord) {
      if (debugLastWord >= 0 && debugPrefix != null) {
        // the 2nd conjunct in the test above handles older serialized files
        EncodingPrintWriter.err.println(debugPrefix + debugProbs + debugNoProbs, "UTF-8");
      }
    }
  }
  boolean seen = (c_W > 0.0);
  if (seen) {
    // known word model for P(T|W)
    if (DEBUG_LEXICON_SCORE) {
      System.err.println("Lexicon.score " + wordIndex.get(iTW.word) + "/"
          + tagIndex.get(iTW.tag) + " as known word.");
    }
    // c_TW = Math.sqrt(c_TW); [cdm: funny math scaling? dunno who played with this]
    // c_TW += 0.5;
    double p_T_U;
    if (useSignatureForKnownSmoothing) {
      // only works for English currently
      p_T_U = getUnknownWordModel().scoreProbTagGivenWordSignature(iTW, loc, smooth[0], word);
      if (DEBUG_LEXICON_SCORE) {
        System.err.println("With useSignatureForKnownSmoothing, P(T|U) is " + p_T_U
            + " rather than " + (c_Tunseen / totalUnseen));
      }
    } else {
      p_T_U = c_Tunseen / totalUnseen;
    }
    double pb_T_W; // always set below
    if (DEBUG_LEXICON_SCORE) {
      System.err.println("c_W is " + c_W + " mle = " + (c_TW / c_W)
          + " smoothInUnknownsThresh is " + smoothInUnknownsThreshold
          + " base p_T_U is " + c_Tunseen + "/" + totalUnseen + " = " + p_T_U);
    }
    if (c_W > smoothInUnknownsThreshold && c_TW > 0.0 && c_W > 0.0) {
      // we've seen the word enough times to have confidence in its tagging
      pb_T_W = c_TW / c_W;
    } else {
      // we haven't seen the word enough times to have confidence in its tagging
      if (smartMutation) {
        int numTags = tagIndex.size();
        if (m_TT == null || numTags != m_T.length) {
          buildPT_T();
        }
        p_T_U *= 0.1;
        // System.out.println("Checking " + iTW);
        for (int t = 0; t < numTags; t++) {
          IntTaggedWord iTW2 = new IntTaggedWord(iTW.word, t);
          double p_T_W2 = seenCounter.getCount(iTW2) / c_W;
          if (p_T_W2 > 0) {
            // System.out.println(" Observation of " + tagIndex.get(t) + " ("
            //     + seenCounter.getCount(iTW2) + ") mutated to "
            //     + tagIndex.get(iTW.tag) + " at rate " + (m_TT[tag][t] / m_T[t]));
            p_T_U += p_T_W2 * m_TT[iTW.tag][t] / m_T[t] * 0.9;
          }
        }
      }
      if (DEBUG_LEXICON_SCORE) {
        System.err.println("c_TW = " + c_TW + " c_W = " + c_W + " p_T_U = " + p_T_U);
      }
      // double pb_T_W = (c_TW + smooth[1] * x_TW) / (c_W + smooth[1] * x_W);
      pb_T_W = (c_TW + smooth[1] * p_T_U) / (c_W + smooth[1]);
    }
    double p_T = (c_T / total);
    double p_W = (c_W / total);
    pb_W_T = Math.log(pb_T_W * p_W / p_T);
    if (DEBUG_LEXICON) {
      if (iTW.word != debugLastWord) {
        debugLastWord = iTW.word;
        debugLoc = loc;
        debugProbs = new StringBuilder();
        debugNoProbs = new StringBuilder("impossible: ");
        debugPrefix = "Lexicon: " + wordIndex.get(debugLastWord) + " (known): ";
      }
      if (pb_W_T > Double.NEGATIVE_INFINITY) {
        NumberFormat nf = NumberFormat.getNumberInstance();
        nf.setMaximumFractionDigits(3);
        debugProbs.append(tagIndex.get(iTW.tag) + ": cTW=" + c_TW + " c_T=" + c_T
            + " pb_T_W=" + nf.format(pb_T_W) + " log pb_W_T=" + nf.format(pb_W_T) + ", ");
        // debugProbs.append("\n" + "smartMutation=" + smartMutation
        //     + " smoothInUnknownsThreshold=" + smoothInUnknownsThreshold
        //     + " smooth0=" + smooth[0] + " smooth1=" + smooth[1]
        //     + " p_T_U=" + p_T_U + " c_W=" + c_W);
      } else {
        debugNoProbs.append(tagIndex.get(iTW.tag)).append(' ');
      }
    } // end if (DEBUG_LEXICON)
  } else {
    // when unseen
    if (loc >= 0) {
      pb_W_T = getUnknownWordModel().score(iTW, loc, c_T, total, smooth[0], word);
    } else {
      // for negative loc we now do a weighted average for the dependency grammar :-)
      double pb_W0_T = getUnknownWordModel().score(iTW, 0, c_T, total, smooth[0], word);
      double pb_W1_T = getUnknownWordModel().score(iTW, 1, c_T, total, smooth[0], word);
      pb_W_T = Math.log((Math.exp(pb_W0_T) + 2 * Math.exp(pb_W1_T)) / 3);
    }
  }
  // categorical cutoff if the score is too low
  if (pb_W_T > -100.0) {
    return (float) pb_W_T;
  }
  return Float.NEGATIVE_INFINITY;
} // end score()
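In symbols, the seen-word branch (without smart mutation) computes, with $\lambda = \texttt{smooth[1]}$, $\hat{P}(W) = c_W/\textit{total}$, and $\hat{P}(T) = c_T/\textit{total}$:

\[
\hat{P}(T \mid W) = \frac{c_{TW} + \lambda\, \hat{P}(T \mid U)}{c_W + \lambda},
\qquad
\text{score} = \log \frac{\hat{P}(T \mid W)\, \hat{P}(W)}{\hat{P}(T)} = \log \hat{P}(W \mid T)
\]

That is, the MLE of P(T|W) is Bayes-smoothed toward the unseen-word tag distribution, then inverted by Bayes' rule to give the log P(word|tag) the parser needs.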
protected void initRulesWithWord() {
  if (testOptions.verbose || DEBUG_LEXICON) {
    System.err.print("\nInitializing lexicon scores ... ");
  }
  // int numWords = words.size() + sigs.size() + 1;
  int unkWord = wordIndex.indexOf(UNKNOWN_WORD, true);
  int numWords = wordIndex.size();
  rulesWithWord = new List[numWords];
  for (int w = 0; w < numWords; w++) {
    rulesWithWord[w] = new ArrayList<IntTaggedWord>(1); // most have 1 or 2 items in them
  }
  // for (Iterator ruleI = rules.iterator(); ruleI.hasNext();) {
  tags = new HashSet<IntTaggedWord>();
  for (IntTaggedWord iTW : seenCounter.keySet()) {
    if (iTW.word() == nullWord && iTW.tag() != nullTag) {
      tags.add(iTW);
    }
  }
  // tags for unknown words
  if (DEBUG_LEXICON) {
    System.err.println("Lexicon initializing tags for UNKNOWN WORD ("
        + Lexicon.UNKNOWN_WORD + ", " + unkWord + ')');
    System.err.println("unSeenCounter is: " + uwModel.unSeenCounter());
    System.err.println("Train.openClassTypesThreshold is "
        + trainOptions.openClassTypesThreshold);
  }
  for (IntTaggedWord iT : tags) {
    if (DEBUG_LEXICON) {
      System.err.println("Entry for " + iT + " is " + uwModel.unSeenCounter().getCount(iT));
    }
    double types = uwModel.unSeenCounter().getCount(iT);
    if (types > trainOptions.openClassTypesThreshold) {
      // number of types before it's treated as open class
      IntTaggedWord iTW = new IntTaggedWord(unkWord, iT.tag);
      rulesWithWord[iTW.word].add(iTW);
    }
  }
  if (testOptions.verbose || DEBUG_LEXICON) {
    System.err.print("The " + rulesWithWord[unkWord].size() + " open class tags are: [");
    for (IntTaggedWord item : rulesWithWord[unkWord]) {
      System.err.print(" " + tagIndex.get(item.tag()));
      if (DEBUG_LEXICON) {
        IntTaggedWord iTprint = new IntTaggedWord(nullWord, item.tag);
        System.err.print(" (tag " + item.tag() + ", type count is "
            + uwModel.unSeenCounter().getCount(iTprint) + ')');
      }
    }
    System.err.println(" ] ");
  }
  for (IntTaggedWord iTW : seenCounter.keySet()) {
    if (iTW.tag() != nullTag && iTW.word() != nullWord) {
      rulesWithWord[iTW.word].add(iTW);
    }
  }
}
public String getTag(int i) {
  return index.get(i);
}
// CDM 2007: I wonder what this does differently from segmentWordsWithMarkov???
private ArrayList<TaggedWord> basicSegmentWords(String s) {
  int length = s.length();
  // Set<String> POSes = (Set<String>) POSDistribution.keySet(); // 1.5
  // best score of span
  double[][] scores = new double[length][length + 1];
  // best (last index of) first word for this span
  int[][] splitBacktrace = new int[length][length + 1];
  // best tag for word over this span
  int[][] POSbacktrace = new int[length][length + 1];
  for (int i = 0; i < length; i++) {
    Arrays.fill(scores[i], Double.NEGATIVE_INFINITY);
  }
  // first fill in word probabilities
  for (int diff = 1; diff <= 10; diff++) {
    for (int start = 0; start + diff <= length; start++) {
      int end = start + diff;
      StringBuilder wordBuf = new StringBuilder();
      for (int pos = start; pos < end; pos++) {
        wordBuf.append(s.charAt(pos));
      }
      String word = wordBuf.toString();
      for (String tag : POSes) {
        IntTaggedWord itw = new IntTaggedWord(word, tag, wordIndex, tagIndex);
        double newScore = lex.score(itw, 0, word, null)
            + Math.log(lex.getPOSDistribution().probabilityOf(tag));
        if (newScore > scores[start][end]) {
          scores[start][end] = newScore;
          splitBacktrace[start][end] = end;
          POSbacktrace[start][end] = itw.tag();
        }
      }
    }
  }
  // now fill in word combination probabilities
  for (int diff = 2; diff <= length; diff++) {
    for (int start = 0; start + diff <= length; start++) {
      int end = start + diff;
      for (int split = start + 1; split < end && split - start <= 10; split++) {
        if (splitBacktrace[start][split] != split) {
          continue; // only consider words on the left
        }
        double newScore = scores[start][split] + scores[split][end];
        if (newScore > scores[start][end]) {
          scores[start][end] = newScore;
          splitBacktrace[start][end] = split;
        }
      }
    }
  }
  // read out the best segmentation from the backtraces
  List<TaggedWord> words = new ArrayList<TaggedWord>();
  int start = 0;
  while (start < length) {
    int end = splitBacktrace[start][length];
    StringBuilder wordBuf = new StringBuilder();
    for (int pos = start; pos < end; pos++) {
      wordBuf.append(s.charAt(pos));
    }
    String word = wordBuf.toString();
    String tag = tagIndex.get(POSbacktrace[start][end]);
    words.add(new TaggedWord(word, tag));
    start = end;
  }
  return new ArrayList<TaggedWord>(words);
}
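The two fill loops implement a straightforward segmentation dynamic program over character spans, with candidate word length capped at 10 characters. Writing $s(i,j)$ for the best score of span $[i,j)$ and $w_{i:j}$ for its characters, the recurrence being computed is:

\[
s(i,j) = \max\Bigl(\;\underbrace{\max_{t}\bigl[\log P(w_{i:j} \mid t) + \log P(t)\bigr]}_{\text{single word, only if } j-i \le 10},\;\;
\max_{\substack{i<k<j \\ k-i \le 10}} s(i,k) + s(k,j)\Bigr)
\]

splitBacktrace records the end of the best first word of each span and POSbacktrace its best tag, so the readout loop can walk left to right recovering one TaggedWord per step.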