/** * Trains this lexicon on the Collection of trees. Also trains the unknown word model pointed to * by this lexicon. */ public void train(Collection<Tree> trees, double weight, boolean keepTagsAsLabels) { getUnknownWordModel().train(trees); // scan data for (Tree tree : trees) { List<IntTaggedWord> taggedWords = treeToEvents(tree, keepTagsAsLabels); for (int w = 0, sz = taggedWords.size(); w < sz; w++) { IntTaggedWord iTW = taggedWords.get(w); seenCounter.incrementCount(iTW, weight); IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag); seenCounter.incrementCount(iT, weight); IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag); seenCounter.incrementCount(iW, weight); IntTaggedWord i = new IntTaggedWord(nullWord, nullTag); seenCounter.incrementCount(i, weight); // rules.add(iTW); tags.add(iT); words.add(iW); } } tune(trees); // index the possible tags for each word initRulesWithWord(); if (DEBUG_LEXICON) { printLexStats(); } }
/**
 * Tallies the set difference between the objects extracted from the two
 * trees: items only in t1 count toward {@code over}, items only in t2
 * count toward {@code under}.  The PrintWriter is unused here.
 */
@Override
public void evaluate(Tree t1, Tree t2, PrintWriter pw) {
  Set<String> fromFirst = makeObjects(t1);
  Set<String> fromSecond = makeObjects(t2);
  for (String item : fromFirst) {
    if (!fromSecond.contains(item)) {
      over.incrementCount(item);
    }
  }
  for (String item : fromSecond) {
    if (!fromFirst.contains(item)) {
      under.incrementCount(item);
    }
  }
}
/**
 * Converts a list of feature/value pairs into a counter, summing the values
 * of pairs that share the same feature name.
 */
public static <T> Counter<T> toCounter(List<FeatureValue<T>> featureValues) {
  ClassicCounter<T> result = new ClassicCounter<T>();
  for (FeatureValue<T> featureValue : featureValues) {
    result.incrementCount(featureValue.name, featureValue.value);
  }
  return result;
}
@Override protected void tallyRoot(Tree lt, double weight) { // this list is in full (not reduced) tag space List<IntDependency> deps = MLEDependencyGrammar.treeToDependencyList(lt, wordIndex, tagIndex); for (IntDependency dependency : deps) { dependencyCounter.incrementCount(dependency, weight); } }
@Override public void evaluate(Tree t1, Tree t2, PrintWriter pw) { List<String> s1 = myMakeObjects(t1); List<String> s2 = myMakeObjects(t2); List<String> del2 = new LinkedList<>(s2); // we delete out as we find them so we can score correctly a cat with // a certain cardinality in a tree. for (String o1 : s1) { if (!del2.remove(o1)) { over.incrementCount(o1); } } for (String o2 : s2) { if (!s1.remove(o2)) { under.incrementCount(o2); } } }
/** Make a deep copy of the array of counters (counts are copied, not shared). */
public ClassicCounter<Integer>[] cloneCounter(ClassicCounter<Integer>[] counter) {
  ClassicCounter<Integer>[] copy =
      ErasureUtils.<ClassicCounter<Integer>>mkTArray(ClassicCounter.class, counter.length);
  for (int i = 0; i < counter.length; i++) {
    ClassicCounter<Integer> entry = new ClassicCounter<Integer>();
    for (Integer key : counter[i].keySet()) {
      entry.incrementCount(key, counter[i].getCount(key));
    }
    copy[i] = entry;
  }
  return copy;
}
private void expandStop( IntDependency dependency, short distBinDist, double count, boolean wildForStop) { IntTaggedWord headT = getCachedITW(dependency.head.tag); IntTaggedWord head = new IntTaggedWord(dependency.head.word, tagBin(dependency.head.tag)); // dependency.head; IntTaggedWord arg = new IntTaggedWord(dependency.arg.word, tagBin(dependency.arg.tag)); // dependency.arg; boolean leftHeaded = dependency.leftHeaded; if (arg.word == STOP_WORD_INT) { stopCounter.incrementCount(intern(head, arg, leftHeaded, distBinDist), count); stopCounter.incrementCount(intern(headT, arg, leftHeaded, distBinDist), count); } if (wildForStop || arg.word != STOP_WORD_INT) { stopCounter.incrementCount(intern(head, wildTW, leftHeaded, distBinDist), count); stopCounter.incrementCount(intern(headT, wildTW, leftHeaded, distBinDist), count); } }
/**
 * Merges feature/value pairs that share a name into a single pair with the
 * summed value, returning the merged pairs sorted by feature name.
 *
 * @param <T> the feature-name type (must be comparable for the sort)
 */
public static <T> List<FeatureValue<T>> combine(Collection<FeatureValue<T>> featureValues) {
  // Accumulate the total value per feature name.
  ClassicCounter<T> totals = new ClassicCounter<T>();
  for (FeatureValue<T> fv : featureValues) {
    totals.incrementCount(fv.name, fv.value);
  }
  // Emit one combined entry per distinct name, in natural sorted order.
  Set<T> sortedNames = new TreeSet<T>(totals.keySet());
  List<FeatureValue<T>> combined = new ArrayList<FeatureValue<T>>(sortedNames.size());
  for (T name : sortedNames) {
    combined.add(new FeatureValue<T>(name, totals.getCount(name)));
  }
  return combined;
}
public UnknownWordModel finishTraining() { // make sure the unseen counter isn't empty! If it is, put in // a uniform unseen over tags if (unSeenCounter.isEmpty()) { int numTags = tagIndex.size(); for (int tt = 0; tt < numTags; tt++) { if (!Lexicon.BOUNDARY_TAG.equals(tagIndex.get(tt))) { IntTaggedWord iT = new IntTaggedWord(nullWord, tt); IntTaggedWord i = NULL_ITW; unSeenCounter.incrementCount(iT); unSeenCounter.incrementCount(i); } } } // index the possible tags for each word // numWords = wordIndex.size(); // unknownWordIndex = wordIndex.indexOf(Lexicon.UNKNOWN_WORD, true); // initRulesWithWord(); return model; }
private void writeObject(ObjectOutputStream stream) throws IOException { // System.err.println("\nBefore compression:"); // System.err.println("arg size: " + argCounter.size() + " total: " + // argCounter.totalCount()); // System.err.println("stop size: " + stopCounter.size() + " total: " + // stopCounter.totalCount()); ClassicCounter<IntDependency> fullArgCounter = argCounter; argCounter = new ClassicCounter<IntDependency>(); for (IntDependency dependency : fullArgCounter.keySet()) { if (dependency.head != wildTW && dependency.arg != wildTW && dependency.head.word != -1 && dependency.arg.word != -1) { argCounter.incrementCount(dependency, fullArgCounter.getCount(dependency)); } } ClassicCounter<IntDependency> fullStopCounter = stopCounter; stopCounter = new ClassicCounter<IntDependency>(); for (IntDependency dependency : fullStopCounter.keySet()) { if (dependency.head.word != -1) { stopCounter.incrementCount(dependency, fullStopCounter.getCount(dependency)); } } // System.err.println("After compression:"); // System.err.println("arg size: " + argCounter.size() + " total: " + // argCounter.totalCount()); // System.err.println("stop size: " + stopCounter.size() + " total: " + // stopCounter.totalCount()); stream.defaultWriteObject(); argCounter = fullArgCounter; stopCounter = fullStopCounter; }
/** Trains this UWM on the Collection of trees. */ public void train(TaggedWord tw, int loc, double weight) { IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex); IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag); IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag); seenCounter.incrementCount(iW, weight); IntTaggedWord i = NULL_ITW; if (treesRead > indexToStartUnkCounting) { // start doing this once some way through trees; // treesRead is 1 based counting if (seenCounter.getCount(iW) < 1.5) { // it's an entirely unknown word int s = model.getSignatureIndex(iTW.word, loc, wordIndex.get(iTW.word)); if (DOCUMENT_UNKNOWNS) { String wStr = wordIndex.get(iTW.word); String tStr = tagIndex.get(iTW.tag); String sStr = wordIndex.get(s); EncodingPrintWriter.err.println( "Unknown word/tag/sig:\t" + wStr + '\t' + tStr + '\t' + sStr, "UTF-8"); } IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag); IntTaggedWord iS = new IntTaggedWord(s, nullTag); unSeenCounter.incrementCount(iTS, weight); unSeenCounter.incrementCount(iT, weight); unSeenCounter.incrementCount(iS, weight); unSeenCounter.incrementCount(i, weight); // rules.add(iTS); // sigs.add(iS); } // else { // if (seenCounter.getCount(iTW) < 2) { // it's a new tag for a known word // do nothing for now // } // } } }
/**
 * Trains on one tagged sentence: delegates word/tag counts to the lexicon,
 * then counts tag bigrams (and the sentence-initial tag) for the transition
 * model.
 */
@Override
public void train(List<TaggedWord> sentence) {
  lex.train(sentence, 1.0);
  String prevTag = null;
  for (TaggedWord taggedWord : sentence) {
    String tag = taggedWord.tag();
    tagIndex.add(tag);
    if (prevTag == null) {
      // first word of the sentence
      initial.incrementCount(tag);
    } else {
      ruleCounter.incrementCount2D(prevTag, tag);
    }
    prevTag = tag;
  }
}
/** Adds the tagging with count to the data structures in this Lexicon. */ protected void addTagging(boolean seen, IntTaggedWord itw, double count) { if (seen) { seenCounter.incrementCount(itw, count); if (itw.tag() == nullTag) { words.add(itw); } else if (itw.word() == nullWord) { tags.add(itw); } else { // rules.add(itw); } } else { uwModel.addTagging(seen, itw, count); // if (itw.tag() == nullTag) { // sigs.add(itw); // } } }
private Distribution<Integer> getSegmentedWordLengthDistribution(Treebank tb) { // CharacterLevelTagExtender ext = new CharacterLevelTagExtender(); ClassicCounter<Integer> c = new ClassicCounter<Integer>(); for (Iterator iterator = tb.iterator(); iterator.hasNext(); ) { Tree gold = (Tree) iterator.next(); StringBuilder goldChars = new StringBuilder(); ArrayList goldYield = gold.yield(); for (Iterator wordIter = goldYield.iterator(); wordIter.hasNext(); ) { Word word = (Word) wordIter.next(); goldChars.append(word); } List<HasWord> ourWords = segment(goldChars.toString()); for (int i = 0; i < ourWords.size(); i++) { c.incrementCount(Integer.valueOf(ourWords.get(i).word().length())); } } return Distribution.getDistribution(c); }
/**
 * Scores each label for the given example: starts from the priors (plus the
 * zero-valued prior when enabled), then adds per-feature weights for the
 * example's features.  Feature values are truncated to ints before lookup.
 *
 * <p>Fix: {@code example.asFeaturesCounter()} is loop-invariant but was
 * fetched once per label; it is now hoisted out of the label loop.
 *
 * @param example the datum to score
 * @return a counter mapping each label to its score
 */
public ClassicCounter<L> scoresOf(RVFDatum<L, F> example) {
  ClassicCounter<L> scores = new ClassicCounter<>();
  Counters.addInPlace(scores, priors);
  if (addZeroValued) {
    Counters.addInPlace(scores, priorZero);
  }
  // Hoisted: the feature counter is identical for every label.
  Counter<F> features = example.asFeaturesCounter();
  for (L l : labels) {
    double score = 0.0;
    for (F f : features.keySet()) {
      int value = (int) features.getCount(f);
      score += weight(l, f, Integer.valueOf(value));
      if (addZeroValued) {
        // net out the zero-valued weight so absent features don't contribute
        score -= weight(l, f, zero);
      }
    }
    scores.incrementCount(l, score);
  }
  return scores;
}
/**
 * Collect counts for a non-STOP dependent. The dependency arg is still in the full tag space.
 *
 * <p>Counts are recorded for every combination of lexicalized/tag-only head and arg, plus
 * wildcard backoff events, and (when useSmoothTagProjection is on) the same grid again in the
 * projected tag space.
 *
 * @param dependency A non-stop dependency
 * @param valBinDist A binned distance
 * @param count The weight with which to add this dependency
 */
private void expandArg(IntDependency dependency, short valBinDist, double count) {
  // tag-only (word-marginalized) versions of head and arg, from the cache
  IntTaggedWord headT = getCachedITW(dependency.head.tag);
  IntTaggedWord argT = getCachedITW(dependency.arg.tag);
  // lexicalized head and arg moved into the binned (reduced) tag space
  IntTaggedWord head =
      new IntTaggedWord(dependency.head.word, tagBin(dependency.head.tag)); // dependency.head;
  IntTaggedWord arg =
      new IntTaggedWord(dependency.arg.word, tagBin(dependency.arg.tag)); // dependency.arg;
  boolean leftHeaded = dependency.leftHeaded;

  // argCounter stores stuff in both the original and the reduced tag space???
  // full grid: {lexicalized, tag-only} head x {lexicalized, tag-only, wildcard} arg
  argCounter.incrementCount(intern(head, arg, leftHeaded, valBinDist), count);
  argCounter.incrementCount(intern(headT, arg, leftHeaded, valBinDist), count);
  argCounter.incrementCount(intern(head, argT, leftHeaded, valBinDist), count);
  argCounter.incrementCount(intern(headT, argT, leftHeaded, valBinDist), count);
  argCounter.incrementCount(intern(head, wildTW, leftHeaded, valBinDist), count);
  argCounter.incrementCount(intern(headT, wildTW, leftHeaded, valBinDist), count);

  // the WILD head stats are always directionless and not useDistance!
  argCounter.incrementCount(intern(wildTW, arg, false, (short) -1), count);
  argCounter.incrementCount(intern(wildTW, argT, false, (short) -1), count);

  if (useSmoothTagProjection) {
    // added stuff to do more smoothing.  CDM Jan 2007
    // same counting grid again, but in the projected tag space
    IntTaggedWord headP =
        new IntTaggedWord(dependency.head.word, tagProject(dependency.head.tag));
    IntTaggedWord headTP = new IntTaggedWord(ANY_WORD_INT, tagProject(dependency.head.tag));
    IntTaggedWord argP =
        new IntTaggedWord(dependency.arg.word, tagProject(dependency.arg.tag));
    IntTaggedWord argTP = new IntTaggedWord(ANY_WORD_INT, tagProject(dependency.arg.tag));

    argCounter.incrementCount(intern(headP, argP, leftHeaded, valBinDist), count);
    argCounter.incrementCount(intern(headTP, argP, leftHeaded, valBinDist), count);
    argCounter.incrementCount(intern(headP, argTP, leftHeaded, valBinDist), count);
    argCounter.incrementCount(intern(headTP, argTP, leftHeaded, valBinDist), count);
    argCounter.incrementCount(intern(headP, wildTW, leftHeaded, valBinDist), count);
    argCounter.incrementCount(intern(headTP, wildTW, leftHeaded, valBinDist), count);

    // the WILD head stats are always directionless and not useDistance!
    argCounter.incrementCount(intern(wildTW, argP, false, (short) -1), count);
    argCounter.incrementCount(intern(wildTW, argTP, false, (short) -1), count);
    argCounter.incrementCount(
        intern(wildTW, new IntTaggedWord(dependency.head.word, ANY_TAG_INT), false, (short) -1),
        count);
  }
  numWordTokens++;
}