/** * Evaluates how many words (= terminals) in a collection of trees are covered by the lexicon. * First arg is the collection of trees; second through fourth args get the results. Currently * unused; this probably only works if train and test at same time so tags and words variables are * initialized. */ public double evaluateCoverage( Collection<Tree> trees, Set<String> missingWords, Set<String> missingTags, Set<IntTaggedWord> missingTW) { List<IntTaggedWord> iTW1 = new ArrayList<IntTaggedWord>(); for (Tree t : trees) { iTW1.addAll(treeToEvents(t)); } int total = 0; int unseen = 0; for (IntTaggedWord itw : iTW1) { total++; if (!words.contains(new IntTaggedWord(itw.word(), nullTag))) { missingWords.add(wordIndex.get(itw.word())); } if (!tags.contains(new IntTaggedWord(nullWord, itw.tag()))) { missingTags.add(tagIndex.get(itw.tag())); } // if (!rules.contains(itw)) { if (seenCounter.getCount(itw) == 0.0) { unseen++; missingTW.add(itw); } } return (double) unseen / total; }
/** Adds the tagging with count to the data structures in this Lexicon. */ protected void addTagging(boolean seen, IntTaggedWord itw, double count) { if (seen) { seenCounter.incrementCount(itw, count); if (itw.tag() == nullTag) { words.add(itw); } else if (itw.word() == nullWord) { tags.add(itw); } else { // rules.add(itw); } } else { uwModel.addTagging(seen, itw, count); // if (itw.tag() == nullTag) { // sigs.add(itw); // } } }
protected void initRulesWithWord() { if (testOptions.verbose || DEBUG_LEXICON) { System.err.print("\nInitializing lexicon scores ... "); } // int numWords = words.size()+sigs.size()+1; int unkWord = wordIndex.indexOf(UNKNOWN_WORD, true); int numWords = wordIndex.size(); rulesWithWord = new List[numWords]; for (int w = 0; w < numWords; w++) { rulesWithWord[w] = new ArrayList<IntTaggedWord>(1); // most have 1 or 2 // items in them } // for (Iterator ruleI = rules.iterator(); ruleI.hasNext();) { tags = new HashSet<IntTaggedWord>(); for (IntTaggedWord iTW : seenCounter.keySet()) { if (iTW.word() == nullWord && iTW.tag() != nullTag) { tags.add(iTW); } } // tags for unknown words if (DEBUG_LEXICON) { System.err.println( "Lexicon initializing tags for UNKNOWN WORD (" + Lexicon.UNKNOWN_WORD + ", " + unkWord + ')'); } if (DEBUG_LEXICON) System.err.println("unSeenCounter is: " + uwModel.unSeenCounter()); if (DEBUG_LEXICON) System.err.println( "Train.openClassTypesThreshold is " + trainOptions.openClassTypesThreshold); for (IntTaggedWord iT : tags) { if (DEBUG_LEXICON) System.err.println("Entry for " + iT + " is " + uwModel.unSeenCounter().getCount(iT)); double types = uwModel.unSeenCounter().getCount(iT); if (types > trainOptions.openClassTypesThreshold) { // Number of types before it's treated as open class IntTaggedWord iTW = new IntTaggedWord(unkWord, iT.tag); rulesWithWord[iTW.word].add(iTW); } } if (testOptions.verbose || DEBUG_LEXICON) { System.err.print("The " + rulesWithWord[unkWord].size() + " open class tags are: ["); for (IntTaggedWord item : rulesWithWord[unkWord]) { System.err.print(" " + tagIndex.get(item.tag())); if (DEBUG_LEXICON) { IntTaggedWord iTprint = new IntTaggedWord(nullWord, item.tag); System.err.print( " (tag " + item.tag() + ", type count is " + uwModel.unSeenCounter().getCount(iTprint) + ')'); } } System.err.println(" ] "); } for (IntTaggedWord iTW : seenCounter.keySet()) { if (iTW.tag() != nullTag && iTW.word() != nullWord) { rulesWithWord[iTW.word].add(iTW); } } }