/** * Evaluates how many words (= terminals) in a collection of trees are covered by the lexicon. * First arg is the collection of trees; second through fourth args get the results. Currently * unused; this probably only works if train and test at same time so tags and words variables are * initialized. */ public double evaluateCoverage( Collection<Tree> trees, Set<String> missingWords, Set<String> missingTags, Set<IntTaggedWord> missingTW) { List<IntTaggedWord> iTW1 = new ArrayList<IntTaggedWord>(); for (Tree t : trees) { iTW1.addAll(treeToEvents(t)); } int total = 0; int unseen = 0; for (IntTaggedWord itw : iTW1) { total++; if (!words.contains(new IntTaggedWord(itw.word(), nullTag))) { missingWords.add(wordIndex.get(itw.word())); } if (!tags.contains(new IntTaggedWord(nullWord, itw.tag()))) { missingTags.add(tagIndex.get(itw.tag())); } // if (!rules.contains(itw)) { if (seenCounter.getCount(itw) == 0.0) { unseen++; missingTW.add(itw); } } return (double) unseen / total; }
/**
 * Writes out data from this Object to the Writer w, one entry per line.
 *
 * <p>Three groups of lines are written, in this order:
 *
 * <ol>
 *   <li>for every key of {@code seenCounter}: its lexical entry, {@code " SEEN "}, and its count;
 *   <li>for every key of the unknown word model's unseen counter: its lexical entry,
 *       {@code " UNSEEN "}, and its count;
 *   <li>for every index {@code i} of {@code smooth}: {@code smooth[i] = value}.
 * </ol>
 *
 * <p>The original documentation states that rule elements are delimited by a tab; that
 * formatting would come from {@code toLexicalEntry}, which is not visible here.
 *
 * <p>The writer is flushed but not closed; closing it remains the caller's responsibility.
 *
 * @param w the destination writer
 * @throws IOException if the underlying writer reported an error while writing or flushing
 */
public void writeData(Writer w) throws IOException {
  PrintWriter out = new PrintWriter(w);

  for (IntTaggedWord itw : seenCounter.keySet()) {
    out.println(itw.toLexicalEntry(wordIndex, tagIndex) + " SEEN " + seenCounter.getCount(itw));
  }

  for (IntTaggedWord itw : getUnknownWordModel().unSeenCounter().keySet()) {
    out.println(
        itw.toLexicalEntry(wordIndex, tagIndex)
            + " UNSEEN "
            + getUnknownWordModel().unSeenCounter().getCount(itw));
  }

  for (int i = 0; i < smooth.length; i++) {
    out.println("smooth[" + i + "] = " + smooth[i]);
  }

  // PrintWriter never throws IOException itself; it only records that an error happened.
  // checkError() flushes the stream and reports that flag, so surface it to the caller
  // instead of silently producing a truncated file.
  if (out.checkError()) {
    throw new IOException("Error while writing lexicon data");
  }
}
/** Adds the tagging with count to the data structures in this Lexicon. */ protected void addTagging(boolean seen, IntTaggedWord itw, double count) { if (seen) { seenCounter.incrementCount(itw, count); if (itw.tag() == nullTag) { words.add(itw); } else if (itw.word() == nullWord) { tags.add(itw); } else { // rules.add(itw); } } else { uwModel.addTagging(seen, itw, count); // if (itw.tag() == nullTag) { // sigs.add(itw); // } } }
protected void initRulesWithWord() { if (testOptions.verbose || DEBUG_LEXICON) { System.err.print("\nInitializing lexicon scores ... "); } // int numWords = words.size()+sigs.size()+1; int unkWord = wordIndex.indexOf(UNKNOWN_WORD, true); int numWords = wordIndex.size(); rulesWithWord = new List[numWords]; for (int w = 0; w < numWords; w++) { rulesWithWord[w] = new ArrayList<IntTaggedWord>(1); // most have 1 or 2 // items in them } // for (Iterator ruleI = rules.iterator(); ruleI.hasNext();) { tags = new HashSet<IntTaggedWord>(); for (IntTaggedWord iTW : seenCounter.keySet()) { if (iTW.word() == nullWord && iTW.tag() != nullTag) { tags.add(iTW); } } // tags for unknown words if (DEBUG_LEXICON) { System.err.println( "Lexicon initializing tags for UNKNOWN WORD (" + Lexicon.UNKNOWN_WORD + ", " + unkWord + ')'); } if (DEBUG_LEXICON) System.err.println("unSeenCounter is: " + uwModel.unSeenCounter()); if (DEBUG_LEXICON) System.err.println( "Train.openClassTypesThreshold is " + trainOptions.openClassTypesThreshold); for (IntTaggedWord iT : tags) { if (DEBUG_LEXICON) System.err.println("Entry for " + iT + " is " + uwModel.unSeenCounter().getCount(iT)); double types = uwModel.unSeenCounter().getCount(iT); if (types > trainOptions.openClassTypesThreshold) { // Number of types before it's treated as open class IntTaggedWord iTW = new IntTaggedWord(unkWord, iT.tag); rulesWithWord[iTW.word].add(iTW); } } if (testOptions.verbose || DEBUG_LEXICON) { System.err.print("The " + rulesWithWord[unkWord].size() + " open class tags are: ["); for (IntTaggedWord item : rulesWithWord[unkWord]) { System.err.print(" " + tagIndex.get(item.tag())); if (DEBUG_LEXICON) { IntTaggedWord iTprint = new IntTaggedWord(nullWord, item.tag); System.err.print( " (tag " + item.tag() + ", type count is " + uwModel.unSeenCounter().getCount(iTprint) + ')'); } } System.err.println(" ] "); } for (IntTaggedWord iTW : seenCounter.keySet()) { if (iTW.tag() != nullTag && iTW.word() != nullWord) { 
rulesWithWord[iTW.word].add(iTW); } } }
/** * Do max language model markov segmentation. Note that this algorithm inherently tags words as it * goes, but that we throw away the tags in the final result so that the segmented words are * untagged. (Note: for a couple of years till Aug 2007, a tagged result was returned, but this * messed up the parser, because it could use no tagging but the given tagging, which often wasn't * very good. Or in particular it was a subcategorized tagging which never worked with the current * forceTags option which assumes that gold taggings are inherently basic taggings.) * * @param s A String to segment * @return The list of segmented words. */ private ArrayList<HasWord> segmentWordsWithMarkov(String s) { int length = s.length(); // Set<String> POSes = (Set<String>) POSDistribution.keySet(); // 1.5 int numTags = POSes.size(); // score of span with initial word of this tag double[][][] scores = new double[length][length + 1][numTags]; // best (length of) first word for this span with this tag int[][][] splitBacktrace = new int[length][length + 1][numTags]; // best tag for second word over this span, if first is this tag int[][][] POSbacktrace = new int[length][length + 1][numTags]; for (int i = 0; i < length; i++) { for (int j = 0; j < length + 1; j++) { Arrays.fill(scores[i][j], Double.NEGATIVE_INFINITY); } } // first fill in word probabilities for (int diff = 1; diff <= 10; diff++) { for (int start = 0; start + diff <= length; start++) { int end = start + diff; StringBuilder wordBuf = new StringBuilder(); for (int pos = start; pos < end; pos++) { wordBuf.append(s.charAt(pos)); } String word = wordBuf.toString(); for (String tag : POSes) { IntTaggedWord itw = new IntTaggedWord(word, tag, wordIndex, tagIndex); double score = lex.score(itw, 0, word, null); if (start == 0) { score += Math.log(initialPOSDist.probabilityOf(tag)); } scores[start][end][itw.tag()] = score; splitBacktrace[start][end][itw.tag()] = end; } } } // now fill in word combination probabilities for (int diff 
= 2; diff <= length; diff++) { for (int start = 0; start + diff <= length; start++) { int end = start + diff; for (int split = start + 1; split < end && split - start <= 10; split++) { for (String tag : POSes) { int tagNum = tagIndex.indexOf(tag, true); if (splitBacktrace[start][split][tagNum] != split) { continue; } Distribution<String> rTagDist = markovPOSDists.get(tag); if (rTagDist == null) { continue; // this happens with "*" POS } for (String rTag : POSes) { int rTagNum = tagIndex.indexOf(rTag, true); double newScore = scores[start][split][tagNum] + scores[split][end][rTagNum] + Math.log(rTagDist.probabilityOf(rTag)); if (newScore > scores[start][end][tagNum]) { scores[start][end][tagNum] = newScore; splitBacktrace[start][end][tagNum] = split; POSbacktrace[start][end][tagNum] = rTagNum; } } } } } } int nextPOS = ArrayMath.argmax(scores[0][length]); ArrayList<HasWord> words = new ArrayList<HasWord>(); int start = 0; while (start < length) { int split = splitBacktrace[start][length][nextPOS]; StringBuilder wordBuf = new StringBuilder(); for (int i = start; i < split; i++) { wordBuf.append(s.charAt(i)); } String word = wordBuf.toString(); // String tag = tagIndex.get(nextPOS); // words.add(new TaggedWord(word, tag)); words.add(new Word(word)); if (split < length) { nextPOS = POSbacktrace[start][length][nextPOS]; } start = split; } return words; }
// CDM 2007: I wonder what this does differently from segmentWordsWithMarkov??? private ArrayList<TaggedWord> basicSegmentWords(String s) { int length = s.length(); // Set<String> POSes = (Set<String>) POSDistribution.keySet(); // 1.5 // best score of span double[][] scores = new double[length][length + 1]; // best (last index of) first word for this span int[][] splitBacktrace = new int[length][length + 1]; // best tag for word over this span int[][] POSbacktrace = new int[length][length + 1]; for (int i = 0; i < length; i++) { Arrays.fill(scores[i], Double.NEGATIVE_INFINITY); } // first fill in word probabilities for (int diff = 1; diff <= 10; diff++) { for (int start = 0; start + diff <= length; start++) { int end = start + diff; StringBuilder wordBuf = new StringBuilder(); for (int pos = start; pos < end; pos++) { wordBuf.append(s.charAt(pos)); } String word = wordBuf.toString(); // for (String tag : POSes) { // 1.5 for (Iterator<String> iter = POSes.iterator(); iter.hasNext(); ) { String tag = iter.next(); IntTaggedWord itw = new IntTaggedWord(word, tag, wordIndex, tagIndex); double newScore = lex.score(itw, 0, word, null) + Math.log(lex.getPOSDistribution().probabilityOf(tag)); if (newScore > scores[start][end]) { scores[start][end] = newScore; splitBacktrace[start][end] = end; POSbacktrace[start][end] = itw.tag(); } } } } // now fill in word combination probabilities for (int diff = 2; diff <= length; diff++) { for (int start = 0; start + diff <= length; start++) { int end = start + diff; for (int split = start + 1; split < end && split - start <= 10; split++) { if (splitBacktrace[start][split] != split) { continue; // only consider words on left } double newScore = scores[start][split] + scores[split][end]; if (newScore > scores[start][end]) { scores[start][end] = newScore; splitBacktrace[start][end] = split; } } } } List<TaggedWord> words = new ArrayList<TaggedWord>(); int start = 0; while (start < length) { int end = splitBacktrace[start][length]; 
StringBuilder wordBuf = new StringBuilder(); for (int pos = start; pos < end; pos++) { wordBuf.append(s.charAt(pos)); } String word = wordBuf.toString(); String tag = tagIndex.get(POSbacktrace[start][end]); words.add(new TaggedWord(word, tag)); start = end; } return new ArrayList<TaggedWord>(words); }