/** Accumulate statistics on how often each token value occurs */ public void train(StringWrapperIterator i) { Set seenTokens = new HashSet(); while (i.hasNext()) { BagOfTokens bag = asBagOfTokens(i.nextStringWrapper()); seenTokens.clear(); for (Iterator j = bag.tokenIterator(); j.hasNext(); ) { totalTokenCount++; Token tokj = (Token) j.next(); if (!seenTokens.contains(tokj)) { seenTokens.add(tokj); // increment documentFrequency counts Integer df = (Integer) documentFrequency.get(tokj); if (df == null) documentFrequency.put(tokj, ONE); else if (df == ONE) documentFrequency.put(tokj, TWO); else if (df == TWO) documentFrequency.put(tokj, THREE); else documentFrequency.put(tokj, new Integer(df.intValue() + 1)); } } collectionSize++; } }
/**
 * Iterate over the tokens that have an entry in {@code documentFrequency}.
 *
 * @return an iterator over the key set of {@code documentFrequency}
 */
public Iterator tokenIterator() {
    final Iterator knownTokens = documentFrequency.keySet().iterator();
    return knownTokens;
}
/**
 * Look up the document frequency recorded for a token.
 *
 * @param tok the token to look up
 * @return the value stored for {@code tok} in {@code documentFrequency},
 *         or 0 when there is no entry for it
 */
public int getDocumentFrequency(Token tok) {
    final Integer stored = (Integer) documentFrequency.get(tok);
    return (stored != null) ? stored.intValue() : 0;
}