/** * Populate the FeatureVector with Bag of Words. * * @param c * @param fv */ protected void populateFV(String text, FeatureVector<E> fv) { List<String> unnormalized = tokenizer.tokenize(text); Multiset<String> terms = HashMultiset.create(); for (String token : unnormalized) { String norm = Util.normalize(token); if (!norm.isEmpty()) { terms.add(norm); } } // sparse representation... no need to put in 0's for (String term : terms.elementSet()) { // rare words don't get included, so check first if (!integerFeatureNames && train.getMetadata().containsKey(term)) { DoubleFeature bagFeat = new DoubleFeature(term, (double) terms.count(term)); fv.put(term, bagFeat); } else if (integerFeatureNames && train.getMetadata().containsKey(String.valueOf(wordIndexMap.get(term)))) { String featureName = String.valueOf(wordIndexMap.get(term)); DoubleFeature bagFeat = new DoubleFeature(featureName, (double) terms.count(term)); fv.put(featureName, bagFeat); } } }