/**
 * Populate the FeatureVector with Bag of Words.
 *
 * @param text the raw text to tokenize into bag-of-words features
 * @param fv   the FeatureVector to populate
 */
protected void populateFV(String text, FeatureVector<E> fv) {
    List<String> unnormalized = tokenizer.tokenize(text);
    Multiset<String> terms = HashMultiset.create();
    for (String token : unnormalized) {
        String norm = Util.normalize(token);
        if (!norm.isEmpty()) {
            terms.add(norm);
        }
    }
    // sparse representation... no need to put in 0's
    for (String term : terms.elementSet()) {
        // rare words don't get included, so check first
        if (!integerFeatureNames && train.getMetadata().containsKey(term)) {
            DoubleFeature bagFeat = new DoubleFeature(term, (double) terms.count(term));
            fv.put(term, bagFeat);
        } else if (integerFeatureNames
                && train.getMetadata().containsKey(String.valueOf(wordIndexMap.get(term)))) {
            String featureName = String.valueOf(wordIndexMap.get(term));
            DoubleFeature bagFeat = new DoubleFeature(featureName, (double) terms.count(term));
            fv.put(featureName, bagFeat);
        }
    }
}
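/*
 * Usage sketch (illustrative, not part of the original class; the classifier
 * instance and how a FeatureVector is constructed are assumptions):
 *
 *   FeatureVector<String> fv = ...; // construct however your FeatureVector is built
 *   classifier.populateFV("the cat saw the cat", fv);
 *   // with a typical lower-casing normalizer, fv now holds sparse counts
 *   // such as the=2, cat=2, saw=1 -- but only for terms that already exist
 *   // in the training metadata built by createFeatures(...); unseen terms
 *   // are skipped entirely
 */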
/**
 * Get the training data.
 *
 * @return a copy of the full training relation
 */
public TrainRelation<E> getTrainingData() {
    TrainRelation<E> copy = new TrainRelation<E>(train.getName(),
            (Metadata) train.getMetadata().clone());
    copy.addAll(train);
    return copy;
}
/**
 * Get the training data for a single class.
 *
 * @param clazz the label whose instances should be returned
 * @return a sub-relation containing only the instances labeled with clazz
 */
public TrainRelation<E> getTrainingData(E clazz) {
    TrainRelation<E> subRelation = new TrainRelation<E>("sub-relation",
            (Metadata) train.getMetadata().clone());
    for (LabeledFeatureVector<E> lfv : train) {
        if (lfv.getLabel().equals(clazz)) {
            subRelation.add(lfv);
        }
    }
    return subRelation;
}
/**
 * Train the classifier with this instance.
 *
 * @param id    unique identifier for this instance
 * @param text  raw text, converted to a labeled feature vector
 * @param clazz the training label
 */
public void train(String id, String text, E clazz) {
    // skip duplicates so each instance is only trained on once
    if (!trained.contains(id)) {
        LabeledFeatureVector<E> lfv = createLabeledFV(id, text, clazz);
        train.add(lfv);
        trained.add(id);
    }
}
/**
 * Train the classifier with this instance.
 *
 * @param fv    a pre-built feature vector
 * @param clazz the training label
 */
public void train(FeatureVector<E> fv, E clazz) {
    // de-duplicate by the vector's id, same as the text-based overload
    if (!trained.contains(fv.getId())) {
        LabeledFeatureVector<E> lfv = new LabeledFeatureVector<E>(clazz, fv.getId());
        lfv.putAll(fv);
        lfv.setQid(fv.getQid());
        lfv.setRank(fv.getRank());
        train.add(lfv);
        trained.add(fv.getId());
    }
}
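/*
 * Usage sketch (illustrative, with E = String; the classifier instance and
 * the "pos"/"neg" labels are assumptions): the two train(...) overloads
 * cover both raw text and pre-built vectors, and both de-duplicate by id.
 *
 *   classifier.train("doc-1", "the cat sat on the mat", "pos");
 *   classifier.train("doc-1", "the cat sat on the mat", "pos"); // ignored: id already trained
 *
 *   FeatureVector<String> fv = ...; // built elsewhere, e.g. via populateFV
 *   classifier.train(fv, "neg");    // same de-duplication, keyed on fv.getId()
 */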
/**
 * Create the Bag of Words features.
 *
 * @param documents the documents whose tokens define the feature space
 */
public void createFeatures(Collection<String> documents) {
    Multiset<String> terms = HashMultiset.create();
    for (String s : documents) {
        List<String> unnormalized = tokenizer.tokenize(s);
        // normalize them
        for (int i = 0; i < unnormalized.size(); i++) {
            String u = unnormalized.get(i);
            String norm = Util.normalize(u);
            if (!norm.isEmpty()) {
                terms.add(norm);
            }
            // skip degenerate bigrams whose first token normalized to the empty string
            if (bigrams && !norm.isEmpty() && (i < unnormalized.size() - 1)) {
                String second = unnormalized.get(i + 1);
                String normSecond = Util.normalize(second);
                if (!normSecond.isEmpty()) {
                    terms.add(norm + "_" + normSecond);
                }
            }
        }
    }
    int i = 0;
    for (String term : terms.elementSet()) {
        if (terms.count(term) >= minOccurs // don't count infreq. words
                && term.length() >= minLength) { // or super short words
            if (!integerFeatureNames) {
                train.getMetadata().put(term, "boolean");
            } else {
                // keep the metadata key aligned with the index stored in
                // wordIndexMap; the original post-increment (i++) left them
                // off by one, so the term at index 0 was never present in the
                // metadata and was silently dropped by populateFV
                wordIndexMap.put(term, i);
                train.getMetadata().put(String.valueOf(i), "boolean");
                i++;
            }
        }
    }
}
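/*
 * Call-order sketch (illustrative): createFeatures(...) must run over the
 * corpus before any train(...) or populateFV(...) call, because populateFV
 * only emits features whose names already exist in the training metadata
 * built here. loadCorpus() below is a hypothetical helper, not part of
 * this class.
 *
 *   Collection<String> documents = loadCorpus(); // hypothetical helper
 *   classifier.createFeatures(documents);        // fixes the feature space
 *   classifier.train("doc-1", someDocumentText, "pos");
 */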
/** Clear the training set. */
public void clearTrainingSet() {
    train.clear();
    trained.clear();
}