/** * Populate the FeatureVector with Bag of Words. * * @param c * @param fv */ protected void populateFV(String text, FeatureVector<E> fv) { List<String> unnormalized = tokenizer.tokenize(text); Multiset<String> terms = HashMultiset.create(); for (String token : unnormalized) { String norm = Util.normalize(token); if (!norm.isEmpty()) { terms.add(norm); } } // sparse representation... no need to put in 0's for (String term : terms.elementSet()) { // rare words don't get included, so check first if (!integerFeatureNames && train.getMetadata().containsKey(term)) { DoubleFeature bagFeat = new DoubleFeature(term, (double) terms.count(term)); fv.put(term, bagFeat); } else if (integerFeatureNames && train.getMetadata().containsKey(String.valueOf(wordIndexMap.get(term)))) { String featureName = String.valueOf(wordIndexMap.get(term)); DoubleFeature bagFeat = new DoubleFeature(featureName, (double) terms.count(term)); fv.put(featureName, bagFeat); } } }
/**
 * Converts a whole relation to a Mallet {@code InstanceList}.
 *
 * <p>Each feature vector is serialized as one text line of the form
 * {@code id<TAB>label<TAB>name=value name=value ...} and the resulting text is
 * handed to {@code loadInstances}. When the relation is a
 * {@code TrainRelation} the label of each vector is written; otherwise the
 * placeholder label {@code X} is used. Only features listed in the relation's
 * metadata and present in the vector are written.
 *
 * @param train the relation to convert
 * @return the instance list produced by {@code loadInstances} from the
 *         serialized relation
 */
public static InstanceList convert(Relation<?> train) {
    boolean isTrain = train instanceof TrainRelation;
    // Local, single-threaded buffer: StringBuilder avoids StringBuffer's locking.
    StringBuilder sb = new StringBuilder();
    for (FeatureVector<?> fv : train) {
        sb.append(fv.getId()).append("\t");
        if (isTrain) {
            sb.append(((LabeledFeatureVector<?>) fv).getLabel().toString()).append("\t");
        } else {
            sb.append("X\t");
        }
        for (String feature : train.getMetadata().keySet()) {
            if (fv.containsKey(feature)) {
                sb.append(feature).append("=").append(fv.get(feature).getValue()).append(" ");
            }
        }
        sb.append("\n");
    }
    return loadInstances(sb.toString());
}
/**
 * Adds one labeled example to the training data, at most once per vector id.
 *
 * <p>If the id of {@code fv} has already been recorded as trained, the call
 * does nothing. Otherwise a {@code LabeledFeatureVector} carrying the given
 * label, the features of {@code fv}, and its qid and rank is added to the
 * training relation, and the id is recorded.
 *
 * @param fv    the feature vector to learn from
 * @param clazz the label to attach to the vector
 */
public void train(FeatureVector<E> fv, E clazz) {
    // Each vector id is accepted only once.
    if (trained.contains(fv.getId())) {
        return;
    }

    LabeledFeatureVector<E> labeled = new LabeledFeatureVector<E>(clazz, fv.getId());
    labeled.putAll(fv);
    labeled.setQid(fv.getQid());
    labeled.setRank(fv.getRank());

    train.add(labeled);
    trained.add(fv.getId());
}
/**
 * Converts a single feature vector to a Mallet {@code Instance}.
 *
 * <p>The vector is serialized as {@code id<TAB>label<TAB>name=value ...} and
 * passed to {@code loadInstances}; the first instance of the result is
 * returned. When the vector is a {@code LabeledFeatureVector} its label is
 * written; otherwise the placeholder label {@code 1} is used. Only features
 * listed in {@code m} and present in the vector are written.
 *
 * @param fv the feature vector to convert
 * @param m  the metadata whose key set selects which features are written
 * @return the first instance produced by {@code loadInstances}
 */
public static Instance convert(FeatureVector<?> fv, Metadata m) {
    // Local, single-threaded buffer: StringBuilder avoids StringBuffer's locking.
    StringBuilder sb = new StringBuilder();
    sb.append(fv.getId()).append("\t");
    if (fv instanceof LabeledFeatureVector) {
        sb.append(((LabeledFeatureVector<?>) fv).getLabel().toString()).append("\t");
    } else {
        sb.append("1\t");
    }
    for (String feature : m.keySet()) {
        if (fv.containsKey(feature)) {
            sb.append(feature).append("=").append(fv.get(feature).getValue()).append(" ");
        }
    }
    InstanceList il = loadInstances(sb.toString());
    return il.get(0);
}