private void computeEmpiricalStatistics(List<F> geFeatures) {
  // allocate memory to the containers and initialize them
  geFeature2EmpiricalDist = new double[geFeatures.size()][labeledDataset.labelIndex.size()];
  geFeature2DatumList = new ArrayList<List<Integer>>(geFeatures.size());
  Map<F, Integer> geFeatureMap = Generics.newHashMap();
  Set<Integer> activeUnlabeledExamples = Generics.newHashSet();
  for (int n = 0; n < geFeatures.size(); n++) {
    F geFeature = geFeatures.get(n);
    geFeature2DatumList.add(new ArrayList<Integer>());
    Arrays.fill(geFeature2EmpiricalDist[n], 0);
    geFeatureMap.put(geFeature, n);
  }

  // compute the empirical label distribution for each GE feature
  for (int i = 0; i < labeledDataset.size(); i++) {
    Datum<L, F> datum = labeledDataset.getDatum(i);
    int labelID = labeledDataset.labelIndex.indexOf(datum.label());
    for (F feature : datum.asFeatures()) {
      if (geFeatureMap.containsKey(feature)) {
        int geFnum = geFeatureMap.get(feature);
        geFeature2EmpiricalDist[geFnum][labelID]++;
      }
    }
  }

  // now normalize and smooth the label distribution for each feature
  for (int n = 0; n < geFeatures.size(); n++) {
    ArrayMath.normalize(geFeature2EmpiricalDist[n]);
    smoothDistribution(geFeature2EmpiricalDist[n]);
  }

  // now build the inverted index from each GE feature to the unlabeled datums that contain it
  for (int i = 0; i < unlabeledDataList.size(); i++) {
    Datum<L, F> datum = unlabeledDataList.get(i);
    for (F feature : datum.asFeatures()) {
      if (geFeatureMap.containsKey(feature)) {
        int geFnum = geFeatureMap.get(feature);
        geFeature2DatumList.get(geFnum).add(i);
        activeUnlabeledExamples.add(i);
      }
    }
  }
  System.out.println("Number of active unlabeled examples: " + activeUnlabeledExamples.size());
}
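/*
 * smoothDistribution(...) is called above but its body is not part of this excerpt.
 * Below is a minimal sketch of the common approach (additive smoothing followed by
 * renormalization); the name "smoothDistributionSketch" and the epsilon value are
 * illustrative assumptions, not the actual implementation:
 */
private static void smoothDistributionSketch(double[] dist) {
  final double eps = 1e-3; // hypothetical smoothing mass added to every label
  for (int i = 0; i < dist.length; i++) {
    dist[i] += eps;
  }
  ArrayMath.normalize(dist); // rescale so the entries sum to 1 again
}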
private void updateDerivative(Datum<L, F> datum, double[] probs,
    Counter<Triple<Integer, Integer, Integer>> feature2classPairDerivatives) {
  for (F feature : datum.asFeatures()) {
    int fID = labeledDataset.featureIndex.indexOf(feature);
    if (fID >= 0) {
      // accumulate, for each (feature, class, class') triple, the derivative
      // contribution of this datum under the model's current probabilities
      for (int c = 0; c < numClasses; c++) {
        for (int cPrime = 0; cPrime < numClasses; cPrime++) {
          if (cPrime == c) {
            // diagonal term: -p_c * (1 - p_c) * f
            feature2classPairDerivatives.incrementCount(
                new Triple<Integer, Integer, Integer>(fID, c, cPrime),
                -probs[c] * (1 - probs[c]) * valueOfFeature(feature, datum));
          } else {
            // off-diagonal term: p_c * p_cPrime * f
            feature2classPairDerivatives.incrementCount(
                new Triple<Integer, Integer, Integer>(fID, c, cPrime),
                probs[c] * probs[cPrime] * valueOfFeature(feature, datum));
          }
        }
      }
    }
  }
}
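/*
 * The increments above are the negated entries of the softmax Jacobian, scaled by
 * the feature value: d p_c / d s_{c'} = p_c * (1[c == c'] - p_{c'}), where s_{c'}
 * is the score for class c'. The helper below is a hypothetical addition, not part
 * of the original class; it computes one such Jacobian entry and can be used to
 * sanity-check the two branches above:
 */
private static double softmaxJacobianEntry(double[] probs, int c, int cPrime) {
  double indicator = (c == cPrime) ? 1.0 : 0.0;
  // diagonal: p_c * (1 - p_c); off-diagonal: -p_c * p_cPrime
  return probs[c] * (indicator - probs[cPrime]);
}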
public Classifier<L, F> trainClassifier(Iterable<Datum<L, F>> dataIterable) {
  Minimizer<DiffFunction> minimizer = getMinimizer();
  Index<F> featureIndex = Generics.newIndex();
  Index<L> labelIndex = Generics.newIndex();
  for (Datum<L, F> d : dataIterable) {
    labelIndex.add(d.label());
    featureIndex.addAll(d.asFeatures()); // If there are duplicates, it doesn't add them again.
  }
  System.err.println(String.format("Training linear classifier with %d features and %d labels",
      featureIndex.size(), labelIndex.size()));

  LogConditionalObjectiveFunction<L, F> objective =
      new LogConditionalObjectiveFunction<L, F>(dataIterable, logPrior, featureIndex, labelIndex);
  // Note: this replaces the logPrior passed in above with a plain quadratic (L2) prior.
  objective.setPrior(new LogPrior(LogPrior.LogPriorType.QUADRATIC));

  double[] initial = objective.initial();
  double[] weights = minimizer.minimize(objective, TOL, initial);
  return new LinearClassifier<L, F>(objective.to2D(weights), featureIndex, labelIndex);
}
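/*
 * Hypothetical usage sketch for the method above. The String/String type parameters,
 * the "demoTrainClassifier" name, and the feature strings are illustrative assumptions;
 * it relies on edu.stanford.nlp.ling.BasicDatum, java.util.Arrays, and java.util.ArrayList
 * being imported:
 */
public static void demoTrainClassifier(LinearClassifierFactory<String, String> factory) {
  List<Datum<String, String>> data = new ArrayList<Datum<String, String>>();
  data.add(new BasicDatum<String, String>(Arrays.asList("hot", "sunny"), "SUMMER"));
  data.add(new BasicDatum<String, String>(Arrays.asList("cold", "snowy"), "WINTER"));
  Classifier<String, String> classifier = factory.trainClassifier(data);
  // classify a new, unlabeled datum
  System.out.println(classifier.classOf(new BasicDatum<String, String>(Arrays.asList("snowy"))));
}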