private void computeEmpiricalStatistics(List<F> geFeatures) {
  // allocate memory to the containers and initialize them
  geFeature2EmpiricalDist = new double[geFeatures.size()][labeledDataset.labelIndex.size()];
  geFeature2DatumList = new ArrayList<List<Integer>>(geFeatures.size());
  Map<F, Integer> geFeatureMap = Generics.newHashMap();
  Set<Integer> activeUnlabeledExamples = Generics.newHashSet();
  for (int n = 0; n < geFeatures.size(); n++) {
    F geFeature = geFeatures.get(n);
    geFeature2DatumList.add(new ArrayList<Integer>());
    Arrays.fill(geFeature2EmpiricalDist[n], 0);
    geFeatureMap.put(geFeature, n);
  }

  // compute the empirical label distribution for each GE feature
  for (int i = 0; i < labeledDataset.size(); i++) {
    Datum<L, F> datum = labeledDataset.getDatum(i);
    int labelID = labeledDataset.labelIndex.indexOf(datum.label());
    for (F feature : datum.asFeatures()) {
      if (geFeatureMap.containsKey(feature)) {
        int geFnum = geFeatureMap.get(feature);
        geFeature2EmpiricalDist[geFnum][labelID]++;
      }
    }
  }

  // now normalize and smooth the label distribution for each feature.
  for (int n = 0; n < geFeatures.size(); n++) {
    ArrayMath.normalize(geFeature2EmpiricalDist[n]);
    smoothDistribution(geFeature2EmpiricalDist[n]);
  }

  // now build the inverted index from each GE feature to unlabeled datums that contain it.
  for (int i = 0; i < unlabeledDataList.size(); i++) {
    Datum<L, F> datum = unlabeledDataList.get(i);
    for (F feature : datum.asFeatures()) {
      if (geFeatureMap.containsKey(feature)) {
        int geFnum = geFeatureMap.get(feature);
        geFeature2DatumList.get(geFnum).add(i);
        activeUnlabeledExamples.add(i);
      }
    }
  }

  System.out.println("Number of active unlabeled examples: " + activeUnlabeledExamples.size());
}
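
// Illustrative sketch (not part of the original source): one way to inspect the structures
// that computeEmpiricalStatistics builds. The method name dumpGeFeatureStatistics is
// hypothetical and only meant to show what geFeature2EmpiricalDist and geFeature2DatumList
// are expected to contain after the call above.
private void dumpGeFeatureStatistics(List<F> geFeatures) {
  for (int n = 0; n < geFeatures.size(); n++) {
    // row n: smoothed label distribution for GE feature n, estimated from the labeled data
    System.out.println(geFeatures.get(n) + " -> " + Arrays.toString(geFeature2EmpiricalDist[n]));
    // inverted index: indices of unlabeled datums that contain GE feature n
    System.out.println("  occurs in unlabeled datums " + geFeature2DatumList.get(n));
  }
}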
private static void smoothDistribution(double[] dist) {
  // add a small epsilon to every entry (add-epsilon, Laplace-style smoothing) and
  // renormalize, so that no label ends up with exactly zero probability
  double epsilon = 1e-6;
  for (int i = 0; i < dist.length; i++) {
    dist[i] += epsilon;
  }
  ArrayMath.normalize(dist);
}
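
// Illustrative sketch (not part of the original source): a tiny worked example of the
// normalize-then-smooth step applied to one GE feature's label counts. The method name
// demoSmoothing and the literal counts {3, 1} are hypothetical, for illustration only.
private static void demoSmoothing() {
  double[] dist = {3.0, 1.0};                 // raw label counts for one GE feature
  ArrayMath.normalize(dist);                  // -> {0.75, 0.25}
  smoothDistribution(dist);                   // adds 1e-6 to each entry, then renormalizes
  System.out.println(Arrays.toString(dist));  // approximately [0.7499995, 0.2500005]
}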