/** * Returns a list of featured thresholded by minPrecision and sorted by their frequency of * occurrence. precision in this case, is defined as the frequency of majority label over total * frequency for that feature. * * @return list of high precision features. */ private List<F> getHighPrecisionFeatures( GeneralDataset<L, F> dataset, double minPrecision, int maxNumFeatures) { int[][] feature2label = new int[dataset.numFeatures()][dataset.numClasses()]; for (int f = 0; f < dataset.numFeatures(); f++) Arrays.fill(feature2label[f], 0); int[][] data = dataset.data; int[] labels = dataset.labels; for (int d = 0; d < data.length; d++) { int label = labels[d]; // System.out.println("datum id:"+d+" label id: "+label); if (data[d] != null) { // System.out.println(" number of features:"+data[d].length); for (int n = 0; n < data[d].length; n++) { feature2label[data[d][n]][label]++; } } } Counter<F> feature2freq = new ClassicCounter<F>(); for (int f = 0; f < dataset.numFeatures(); f++) { int maxF = ArrayMath.max(feature2label[f]); int total = ArrayMath.sum(feature2label[f]); double precision = ((double) maxF) / total; F feature = dataset.featureIndex.get(f); if (precision >= minPrecision) { feature2freq.incrementCount(feature, total); } } if (feature2freq.size() > maxNumFeatures) { Counters.retainTop(feature2freq, maxNumFeatures); } // for(F feature : feature2freq.keySet()) // System.out.println(feature+" "+feature2freq.getCount(feature)); // System.exit(0); return Counters.toSortedList(feature2freq); }
/**
 * Trains a linear classifier on the given dataset, starting the optimizer from the supplied
 * initial weight vector.
 *
 * @param dataset the training data; real-valued datasets are validated before training
 * @param initial initial flat weight vector handed to the optimizer
 * @return the trained classifier over the dataset's feature and label indices
 */
public LinearClassifier<L, F> trainClassifier(GeneralDataset<L, F> dataset, double[] initial) {
  if (dataset instanceof RVFDataset) {
    ((RVFDataset<L, F>) dataset).ensureRealValues();
  }
  double[][] trainedWeights = trainWeights(dataset, initial, false);
  return new LinearClassifier<L, F>(
      trainedWeights, dataset.featureIndex(), dataset.labelIndex());
}
/**
 * Trains a classifier semi-supervised from a labeled dataset plus a biased dataset related
 * through a confusion matrix. IMPORTANT: dataset and biasedDataset must have the same
 * featureIndex and labelIndex.
 *
 * @param data the primary labeled dataset
 * @param biasedData the biased auxiliary dataset (same feature/label indices as {@code data})
 * @param confusionMatrix relates biased labels to true labels
 * @param initial initial flat weight vector for the optimizer
 * @return the trained classifier
 */
public Classifier<L, F> trainClassifierSemiSup(
    GeneralDataset<L, F> data,
    GeneralDataset<L, F> biasedData,
    double[][] confusionMatrix,
    double[] initial) {
  double[][] learned = trainWeightsSemiSup(data, biasedData, confusionMatrix, initial);
  return new LinearClassifier<L, F>(learned, data.featureIndex(), data.labelIndex());
}
/**
 * Train a classifier with a sigma tuned on a validation set. In this case we are fitting on the
 * last 30% of the training data.
 *
 * @param train The data to train (and validate) on.
 * @param min lower bound of the sigma search range (stored on this factory)
 * @param max upper bound of the sigma search range (stored on this factory)
 * @param accuracy NOTE(review): this parameter is never read in this method — presumably it was
 *     meant to select the scoring criterion for held-out tuning; confirm against callers
 * @return The constructed classifier
 */
public LinearClassifier<L, F> trainClassifierV(
    GeneralDataset<L, F> train, double min, double max, boolean accuracy) {
  labelIndex = train.labelIndex();
  featureIndex = train.featureIndex();
  // Configure held-out sigma tuning over [min, max]; these instance fields must be
  // set before heldOutSetSigma runs.
  tuneSigmaHeldOut = true;
  this.min = min;
  this.max = max;
  // Side effect: tunes this factory's sigma using a held-out split of train.
  heldOutSetSigma(train);
  double[][] weights = trainWeights(train);
  return new LinearClassifier<L, F>(weights, train.featureIndex(), train.labelIndex());
}
/**
 * Trains a classifier with per-datum weights under the given log prior.
 *
 * @param dataset the training data; real-valued datasets are validated before training
 * @param dataWeights one weight per datum, scaling each datum's contribution to the objective
 * @param prior the log prior used to regularize the objective
 * @return the trained classifier
 */
public Classifier<L, F> trainClassifier(
    GeneralDataset<L, F> dataset, float[] dataWeights, LogPrior prior) {
  Minimizer<DiffFunction> minimizer = getMinimizer();
  if (dataset instanceof RVFDataset) {
    ((RVFDataset<L, F>) dataset).ensureRealValues();
  }
  // BUG FIX: the original ignored the 'prior' parameter and used the instance field
  // 'logPrior' instead, silently discarding any caller-supplied prior. Honor the parameter.
  LogConditionalObjectiveFunction<L, F> objective =
      new LogConditionalObjectiveFunction<L, F>(dataset, dataWeights, prior);
  double[] initial = objective.initial();
  double[] weights = minimizer.minimize(objective, TOL, initial);
  return new LinearClassifier<L, F>(
      objective.to2D(weights), dataset.featureIndex(), dataset.labelIndex());
}
/**
 * Trains the linear classifier using Generalized Expectation criteria as described in
 * <tt>Generalized Expectation Criteria for Semi Supervised Learning of Conditional Random
 * Fields</tt>, Mann and McCallum, ACL 2008. The original algorithm is proposed for CRFs but has
 * been adopted to LinearClassifier (which is a simpler special case of a CRF). IMPORTANT: the
 * labeled features that are passed as an argument are assumed to be binary valued, although
 * other features are allowed to be real valued.
 *
 * @param labeledDataset the supervised training data
 * @param unlabeledDataList unlabeled data used by the GE term
 * @param GEFeatures the labeled (binary-valued) features driving the GE criteria
 * @param convexComboCoeff mixing coefficient between the supervised and GE objectives
 * @return the trained classifier
 */
public LinearClassifier<L, F> trainSemiSupGE(
    GeneralDataset<L, F> labeledDataset,
    List<? extends Datum<L, F>> unlabeledDataList,
    List<F> GEFeatures,
    double convexComboCoeff) {
  Minimizer<DiffFunction> optimizer = minimizerCreator.create();
  // Supervised log-conditional term, unregularized (NULL prior).
  LogConditionalObjectiveFunction<L, F> supervisedObjective =
      new LogConditionalObjectiveFunction<L, F>(
          labeledDataset, new LogPrior(LogPrior.LogPriorType.NULL));
  // Generalized Expectation term over the unlabeled data.
  GeneralizedExpectationObjectiveFunction<L, F> geObjective =
      new GeneralizedExpectationObjectiveFunction<L, F>(
          labeledDataset, unlabeledDataList, GEFeatures);
  // Convex combination of the two objectives.
  SemiSupervisedLogConditionalObjectiveFunction combinedObjective =
      new SemiSupervisedLogConditionalObjectiveFunction(
          supervisedObjective, geObjective, null, convexComboCoeff);
  double[] startingPoint = supervisedObjective.initial();
  double[] learnedWeights = optimizer.minimize(combinedObjective, TOL, startingPoint);
  return new LinearClassifier<L, F>(
      supervisedObjective.to2D(learnedWeights),
      labeledDataset.featureIndex(),
      labeledDataset.labelIndex());
}
/**
 * Tunes sigma on a held-out set carved from the training data: the data is split 30/70 and the
 * two pieces are handed to the two-dataset overload together with the given scorer.
 *
 * @param train the data to split into a tuning set and a held-out set
 * @param scorer the scorer used to evaluate candidate sigmas
 * @return the result of the two-dataset {@code heldOutSetSigma} overload
 */
public double[] heldOutSetSigma(GeneralDataset<L, F> train, Scorer<L> scorer) {
  Pair<GeneralDataset<L, F>, GeneralDataset<L, F>> split = train.split(0.3);
  return heldOutSetSigma(split.first(), split.second(), scorer);
}