@Test
public void testTrainC_ClassificationDataSetMissingFeat()
{
    System.out.println("trainC");
    for (boolean useCatFeatures : new boolean[] {true, false})
    {
        RandomForest instance = new RandomForest();

        ClassificationDataSet train = FixedProblems.getCircles(1000, 1.0, 10.0, 100.0);
        // RF may not get the boundary perfect, so use a noiseless set for testing
        ClassificationDataSet test = FixedProblems.getCircles(1000, 0.0, new XORWOW(), 1.0, 10.0, 100.0);
        train.applyTransform(new InsertMissingValuesTransform(0.1));
        test.applyTransform(new InsertMissingValuesTransform(0.01));

        ClassificationModelEvaluation cme = new ClassificationModelEvaluation(instance, train);
        if (useCatFeatures)
            cme.setDataTransformProcess(new DataTransformProcess(
                    new NumericalToHistogram.NumericalToHistogramTransformFactory()));
        cme.evaluateTestSet(test);

        if (useCatFeatures) // hard to get right with only 2 features like this
            assertTrue(cme.getErrorRate() <= 0.17);
        else
            assertTrue(cme.getErrorRate() <= 0.1);
    }
}
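// A minimal sketch of the same missing-value workflow outside the evaluation
// harness: knock out values, train a RandomForest directly, and classify. The
// dataset and the 0.1 missing rate mirror the test above and are illustrative.
ClassificationDataSet data = FixedProblems.getCircles(1000, 1.0, 10.0, 100.0);
data.applyTransform(new InsertMissingValuesTransform(0.1));

RandomForest rf = new RandomForest();
rf.trainC(data); // the trees tolerate the missing values without explicit imputation

// mostLikely() gives the index of the predicted class
int predicted = rf.classify(data.getDataPoint(0)).mostLikely();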
public void trainC(ClassificationDataSet dataSet, ExecutorService threadPool)
{
    if (dataSet.getClassSize() != 2)
        throw new FailedToFitException("Logistic Regression works only in the case of two classes, and cannot handle " + dataSet.getClassSize() + " classes");
    RegressionDataSet rds = new RegressionDataSet(dataSet.getNumNumericalVars(), dataSet.getCategories());
    for (int i = 0; i < dataSet.getSampleSize(); i++)
    {
        // getDataPointCategory will return either 0 or 1, so it works perfectly as the regression target
        rds.addDataPoint(dataSet.getDataPoint(i), (double) dataSet.getDataPointCategory(i));
    }
    train(rds, threadPool);
}
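// Usage sketch for the reduction above, assuming this method belongs to JSAT's
// LogisticRegression (which implements both Classifier and Regressor): any
// 2-class dataset works, since the 0/1 labels become the regression targets.
ClassificationDataSet twoClass = FixedProblems.getSimpleKClassLinear(200, 2);
LogisticRegression lr = new LogisticRegression();
lr.trainC(twoClass, Executors.newFixedThreadPool(4));

// the logistic output can be read as P(y = 1)
double pClassOne = lr.classify(twoClass.getDataPoint(0)).getProb(1);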
@Override
public void trainC(ClassificationDataSet dataSet, ExecutorService threadPool)
{
    final int models = baseClassifiers.size();
    final int C = dataSet.getClassSize();
    weightsPerModel = C == 2 ? 1 : C;
    ClassificationDataSet metaSet = new ClassificationDataSet(weightsPerModel * models, new CategoricalData[0], dataSet.getPredicting());

    List<ClassificationDataSet> dataFolds = dataSet.cvSet(folds);
    // iterate in the order of the folds so we get the right datum weights
    for (ClassificationDataSet cds : dataFolds)
        for (int i = 0; i < cds.getSampleSize(); i++)
            metaSet.addDataPoint(new DenseVector(weightsPerModel * models), cds.getDataPointCategory(i), cds.getDataPoint(i).getWeight());

    // create the meta training set
    for (int c = 0; c < baseClassifiers.size(); c++)
    {
        Classifier cl = baseClassifiers.get(c);
        int pos = 0;
        for (int f = 0; f < dataFolds.size(); f++)
        {
            ClassificationDataSet train = ClassificationDataSet.comineAllBut(dataFolds, f);
            ClassificationDataSet test = dataFolds.get(f);
            if (threadPool == null)
                cl.trainC(train);
            else
                cl.trainC(train, threadPool);
            for (int i = 0; i < test.getSampleSize(); i++) // evaluate and mark each point in the held-out fold
            {
                CategoricalResults pred = cl.classify(test.getDataPoint(i));
                if (C == 2)
                    metaSet.getDataPoint(pos).getNumericalValues().set(c, pred.getProb(0) * 2 - 1);
                else
                {
                    Vec toSet = metaSet.getDataPoint(pos).getNumericalValues();
                    for (int j = weightsPerModel * c; j < weightsPerModel * (c + 1); j++)
                        toSet.set(j, pred.getProb(j - weightsPerModel * c));
                }
                pos++;
            }
        }
    }

    // train the meta model
    if (threadPool == null)
        aggregatingClassifier.trainC(metaSet);
    else
        aggregatingClassifier.trainC(metaSet, threadPool);

    // train the final classifiers, unless folds = 1. In that case they are already trained
    if (folds != 1)
    {
        for (Classifier cl : baseClassifiers)
            if (threadPool == null)
                cl.trainC(dataSet);
            else
                cl.trainC(dataSet, threadPool);
    }
}
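// A hedged usage sketch for the stacking trainer above, assuming a constructor
// of the form Stacking(folds, aggregatingClassifier, baseClassifiers...); check
// the class for the exact signature. The aggregator is trained on the per-fold
// predictions that trainC wrote into the meta dataset. A 2-class set is used so
// LogisticRegression can serve as the aggregator.
ClassificationDataSet trainingSet = FixedProblems.getSimpleKClassLinear(500, 2);
Stacking stack = new Stacking(10, new LogisticRegression(),
        new RandomForest(), new LogisticRegression());
stack.trainC(trainingSet);
CategoricalResults pred = stack.classify(trainingSet.getDataPoint(0));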
@Test
public void testClone()
{
    System.out.println("clone");
    for (boolean useCatFeatures : new boolean[] {true, false})
    {
        RandomForest instance = new RandomForest();

        ClassificationDataSet t1 = FixedProblems.getSimpleKClassLinear(100, 3);
        ClassificationDataSet t2 = FixedProblems.getSimpleKClassLinear(100, 6);
        if (useCatFeatures)
        {
            t1.applyTransform(new NumericalToHistogram(t1));
            t2.applyTransform(new NumericalToHistogram(t2));
        }

        instance = instance.clone();
        instance.trainC(t1);

        RandomForest result = instance.clone();
        for (int i = 0; i < t1.getSampleSize(); i++)
            assertEquals(t1.getDataPointCategory(i), result.classify(t1.getDataPoint(i)).mostLikely());
        result.trainC(t2);

        for (int i = 0; i < t1.getSampleSize(); i++)
            assertEquals(t1.getDataPointCategory(i), instance.classify(t1.getDataPoint(i)).mostLikely());

        for (int i = 0; i < t2.getSampleSize(); i++)
            assertEquals(t2.getDataPointCategory(i), result.classify(t2.getDataPoint(i)).mostLikely());
    }
}
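// The contract exercised above, in brief: clone() must deep-copy, so retraining
// a clone never disturbs the original. A minimal sketch reusing t1 and t2 from
// the test:
RandomForest a = new RandomForest();
a.trainC(t1);
RandomForest b = a.clone(); // b starts from a's trained state
b.trainC(t2);               // b is retrained; a's predictions on t1 must be unchanged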
@Override
public void trainC(final ClassificationDataSet dataSet, final ExecutorService threadPool)
{
    final PriorityQueue<ClassificationModelEvaluation> bestModels = new PriorityQueue<ClassificationModelEvaluation>(folds, new Comparator<ClassificationModelEvaluation>()
    {
        @Override
        public int compare(ClassificationModelEvaluation t, ClassificationModelEvaluation t1)
        {
            double v0 = t.getScoreStats(classificationTargetScore).getMean();
            double v1 = t1.getScoreStats(classificationTargetScore).getMean();
            int order = classificationTargetScore.lowerIsBetter() ? 1 : -1;
            return order * Double.compare(v0, v1);
        }
    });

    /**
     * Use this to keep track of which parameter we are altering. Each index
     * corresponds to a parameter, and its value corresponds to which value
     * has been used. Increment and carry counts to iterate over all possible
     * combinations.
     */
    int[] setTo = new int[searchParams.size()];

    /**
     * Each model is set to have a different combination of parameters. We
     * then train each model to determine the best one.
     */
    final List<Classifier> paramsToEval = new ArrayList<Classifier>();

    while (true)
    {
        setParameters(setTo);
        paramsToEval.add(baseClassifier.clone());
        if (incrementCombination(setTo))
            break;
    }

    /*
     * This is the Executor used for training the models in parallel. If we
     * are not supposed to do that, it will be an executor that executes
     * them sequentially.
     */
    final ExecutorService modelService;
    if (trainModelsInParallel)
        modelService = threadPool;
    else
        modelService = new FakeExecutor();

    final CountDownLatch latch; // used for stopping in both cases

    // if we are doing our CV splits ahead of time, get them done now
    final List<ClassificationDataSet> preFolded;

    /**
     * Pre-combine our training combinations so that any caching can be re-used
     */
    final List<ClassificationDataSet> trainCombinations;

    if (reuseSameCVFolds)
    {
        preFolded = dataSet.cvSet(folds);
        trainCombinations = new ArrayList<ClassificationDataSet>(preFolded.size());
        for (int i = 0; i < preFolded.size(); i++)
            trainCombinations.add(ClassificationDataSet.comineAllBut(preFolded, i));
    }
    else
    {
        preFolded = null;
        trainCombinations = null;
    }

    boolean considerWarm = useWarmStarts && baseClassifier instanceof WarmClassifier;

    /**
     * Make sure we don't do a warm start if it's only supported when trained
     * on the same data but we aren't reusing the same CV splits. So we get
     * the truth table
     *
     * a | b | (a&&b)||¬a
     * T | T | T
     * T | F | F
     * F | T | T
     * F | F | T
     *
     * where a = warmFromSameDataOnly and b = reuseSameSplit,
     * so we can instead use ¬a || b
     */
    if (considerWarm && (!((WarmClassifier) baseClassifier).warmFromSameDataOnly() || reuseSameCVFolds))
    {
        /*
         * We want all of the first parameter's values (which is the warm
         * parameter, taken care of for us) done in a group. We can get this
         * by just dividing up the larger list into sub lists; each sub list
         * is adjacent in the original and is the number of parameter values
         * we wanted to try
         */
        int stepSize = searchValues.get(0).size();
        int totalJobs = paramsToEval.size() / stepSize;
        latch = new CountDownLatch(totalJobs);
        for (int startPos = 0; startPos < paramsToEval.size(); startPos += stepSize)
        {
            final List<Classifier> subSet = paramsToEval.subList(startPos, startPos + stepSize);
            modelService.submit(new Runnable()
            {
                @Override
                public void run()
                {
                    Classifier[] prevModels = null;
                    for (Classifier c : subSet)
                    {
                        ClassificationModelEvaluation cme = trainModelsInParallel
                                ? new ClassificationModelEvaluation(c, dataSet)
                                : new ClassificationModelEvaluation(c, dataSet, threadPool);
                        cme.setKeepModels(true); // we need these to do warm starts!
                        cme.setWarmModels(prevModels);
                        cme.addScorer(classificationTargetScore.clone());
                        if (reuseSameCVFolds)
                            cme.evaluateCrossValidation(preFolded, trainCombinations);
                        else
                            cme.evaluateCrossValidation(folds);
                        prevModels = cme.getKeptModels();
                        synchronized (bestModels)
                        {
                            bestModels.add(cme);
                        }
                    }
                    latch.countDown();
                }
            });
        }
    }
    else // regular CV, train a new model from scratch at every step
    {
        latch = new CountDownLatch(paramsToEval.size());
        for (final Classifier toTrain : paramsToEval)
        {
            modelService.submit(new Runnable()
            {
                @Override
                public void run()
                {
                    ClassificationModelEvaluation cme = trainModelsInParallel
                            ? new ClassificationModelEvaluation(toTrain, dataSet)
                            : new ClassificationModelEvaluation(toTrain, dataSet, threadPool);
                    cme.addScorer(classificationTargetScore.clone());
                    if (reuseSameCVFolds)
                        cme.evaluateCrossValidation(preFolded, trainCombinations);
                    else
                        cme.evaluateCrossValidation(folds);
                    synchronized (bestModels)
                    {
                        bestModels.add(cme);
                    }
                    latch.countDown();
                }
            });
        }
    }

    // now wait for everyone to finish
    try
    {
        latch.await();
        // Now we know the best classifier, we need to train one on the whole data set.
        Classifier bestClassifier = bestModels.peek().getClassifier();
        // Just re-train it on the whole set
        if (trainFinalModel)
        {
            // try and warm start the final model if we can
            if (useWarmStarts && bestClassifier instanceof WarmClassifier
                    && !((WarmClassifier) bestClassifier).warmFromSameDataOnly()) // last check needed to make sure we can do this warm train
            {
                WarmClassifier wc = (WarmClassifier) bestClassifier;
                if (threadPool instanceof FakeExecutor)
                    wc.trainC(dataSet, wc.clone());
                else
                    wc.trainC(dataSet, wc.clone(), threadPool);
            }
            else
            {
                if (threadPool instanceof FakeExecutor)
                    bestClassifier.trainC(dataSet);
                else
                    bestClassifier.trainC(dataSet, threadPool);
            }
        }
        trainedClassifier = bestClassifier;
    }
    catch (InterruptedException ex)
    {
        Logger.getLogger(GridSearch.class.getName()).log(Level.SEVERE, null, ex);
    }
}
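// A hedged usage sketch for the grid search above. The addParameter(String,
// double...) call and the "Lambda" parameter name are assumptions about the
// parameter API and may differ by JSAT version; the search values are illustrative.
ClassificationDataSet gridData = FixedProblems.getSimpleKClassLinear(500, 2);
GridSearch search = new GridSearch(new LogisticRegression(), 5); // assumed (classifier, folds) constructor
search.addParameter("Lambda", 1e-4, 1e-3, 1e-2, 1e-1);           // hypothetical parameter name
search.trainC(gridData, Executors.newFixedThreadPool(4));
Classifier best = search.getTrainedClassifier(); // the final model re-trained on all the data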