public void trainC(ClassificationDataSet dataSet, ExecutorService threadPool) { if (dataSet.getClassSize() != 2) throw new FailedToFitException( "Logistic Regression works only in the case of two classes, and can not handle " + dataSet.getClassSize() + " classes"); RegressionDataSet rds = new RegressionDataSet(dataSet.getNumNumericalVars(), dataSet.getCategories()); for (int i = 0; i < dataSet.getSampleSize(); i++) { // getDataPointCategory will return either 0 or 1, so it works perfectly rds.addDataPoint(dataSet.getDataPoint(i), (double) dataSet.getDataPointCategory(i)); } train(rds, threadPool); }
public void train(RegressionDataSet dataSet, ExecutorService threadPool) { List<Vec> inputs = new ArrayList<Vec>(dataSet.getSampleSize()); for (int i = 0; i < dataSet.getSampleSize(); i++) inputs.add(dataSet.getDataPoint(i).getNumericalValues()); coefficents = new DenseVector(dataSet.getNumNumericalVars() + 1); Vec targetValues = dataSet.getTargetValues(); double minTarget = targetValues.min(); double maxTarget = targetValues.max(); shift = minTarget; scale = maxTarget - minTarget; // Now all values are in the range [0, 1] targetValues.subtract(shift); targetValues.mutableDivide(scale); Optimizer optimizer = new IterativelyReweightedLeastSquares(); coefficents = optimizer.optimize( 1e-5, 100, logitFun, logitFunD, coefficents, inputs, targetValues, threadPool); }
@Override public void train(final RegressionDataSet dataSet, final ExecutorService threadPool) { final PriorityQueue<RegressionModelEvaluation> bestModels = new PriorityQueue<RegressionModelEvaluation>( folds, new Comparator<RegressionModelEvaluation>() { @Override public int compare(RegressionModelEvaluation t, RegressionModelEvaluation t1) { double v0 = t.getScoreStats(regressionTargetScore).getMean(); double v1 = t1.getScoreStats(regressionTargetScore).getMean(); int order = regressionTargetScore.lowerIsBetter() ? 1 : -1; return order * Double.compare(v0, v1); } }); /** * Use this to keep track of which parameter we are altering. Index correspondence to the * parameter, and its value corresponds to which value has been used. Increment and carry counts * to iterate over all possible combinations. */ int[] setTo = new int[searchParams.size()]; /** * Each model is set to have different combination of parameters. We then train each model to * determine the best one. */ final List<Regressor> paramsToEval = new ArrayList<Regressor>(); while (true) { setParameters(setTo); paramsToEval.add(baseRegressor.clone()); if (incrementCombination(setTo)) break; } /* * This is the Executor used for training the models in parallel. If we * are not supposed to do that, it will be an executor that executes * them sequentually. */ final ExecutorService modelService; if (trainModelsInParallel) modelService = threadPool; else modelService = new FakeExecutor(); final CountDownLatch latch; // used for stopping in both cases // if we are doing our CV splits ahead of time, get them done now final List<RegressionDataSet> preFolded; /** Pre-combine our training combinations so that any caching can be re-used */ final List<RegressionDataSet> trainCombinations; if (reuseSameCVFolds) { preFolded = dataSet.cvSet(folds); trainCombinations = new ArrayList<RegressionDataSet>(preFolded.size()); for (int i = 0; i < preFolded.size(); i++) trainCombinations.add(RegressionDataSet.comineAllBut(preFolded, i)); } else { preFolded = null; trainCombinations = null; } boolean considerWarm = useWarmStarts && baseRegressor instanceof WarmRegressor; /** * make sure we don't do a warm start if its only supported when trained on the same data but we * aren't reuse-ing the same CV splits So we get the truth table * * <p>a | b | (a&&b)||¬a T | T | T T | F | F F | T | T F | F | T * * <p>where a = warmFromSameDataOnly and b = reuseSameSplit So we can instead use ¬ a || b */ if (considerWarm && (!((WarmRegressor) baseRegressor).warmFromSameDataOnly() || reuseSameCVFolds)) { /* we want all of the first parameter (which is the warm paramter, * taken care of for us) values done in a group. So We can get this * by just dividing up the larger list into sub lists, each sub list * is adjacent in the original and is the number of parameter values * we wanted to try */ int stepSize = searchValues.get(0).size(); int totalJobs = paramsToEval.size() / stepSize; latch = new CountDownLatch(totalJobs); for (int startPos = 0; startPos < paramsToEval.size(); startPos += stepSize) { final List<Regressor> subSet = paramsToEval.subList(startPos, startPos + stepSize); modelService.submit( new Runnable() { @Override public void run() { Regressor[] prevModels = null; for (Regressor r : subSet) { RegressionModelEvaluation rme = trainModelsInParallel ? new RegressionModelEvaluation(r, dataSet) : new RegressionModelEvaluation(r, dataSet, threadPool); rme.setKeepModels(true); // we need these to do warm starts! rme.setWarmModels(prevModels); rme.addScorer(regressionTargetScore.clone()); if (reuseSameCVFolds) rme.evaluateCrossValidation(preFolded, trainCombinations); else rme.evaluateCrossValidation(folds); prevModels = rme.getKeptModels(); synchronized (bestModels) { bestModels.add(rme); } } latch.countDown(); } }); } } else // regular CV, train a new model from scratch at every step { latch = new CountDownLatch(paramsToEval.size()); for (final Regressor toTrain : paramsToEval) { modelService.submit( new Runnable() { @Override public void run() { RegressionModelEvaluation rme = trainModelsInParallel ? new RegressionModelEvaluation(toTrain, dataSet) : new RegressionModelEvaluation(toTrain, dataSet, threadPool); rme.addScorer(regressionTargetScore.clone()); if (reuseSameCVFolds) rme.evaluateCrossValidation(preFolded, trainCombinations); else rme.evaluateCrossValidation(folds); synchronized (bestModels) { bestModels.add(rme); } latch.countDown(); } }); } } try { latch.await(); // Now we know the best classifier, we need to train one on the whole data set. Regressor bestRegressor = bestModels.peek().getRegressor(); // Just re-train it on the whole set if (trainFinalModel) { // try and warm start the final model if we can if (useWarmStarts && bestRegressor instanceof WarmRegressor && !((WarmRegressor) bestRegressor) .warmFromSameDataOnly()) // last line here needed to make sure we can do this warm // train { WarmRegressor wr = (WarmRegressor) bestRegressor; if (threadPool instanceof FakeExecutor) wr.train(dataSet, wr.clone()); else wr.train(dataSet, wr.clone(), threadPool); } else { if (threadPool instanceof FakeExecutor) bestRegressor.train(dataSet); else bestRegressor.train(dataSet, threadPool); } } trainedRegressor = bestRegressor; } catch (InterruptedException ex) { Logger.getLogger(GridSearch.class.getName()).log(Level.SEVERE, null, ex); } }