@Override protected DRFModel buildModel( DRFModel model, final Frame fr, String names[], String domains[][], final Timer t_build) { // Append number of trees participating in on-the-fly scoring fr.add("OUT_BAG_TREES", response.makeZero()); // The RNG used to pick split columns Random rand = createRNG(_seed); // Prepare working columns new SetWrkTask().doAll(fr); int tid; DTree[] ktrees = null; // Prepare tree statistics TreeStats tstats = new TreeStats(); // Build trees until we hit the limit for (tid = 0; tid < ntrees; tid++) { // Building tid-tree model = doScoring( model, fr, ktrees, tid, tstats, tid == 0, !hasValidation(), build_tree_one_node); // At each iteration build K trees (K = nclass = response column domain size) // TODO: parallelize more? build more than k trees at each time, we need to care about // temporary data // Idea: launch more DRF at once. Timer kb_timer = new Timer(); ktrees = buildNextKTrees(fr, _mtry, sample_rate, rand, tid); Log.info(Sys.DRF__, (tid + 1) + ". tree was built " + kb_timer.toString()); if (!Job.isRunning(self())) break; // If canceled during building, do not bulkscore // Check latest predictions tstats.updateBy(ktrees); } model = doScoring(model, fr, ktrees, tid, tstats, true, !hasValidation(), build_tree_one_node); // Make sure that we did not miss any votes assert !importance || _treeMeasuresOnOOB.npredictors() == _treeMeasuresOnSOOB[0 /*variable*/].npredictors() : "Missing some tree votes in variable importance voting?!"; return model; }
// Start by splitting all the data according to some criteria (minimize // variance at the leaves). Record on each row which split it goes to, and // assign a split number to it (for next pass). On *this* pass, use the // split-number to build a per-split histogram, with a per-histogram-bucket // variance. @Override protected GBMModel buildModel( GBMModel model, final Frame fr, String names[], String domains[][], Timer t_build) { // Tag out rows missing the response column new ExcludeNAResponse().doAll(fr); // Build trees until we hit the limit int tid; DTree[] ktrees = null; // Trees TreeStats tstats = new TreeStats(); // Tree stats for (tid = 0; tid < ntrees; tid++) { // During first iteration model contains 0 trees, then 0-trees, then 1-tree,... // BUT if validation is not specified model does not participate in voting // but on-the-fly computed data are used model = doScoring(model, fr, ktrees, tid, tstats, false, false, false); // ESL2, page 387 // Step 2a: Compute prediction (prob distribution) from prior tree results: // Work <== f(Tree) new ComputeProb().doAll(fr); // ESL2, page 387 // Step 2b i: Compute residuals from the prediction (probability distribution) // Work <== f(Work) new ComputeRes().doAll(fr); // ESL2, page 387, Step 2b ii, iii, iv Timer kb_timer = new Timer(); ktrees = buildNextKTrees(fr); Log.info(Sys.GBM__, (tid + 1) + ". tree was built in " + kb_timer.toString()); if (!Job.isRunning(self())) break; // If canceled during building, do not bulkscore // Check latest predictions tstats.updateBy(ktrees); } // Final scoring model = doScoring(model, fr, ktrees, tid, tstats, true, false, false); return model; }
@Override protected void buildModel() { // Start with class distribution as null-model // FIXME: Test/Investigate this // if( _nclass >= 2 ) { // for( int c=0; c<_nclass; c++ ) { // final double init = _model._output._priorClassDist[c]; // new MRTask() { // @Override public void map(Chunk tree) { for( int i=0; i<tree._len; i++ ) // tree.set(i, init); } // }.doAll(vec_tree(_train,c)); // } // } _mtry = (_parms._mtries == -1) ? // classification: mtry=sqrt(_ncols), regression: mtry=_ncols/3 (isClassifier() ? Math.max((int) Math.sqrt(_ncols), 1) : Math.max(_ncols / 3, 1)) : _parms._mtries; if (!(1 <= _mtry && _mtry <= _ncols)) throw new IllegalArgumentException( "Computed mtry should be in interval <1,#cols> but it is " + _mtry); // Initialize TreeVotes for classification, MSE arrays for regression initTreeMeasurements(); // Append number of trees participating in on-the-fly scoring _train.add("OUT_BAG_TREES", _response.makeZero()); // Prepare working columns new SetWrkTask().doAll(_train); // If there was a check point recompute tree_<_> and oob columns based on predictions from // previous trees // but only if OOB validation is requested. if (_parms._checkpoint) { Timer t = new Timer(); // Compute oob votes for each output level new OOBScorer( _ncols, _nclass, (hasWeights() ? 1 : 0) + (hasOffset() ? 1 : 0), _parms._sample_rate, _model._output._treeKeys) .doAll(_train); Log.info("Reconstructing oob stats from checkpointed model took " + t); } // The RNG used to pick split columns Random rand = createRNG(_parms._seed); // To be deterministic get random numbers for previous trees and // put random generator to the same state for (int i = 0; i < _ntreesFromCheckpoint; i++) rand.nextLong(); int tid; DTree[] ktrees = null; // Prepare tree statistics // Build trees until we hit the limit for (tid = 0; tid < _parms._ntrees; tid++) { // Building tid-tree if (tid != 0 || !_parms._checkpoint) { // do not make initial scoring if model already exist double training_r2 = doScoringAndSaveModel(false, true, _parms._build_tree_one_node); if (training_r2 >= _parms._r2_stopping) return; // Stop when approaching round-off error } // At each iteration build K trees (K = nclass = response column domain size) // TODO: parallelize more? build more than k trees at each time, we need to care about // temporary data // Idea: launch more DRF at once. Timer kb_timer = new Timer(); buildNextKTrees(_train, _mtry, _parms._sample_rate, rand, tid); Log.info((tid + 1) + ". tree was built " + kb_timer.toString()); DRF.this.update(1); if (!isRunning()) return; // If canceled during building, do not bulkscore } doScoringAndSaveModel(true, true, _parms._build_tree_one_node); }