// Start by splitting all the data according to some criteria (minimize // variance at the leaves). Record on each row which split it goes to, and // assign a split number to it (for next pass). On *this* pass, use the // split-number to build a per-split histogram, with a per-histogram-bucket // variance. @Override protected GBMModel buildModel( GBMModel model, final Frame fr, String names[], String domains[][], Timer t_build) { // Tag out rows missing the response column new ExcludeNAResponse().doAll(fr); // Build trees until we hit the limit int tid; DTree[] ktrees = null; // Trees TreeStats tstats = new TreeStats(); // Tree stats for (tid = 0; tid < ntrees; tid++) { // During first iteration model contains 0 trees, then 0-trees, then 1-tree,... // BUT if validation is not specified model does not participate in voting // but on-the-fly computed data are used model = doScoring(model, fr, ktrees, tid, tstats, false, false, false); // ESL2, page 387 // Step 2a: Compute prediction (prob distribution) from prior tree results: // Work <== f(Tree) new ComputeProb().doAll(fr); // ESL2, page 387 // Step 2b i: Compute residuals from the prediction (probability distribution) // Work <== f(Work) new ComputeRes().doAll(fr); // ESL2, page 387, Step 2b ii, iii, iv Timer kb_timer = new Timer(); ktrees = buildNextKTrees(fr); Log.info(Sys.GBM__, (tid + 1) + ". tree was built in " + kb_timer.toString()); if (!Job.isRunning(self())) break; // If canceled during building, do not bulkscore // Check latest predictions tstats.updateBy(ktrees); } // Final scoring model = doScoring(model, fr, ktrees, tid, tstats, true, false, false); return model; }
// -------------------------------------------------------------------------- // Build the next k-trees, which is trying to correct the residual error from // the prior trees. From LSE2, page 387. Step 2b ii, iii. private DTree[] buildNextKTrees(Frame fr) { // We're going to build K (nclass) trees - each focused on correcting // errors for a single class. final DTree[] ktrees = new DTree[_nclass]; // Initial set of histograms. All trees; one leaf per tree (the root // leaf); all columns DHistogram hcs[][][] = new DHistogram[_nclass][1 /*just root leaf*/][_ncols]; for (int k = 0; k < _nclass; k++) { // Initially setup as-if an empty-split had just happened if (_distribution == null || _distribution[k] != 0) { // The Boolean Optimization // This optimization assumes the 2nd tree of a 2-class system is the // inverse of the first. This is false for DRF (and true for GBM) - // DRF picks a random different set of columns for the 2nd tree. if (k == 1 && _nclass == 2) continue; ktrees[k] = new DTree(fr._names, _ncols, (char) nbins, (char) _nclass, min_rows); new GBMUndecidedNode( ktrees[k], -1, DHistogram.initialHist(fr, _ncols, nbins, hcs[k][0], false)); // The "root" node } } int[] leafs = new int[_nclass]; // Define a "working set" of leaf splits, from here to tree._len // ---- // ESL2, page 387. Step 2b ii. // One Big Loop till the ktrees are of proper depth. // Adds a layer to the trees each pass. int depth = 0; for (; depth < max_depth; depth++) { if (!Job.isRunning(self())) return null; hcs = buildLayer(fr, ktrees, leafs, hcs, false, false); // If we did not make any new splits, then the tree is split-to-death if (hcs == null) break; } // Each tree bottomed-out in a DecidedNode; go 1 more level and insert // LeafNodes to hold predictions. for (int k = 0; k < _nclass; k++) { DTree tree = ktrees[k]; if (tree == null) continue; int leaf = leafs[k] = tree.len(); for (int nid = 0; nid < leaf; nid++) { if (tree.node(nid) instanceof DecidedNode) { DecidedNode dn = tree.decided(nid); for (int i = 0; i < dn._nids.length; i++) { int cnid = dn._nids[i]; if (cnid == -1 || // Bottomed out (predictors or responses known constant) tree.node(cnid) instanceof UndecidedNode || // Or chopped off for depth (tree.node(cnid) instanceof DecidedNode && // Or not possible to split ((DecidedNode) tree.node(cnid))._split.col() == -1)) dn._nids[i] = new GBMLeafNode(tree, nid).nid(); // Mark a leaf here } // Handle the trivial non-splitting tree if (nid == 0 && dn._split.col() == -1) new GBMLeafNode(tree, -1, 0); } } } // -- k-trees are done // ---- // ESL2, page 387. Step 2b iii. Compute the gammas, and store them back // into the tree leaves. Includes learn_rate. // gamma_i_k = (nclass-1)/nclass * (sum res_i / sum (|res_i|*(1-|res_i|))) // For regression: // gamma_i_k = sum res_i / count(res_i) GammaPass gp = new GammaPass(ktrees, leafs).doAll(fr); double m1class = _nclass > 1 ? (double) (_nclass - 1) / _nclass : 1.0; // K-1/K for (int k = 0; k < _nclass; k++) { final DTree tree = ktrees[k]; if (tree == null) continue; for (int i = 0; i < tree._len - leafs[k]; i++) { double g = gp._gss[k][i] == 0 // Constant response? ? (gp._rss[k][i] == 0 ? 0 : 1000) // Cap (exponential) learn, instead of dealing with Inf : learn_rate * m1class * gp._rss[k][i] / gp._gss[k][i]; assert !Double.isNaN(g); ((LeafNode) tree.node(leafs[k] + i))._pred = g; } } // ---- // ESL2, page 387. Step 2b iv. Cache the sum of all the trees, plus the // new tree, in the 'tree' columns. Also, zap the NIDs for next pass. // Tree <== f(Tree) // Nids <== 0 new MRTask2() { @Override public void map(Chunk chks[]) { // For all tree/klasses for (int k = 0; k < _nclass; k++) { final DTree tree = ktrees[k]; if (tree == null) continue; final Chunk nids = chk_nids(chks, k); final Chunk ct = chk_tree(chks, k); for (int row = 0; row < nids._len; row++) { int nid = (int) nids.at80(row); if (nid < 0) continue; ct.set0(row, (float) (ct.at0(row) + ((LeafNode) tree.node(nid))._pred)); nids.set0(row, 0); } } } }.doAll(fr); // Collect leaves stats for (int i = 0; i < ktrees.length; i++) if (ktrees[i] != null) ktrees[i].leaves = ktrees[i].len() - leafs[i]; // DEBUG: Print the generated K trees // printGenerateTrees(ktrees); return ktrees; }