@Override
public void onCompletion(CountedCompleter caller) {
  ScoreBuildHistogram sbh = (ScoreBuildHistogram) caller;
  // System.out.println(sbh.profString());
  final int leafk = _leafs[_k];
  int tmax = _tree.len(); // Number of total splits in tree K
  for (int leaf = leafk; leaf < tmax; leaf++) { // Visit all the new splits (leaves)
    DTree.UndecidedNode udn = _tree.undecided(leaf);
    // System.out.println((_st._nclass==1?"Regression":("Class "+_fr2.vecs()[_st._ncols].domain()[_k]))+",\n  Undecided node:"+udn);
    // Replace the Undecided with the Split decision
    DTree.DecidedNode dn = _st.makeDecided(udn, sbh._hcs[leaf - leafk]);
    // System.out.println(dn + " > Split: " + dn._split + " L/R:" + dn._split._n0 + " + " + dn._split._n1);
    if (dn._split._col == -1) udn.do_not_split();
    else {
      _did_split = true;
      DTree.Split s = dn._split; // Accumulate squared error improvements per variable
      AtomicUtils.FloatArray.add(_improvPerVar, s.col(), (float) (s.pre_split_se() - s.se()));
    }
  }
  _leafs[_k] = tmax; // Setup leafs for next tree level
  int new_leafs = _tree.len() - tmax;
  _hcs[_k] = new DHistogram[new_leafs][/*ncol*/];
  for (int nl = tmax; nl < _tree.len(); nl++)
    _hcs[_k][nl - tmax] = _tree.undecided(nl)._hs;
  if (_did_split) _tree._depth++;
}
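// NOTE (illustrative sketch, not H2O's code): _improvPerVar is updated concurrently by many
// completer tasks, hence the AtomicUtils.FloatArray.add call above. A minimal, self-contained
// way to implement such a lock-free float-array add is a CAS loop over the raw IEEE-754 bits;
// the class below is a hypothetical stand-in for the real water.util.AtomicUtils.FloatArray.
import java.util.concurrent.atomic.AtomicIntegerArray;

final class AtomicFloatArraySketch {
  private final AtomicIntegerArray bits; // float values stored as raw int bits

  AtomicFloatArraySketch(int length) {
    bits = new AtomicIntegerArray(length); // all-zero bits == 0.0f, so the array starts zeroed
  }

  // CAS loop: re-read and retry until no concurrent writer interferes
  void add(int i, float delta) {
    while (true) {
      int oldBits = bits.get(i);
      int newBits = Float.floatToRawIntBits(Float.intBitsToFloat(oldBits) + delta);
      if (bits.compareAndSet(i, oldBits, newBits)) return;
    }
  }

  float get(int i) {
    return Float.intBitsToFloat(bits.get(i));
  }
}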
/**
 * Compute relative variable importance for a GBM model.
 *
 * <p>See formulas (45) and (35) in Friedman: "Greedy Function Approximation: A Gradient
 * Boosting Machine". The algorithm used here can also compute the individual importance of
 * features per output class.
 */
@Override
protected VarImp doVarImpCalc(
    GBMModel model, DTree[] ktrees, int tid, Frame validationFrame, boolean scale) {
  assert model.ntrees() - 1 == tid
      : "varimp computation expects a model with already serialized trees: tid=" + tid;
  // Iterate over the k-trees
  for (DTree t : ktrees) {
    if (t == null) continue;
    for (int n = 0; n < t.len() - t.leaves; n++)
      if (t.node(n) instanceof DecidedNode) { // it is a split node
        Split split = t.decided(n)._split;
        _improvPerVar[split._col] += split.improvement(); // least-squares improvement
      }
  }
  // Compute variable importance across all trees in the model
  float[] varimp = new float[model.nfeatures()];
  int ntreesTotal = model.ntrees() * model.nclasses();
  int maxVar = 0;
  for (int var = 0; var < _improvPerVar.length; var++) {
    varimp[var] = _improvPerVar[var] / ntreesTotal;
    if (varimp[var] > varimp[maxVar]) maxVar = var;
  }
  // Optionally rescale so the most important variable maps to 1.0
  if (scale) {
    float maxVal = varimp[maxVar];
    for (int var = 0; var < varimp.length; var++) varimp[var] /= maxVal;
  }
  return new VarImp(varimp);
}
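// Sketch (standalone restatement of the arithmetic above, not H2O's API): relative importance
// is the mean accumulated squared-error improvement over all ntrees*nclasses trees, optionally
// rescaled against the strongest variable. The method name and the max > 0 guard are additions.
static float[] relativeImportanceSketch(float[] improvPerVar, int ntrees, int nclasses, boolean scale) {
  int ntreesTotal = ntrees * nclasses; // K class-trees are built per boosting iteration
  float[] varimp = new float[improvPerVar.length];
  float max = 0f;
  for (int v = 0; v < varimp.length; v++) {
    varimp[v] = improvPerVar[v] / ntreesTotal; // mean improvement per tree
    max = Math.max(max, varimp[v]);
  }
  if (scale && max > 0) // relative scale: the most important variable becomes 1.0
    for (int v = 0; v < varimp.length; v++) varimp[v] /= max;
  return varimp;
}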
// Helper for debugging
@SuppressWarnings("unused")
protected static void printGenerateTrees(DTree[] trees) {
  for (DTree dtree : trees) {
    if (dtree == null) continue;
    String s = dtree.root().toString2(new StringBuilder(), 0);
    // Try-with-resources closes the writer even if println throws
    try (PrintWriter writer = new PrintWriter("/tmp/h2o-3.tree" + ++counter + ".txt", "UTF-8")) {
      writer.println(s);
    } catch (FileNotFoundException | UnsupportedEncodingException e) {
      e.printStackTrace();
    }
    System.out.println(s);
  }
}
private DirDiffElement(
    DTree parent,
    @Nullable DiffElement source,
    @Nullable DiffElement target,
    DType type,
    String name) {
  myParent = parent.getParent();
  myNode = parent;
  myType = type;
  mySource = source;
  mySourceLength = source == null || source.isContainer() ? -1 : source.getSize();
  myTarget = target;
  myTargetLength = target == null || target.isContainer() ? -1 : target.getSize();
  myName = name;
  if (type == DType.ERROR) {
    myDefaultOperation = NONE;
  } else if (isSource()) {
    myDefaultOperation = COPY_TO;
  } else if (isTarget()) {
    myDefaultOperation = COPY_FROM;
  } else if (type == DType.EQUAL) {
    myDefaultOperation = EQUAL;
  } else if (type == DType.CHANGED) {
    assert source != null;
    myDefaultOperation = MERGE;
  }
}
@Override
public void map(Chunk[] chks) {
  _gss = new double[_nclass][];
  _rss = new double[_nclass][];
  // For all trees/klasses
  for (int k = 0; k < _nclass; k++) {
    final DTree tree = _trees[k];
    final int leaf = _leafs[k];
    if (tree == null) continue; // Empty class is ignored
    // A leaf-biased array of all active Tree leaves
    final double[] gs = _gss[k] = new double[tree._len - leaf];
    final double[] rs = _rss[k] = new double[tree._len - leaf];
    final Chunk nids = chk_nids(chks, k); // Node-ids for this tree/class
    final Chunk ress = chk_work(chks, k); // Residuals for this tree/class
    // If we have all-constant responses, then we do not split even the
    // root and the residuals should be zero.
    if (tree.root() instanceof LeafNode) continue;
    for (int row = 0; row < nids._len; row++) { // For all rows
      int nid = (int) nids.at80(row); // Get Node to decide from
      if (nid < 0) continue; // Missing response
      if (tree.node(nid) instanceof UndecidedNode) // If we bottomed out the tree
        nid = tree.node(nid)._pid; // Then take parent's decision
      DecidedNode dn = tree.decided(nid); // Must have a decision point
      if (dn._split._col == -1) // Unable to decide?
        dn = tree.decided(nid = dn._pid); // Then take parent's decision
      int leafnid = dn.ns(chks, row); // Decide down to a leaf node
      assert leaf <= leafnid && leafnid < tree._len;
      assert tree.node(leafnid) instanceof LeafNode;
      // Note: I can tell which leaf/region I end up in, but I do not care for
      // the prediction presented by the tree. For GBM, we compute the
      // sum-of-residuals (and sum/abs/mult residuals) for all rows in the
      // leaf, and get our prediction from that.
      nids.set0(row, leafnid);
      assert !ress.isNA0(row);
      double res = ress.at0(row);
      double ares = Math.abs(res);
      gs[leafnid - leaf] += _nclass > 1 ? ares * (1 - ares) : 1;
      rs[leafnid - leaf] += res;
    }
  }
}
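// Sketch (illustrative restatement, matching the formula quoted in buildNextKTrees below):
// given the per-leaf sums this pass accumulates, the leaf's gamma works out to
//   gamma = learn_rate * ((K-1)/K) * sum(res) / sum(|res|*(1-|res|))  for K-class trees,
//   gamma = learn_rate * sum(res) / count(res)                        for regression.
// The helper below is a hypothetical standalone version, not H2O's code.
static double leafGammaSketch(double[] residuals, int nclass, double learnRate) {
  double rs = 0, gs = 0; // numerator / denominator sums over the leaf's rows
  for (double r : residuals) {
    rs += r;
    double a = Math.abs(r);
    gs += nclass > 1 ? a * (1 - a) : 1; // regression: denominator is just the row count
  }
  double m1class = nclass > 1 ? (double) (nclass - 1) / nclass : 1.0; // (K-1)/K
  return gs == 0
      ? (rs == 0 ? 0 : 1000) // cap the step instead of dividing by zero, as GBM does
      : learnRate * m1class * rs / gs;
}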
// --------------------------------------------------------------------------
// Build the next k-trees, which try to correct the residual error from
// the prior trees. From ESL2, page 387: Step 2b ii, iii.
private DTree[] buildNextKTrees(Frame fr) {
  // We're going to build K (nclass) trees - each focused on correcting
  // errors for a single class.
  final DTree[] ktrees = new DTree[_nclass];

  // Initial set of histograms. All trees; one leaf per tree (the root
  // leaf); all columns.
  DHistogram[][][] hcs = new DHistogram[_nclass][1 /*just root leaf*/][_ncols];

  for (int k = 0; k < _nclass; k++) {
    // Initially set up as-if an empty split had just happened
    if (_distribution == null || _distribution[k] != 0) {
      // The Boolean Optimization
      // This optimization assumes the 2nd tree of a 2-class system is the
      // inverse of the first. This is false for DRF (and true for GBM) -
      // DRF picks a random different set of columns for the 2nd tree.
      if (k == 1 && _nclass == 2) continue;
      ktrees[k] = new DTree(fr._names, _ncols, (char) nbins, (char) _nclass, min_rows);
      new GBMUndecidedNode(
          ktrees[k], -1, DHistogram.initialHist(fr, _ncols, nbins, hcs[k][0], false)); // The "root" node
    }
  }
  int[] leafs = new int[_nclass]; // Define a "working set" of leaf splits, from here to tree._len

  // ----
  // ESL2, page 387. Step 2b ii.
  // One Big Loop till the ktrees are of proper depth.
  // Adds a layer to the trees each pass.
  int depth = 0;
  for (; depth < max_depth; depth++) {
    if (!Job.isRunning(self())) return null;
    hcs = buildLayer(fr, ktrees, leafs, hcs, false, false);
    // If we did not make any new splits, then the tree is split-to-death
    if (hcs == null) break;
  }

  // Each tree bottomed-out in a DecidedNode; go 1 more level and insert
  // LeafNodes to hold predictions.
  for (int k = 0; k < _nclass; k++) {
    DTree tree = ktrees[k];
    if (tree == null) continue;
    int leaf = leafs[k] = tree.len();
    for (int nid = 0; nid < leaf; nid++) {
      if (tree.node(nid) instanceof DecidedNode) {
        DecidedNode dn = tree.decided(nid);
        for (int i = 0; i < dn._nids.length; i++) {
          int cnid = dn._nids[i];
          if (cnid == -1 || // Bottomed out (predictors or responses known constant)
              tree.node(cnid) instanceof UndecidedNode || // Or chopped off for depth
              (tree.node(cnid) instanceof DecidedNode && // Or not possible to split
                  ((DecidedNode) tree.node(cnid))._split.col() == -1))
            dn._nids[i] = new GBMLeafNode(tree, nid).nid(); // Mark a leaf here
        }
        // Handle the trivial non-splitting tree
        if (nid == 0 && dn._split.col() == -1) new GBMLeafNode(tree, -1, 0);
      }
    }
  } // -- k-trees are done

  // ----
  // ESL2, page 387. Step 2b iii. Compute the gammas, and store them back
  // into the tree leaves. Includes learn_rate.
  //   gamma_i_k = ((nclass-1)/nclass) * (sum res_i / sum (|res_i|*(1-|res_i|)))
  // For regression:
  //   gamma_i_k = sum res_i / count(res_i)
  GammaPass gp = new GammaPass(ktrees, leafs).doAll(fr);
  double m1class = _nclass > 1 ? (double) (_nclass - 1) / _nclass : 1.0; // (K-1)/K
  for (int k = 0; k < _nclass; k++) {
    final DTree tree = ktrees[k];
    if (tree == null) continue;
    for (int i = 0; i < tree._len - leafs[k]; i++) {
      double g = gp._gss[k][i] == 0 // Constant response?
          ? (gp._rss[k][i] == 0 ? 0 : 1000) // Cap (exponential) learn, instead of dealing with Inf
          : learn_rate * m1class * gp._rss[k][i] / gp._gss[k][i];
      assert !Double.isNaN(g);
      ((LeafNode) tree.node(leafs[k] + i))._pred = g;
    }
  }

  // ----
  // ESL2, page 387. Step 2b iv. Cache the sum of all the trees, plus the
  // new tree, in the 'tree' columns. Also, zap the NIDs for the next pass.
  //   Tree <== f(Tree)
  //   Nids <== 0
  new MRTask2() {
    @Override
    public void map(Chunk[] chks) {
      // For all trees/klasses
      for (int k = 0; k < _nclass; k++) {
        final DTree tree = ktrees[k];
        if (tree == null) continue;
        final Chunk nids = chk_nids(chks, k);
        final Chunk ct = chk_tree(chks, k);
        for (int row = 0; row < nids._len; row++) {
          int nid = (int) nids.at80(row);
          if (nid < 0) continue;
          ct.set0(row, (float) (ct.at0(row) + ((LeafNode) tree.node(nid))._pred));
          nids.set0(row, 0);
        }
      }
    }
  }.doAll(fr);

  // Collect leaf stats
  for (int i = 0; i < ktrees.length; i++)
    if (ktrees[i] != null) ktrees[i].leaves = ktrees[i].len() - leafs[i];

  // DEBUG: Print the generated K trees
  // printGenerateTrees(ktrees);

  return ktrees;
}
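// Sketch (assumption, not part of the code above): the final MRTask2 leaves each per-class
// 'tree' column holding the running score F_k(x). For multinomial GBM those scores are turned
// into class probabilities with a max-shifted softmax at scoring time; this hypothetical
// standalone helper shows that step.
static double[] classDistributionSketch(double[] fks) {
  double max = Double.NEGATIVE_INFINITY;
  for (double f : fks) max = Math.max(max, f); // shift by the max for numerical stability
  double sum = 0;
  double[] p = new double[fks.length];
  for (int k = 0; k < fks.length; k++) {
    p[k] = Math.exp(fks[k] - max);
    sum += p[k];
  }
  for (int k = 0; k < p.length; k++) p[k] /= sum;
  return p;
}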
@Override
public void map(Chunk[] chks) {
  final Chunk y = importance ? chk_resp(chks) : null; // Response
  final double[] rpred = importance ? new double[1 + _nclass] : null; // Row prediction
  final double[] rowdata = importance ? new double[_ncols] : null; // Pre-allocated row data
  final Chunk oobt = chk_oobt(chks); // Out-of-bag row counter over all trees
  // Iterate over all rows
  for (int row = 0; row < oobt._len; row++) {
    final boolean wasOOBRow = ScoreBuildHistogram.isOOBRow((int) chk_nids(chks, 0).at8(row));

    // For all trees (i.e., k-classes)
    for (int k = 0; k < _nclass; k++) {
      final DTree tree = _trees[k];
      if (tree == null) continue; // Empty class is ignored
      final Chunk nids = chk_nids(chks, k); // Node-ids for this tree/class
      int nid = (int) nids.at8(row); // Get Node to decide from
      // Update only out-of-bag rows.
      // This is an out-of-bag row - but we would like to track the on-the-fly
      // prediction for the row.
      if (wasOOBRow) {
        final Chunk ct = chk_tree(chks, k); // k-tree working column holding votes for the given row
        nid = ScoreBuildHistogram.oob2Nid(nid);
        if (tree.node(nid) instanceof UndecidedNode) // If we bottomed out the tree
          nid = tree.node(nid).pid(); // Then take parent's decision
        int leafnid;
        if (tree.root() instanceof LeafNode) {
          leafnid = 0;
        } else {
          DecidedNode dn = tree.decided(nid); // Must have a decision point
          if (dn._split.col() == -1) // Unable to decide?
            dn = tree.decided(tree.node(nid).pid()); // Then take parent's decision
          leafnid = dn.ns(chks, row); // Decide down to a leaf node
        }
        // Setup Tree(i) - on-the-fly prediction of the i-th tree for the row-th row:
        //  - for classification: cumulative number of votes for this row
        //  - for regression: cumulative sum of the predictions of each tree - has to be
        //    normalized by the number of trees
        double prediction =
            ((LeafNode) tree.node(leafnid)).pred(); // Prediction for this k-class and this row
        if (importance) rpred[1 + k] = (float) prediction; // for both regression and classification
        ct.set(row, (float) (ct.atd(row) + prediction));
      }
      // Reset the help column for this row and this k-class
      nids.set(row, 0);
    } /* end of k-trees iteration */

    // For this tree this row is out-of-bag - i.e., a tree voted for this row
    if (wasOOBRow) oobt.set(row, oobt.atd(row) + 1); // track the number of trees

    if (importance) {
      if (wasOOBRow && !y.isNA(row)) {
        if (isClassifier()) {
          int treePred = getPrediction(
              rpred, _model._output._priorClassDist, data_row(chks, row, rowdata), _threshold);
          int actuPred = (int) y.at8(row);
          if (treePred == actuPred) rightVotes++; // No miss!
        } else { // regression
          double treePred = rpred[1];
          double actuPred = y.atd(row);
          sse += (actuPred - treePred) * (actuPred - treePred);
        }
        allRows++;
      }
    }
  }
}
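// Sketch (hypothetical bookkeeping class): the rightVotes/allRows/sse counters above feed the
// per-tree OOB quality estimates. A minimal standalone version of that accounting follows;
// the derived error metrics at the bottom are additions for illustration only.
final class OOBStatsSketch {
  long rightVotes, allRows; // classification: correct OOB predictions / scored OOB rows
  double sse;               // regression: squared error over OOB rows

  void addClassification(int predicted, int actual) {
    if (predicted == actual) rightVotes++;
    allRows++;
  }

  void addRegression(double predicted, double actual) {
    double d = actual - predicted;
    sse += d * d;
    allRows++;
  }

  double oobErrorRate() { return allRows == 0 ? Double.NaN : 1.0 - (double) rightVotes / allRows; }

  double oobMSE() { return allRows == 0 ? Double.NaN : sse / allRows; }
}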
// --------------------------------------------------------------------------
// Build the next random k-trees representing the tid-th tree
private void buildNextKTrees(Frame fr, int mtrys, float sample_rate, Random rand, int tid) {
  // We're going to build K (nclass) trees - each focused on correcting
  // errors for a single class.
  final DTree[] ktrees = new DTree[_nclass];

  // Initial set of histograms. All trees; one leaf per tree (the root
  // leaf); all columns.
  DHistogram[][][] hcs = new DHistogram[_nclass][1 /*just root leaf*/][_ncols];

  // Adjust the real number of bins for the top levels
  int adj_nbins = Math.max(_parms._nbins_top_level, _parms._nbins);

  // Use the same seed for all k-trees. NOTE: this is only to make a fair
  // view for all k-trees.
  final double[] _distribution = _model._output._distribution;
  long rseed = rand.nextLong();

  // Initially set up as-if an empty split had just happened
  for (int k = 0; k < _nclass; k++) {
    if (_distribution[k] != 0) { // Ignore missing classes
      // The Boolean Optimization
      // This optimization assumes the 2nd tree of a 2-class system is the
      // inverse of the first (and that the same columns were picked)
      if (k == 1 && _nclass == 2 && _model.binomialOpt()) continue;
      ktrees[k] = new DRFTree(
          fr,
          _ncols,
          (char) _parms._nbins,
          (char) _parms._nbins_cats,
          (char) _nclass,
          _parms._min_rows,
          mtrys,
          rseed);
      new DRFUndecidedNode(
          ktrees[k],
          -1,
          DHistogram.initialHist(fr, _ncols, adj_nbins, _parms._nbins_cats, hcs[k][0])); // The "root" node
    }
  }

  // Sample - mark the lines by putting 'OUT_OF_BAG' into the nid(<klass>) vector
  Timer t_1 = new Timer();
  Sample[] ss = new Sample[_nclass];
  for (int k = 0; k < _nclass; k++)
    if (ktrees[k] != null)
      ss[k] = new Sample((DRFTree) ktrees[k], sample_rate)
          .dfork(0, new Frame(vec_nids(fr, k), vec_resp(fr)), _parms._build_tree_one_node);
  for (int k = 0; k < _nclass; k++)
    if (ss[k] != null) ss[k].getResult();
  Log.debug("Sampling took: " + t_1);

  int[] leafs = new int[_nclass]; // Define a "working set" of leaf splits, from leafs[i] to tree._len for each tree i

  // ----
  // One Big Loop till the ktrees are of proper depth.
  // Adds a layer to the trees each pass.
  Timer t_2 = new Timer();
  int depth = 0;
  for (; depth < _parms._max_depth; depth++) {
    if (!isRunning()) return;
    hcs = buildLayer(
        fr,
        _parms._nbins,
        _parms._nbins_cats,
        ktrees,
        leafs,
        hcs,
        true,
        _parms._build_tree_one_node);
    // If we did not make any new splits, then the tree is split-to-death
    if (hcs == null) break;
  }
  Log.debug("Tree build took: " + t_2);

  // Each tree bottomed-out in a DecidedNode; go 1 more level and insert
  // LeafNodes to hold predictions.
  Timer t_3 = new Timer();
  for (int k = 0; k < _nclass; k++) {
    DTree tree = ktrees[k];
    if (tree == null) continue;
    int leaf = leafs[k] = tree.len();
    for (int nid = 0; nid < leaf; nid++) {
      if (tree.node(nid) instanceof DecidedNode) {
        DecidedNode dn = tree.decided(nid);
        if (dn._split._col == -1) { // No decision here, no row should have this NID now
          if (nid == 0) { // Handle the trivial non-splitting tree
            LeafNode ln = new DRFLeafNode(tree, -1, 0);
            ln._pred = (float) (isClassifier() ? _model._output._priorClassDist[k] : _initialPrediction);
          }
          continue;
        }
        for (int i = 0; i < dn._nids.length; i++) {
          int cnid = dn._nids[i];
          if (cnid == -1 || // Bottomed out (predictors or responses known constant)
              tree.node(cnid) instanceof UndecidedNode || // Or chopped off for depth
              (tree.node(cnid) instanceof DecidedNode && // Or not possible to split
                  ((DecidedNode) tree.node(cnid))._split.col() == -1)) {
            LeafNode ln = new DRFLeafNode(tree, nid);
            ln._pred = (float) dn.pred(i); // Set prediction into the leaf
            dn._nids[i] = ln.nid(); // Mark a leaf here
          }
        }
      }
    }
  } // -- k-trees are done
  Log.debug("Nodes propagation: " + t_3);

  // ----
  // Move rows into the final leaf rows
  Timer t_4 = new Timer();
  CollectPreds cp = new CollectPreds(ktrees, leafs, _model.defaultThreshold())
      .doAll(fr, _parms._build_tree_one_node);
  if (isClassifier())
    asVotes(_treeMeasuresOnOOB)
        .append(cp.rightVotes, cp.allRows); // Track right votes over OOB rows for this tree
  else /* regression */
    asSSE(_treeMeasuresOnOOB).append(cp.sse, cp.allRows);
  Log.debug("CollectPreds done: " + t_4);

  // Grow the model by K trees
  _model._output.addKTrees(ktrees);
}
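// Sketch (illustrative): the Sample task above marks out-of-bag rows directly in the node-id
// column so later passes can tell in-bag from OOB rows. The negate-and-offset encoding below
// is an assumption for illustration; H2O's actual scheme lives in
// ScoreBuildHistogram.isOOBRow/oob2Nid.
static void sampleRowsSketch(int[] nids, float sampleRate, long seed) {
  java.util.Random rng = new java.util.Random(seed);
  for (int row = 0; row < nids.length; row++)
    if (rng.nextFloat() >= sampleRate)
      nids[row] = -nids[row] - 1; // mark OOB; recover the original id as -(nid + 1)
  // In-bag rows keep their node id and drive histogram building; OOB rows are skipped
  // during training and scored afterwards by CollectPreds.
}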