Example #1
0
    /**
     * Completion hook for the histogram-building task of tree {@code _k}.
     *
     * <p>Turns every undecided node in the current working set ({@code _leafs[_k]} up to the tree
     * length at entry) into a decided node using the histograms gathered by {@code caller}, records
     * the squared-error improvement of each real split per column, and then publishes the
     * histograms of the freshly created undecided nodes as the working set for the next level.
     *
     * @param caller the finished task; cast to {@code ScoreBuildHistogram} to read its {@code _hcs}
     */
    @Override
    public void onCompletion(CountedCompleter caller) {
      final ScoreBuildHistogram histTask = (ScoreBuildHistogram) caller;

      final int firstLeaf = _leafs[_k];
      final int lenAtEntry = _tree.len(); // Tree length before any node is decided below

      // Histograms in the task are indexed relative to the first working leaf.
      for (int offset = 0; firstLeaf + offset < lenAtEntry; offset++) {
        final DTree.UndecidedNode pending = _tree.undecided(firstLeaf + offset);
        final DTree.DecidedNode decided = _st.makeDecided(pending, histTask._hcs[offset]);
        final DTree.Split split = decided._split;
        if (split._col != -1) {
          _did_split = true;
          // Accumulate the squared-error improvement for the column that was split on
          AtomicUtils.FloatArray.add(
              _improvPerVar, split.col(), (float) (split.pre_split_se() - split.se()));
        } else {
          // Column -1 means no usable split was found for this node
          pending.do_not_split();
        }
      }

      // Everything past the old length is new; it becomes the next level's working set.
      _leafs[_k] = lenAtEntry;
      final int added = _tree.len() - lenAtEntry;
      _hcs[_k] = new DHistogram[added][ /*ncol*/];
      for (int i = 0; i < added; i++) {
        _hcs[_k][i] = _tree.undecided(lenAtEntry + i)._hs;
      }

      if (_did_split) {
        _tree._depth++;
      }
    }
Example #2
0
  /**
   * Compute relative variable importance for GBM model.
   *
   * <p>See formulas (45) and (35) in Friedman: Greedy Function Approximation: A Gradient Boosting
   * Machine. The algorithm used here can also be used to compute the individual importance of
   * features per output class.
   *
   * <p>The improvement of every real split in {@code ktrees} is added to {@code _improvPerVar},
   * the accumulated values are averaged over {@code model.ntrees() * model.nclasses()}, and the
   * result is optionally divided by the largest averaged value.
   *
   * @param model model whose tree/class/feature counts size and normalize the result
   * @param ktrees the trees to fold into the accumulator; {@code null} entries are skipped
   * @param tid index of the current tree; only checked by the assertion
   * @param validationFrame not read by this implementation
   * @param scale when {@code true}, divide every importance by the maximum importance
   * @return the computed variable importances wrapped in a {@code VarImp}
   */
  @Override
  protected VarImp doVarImpCalc(
      GBMModel model, DTree[] ktrees, int tid, Frame validationFrame, boolean scale) {
    assert model.ntrees() - 1 == tid
        : "varimp computation expect model with already serialized trees: tid=" + tid;
    // Fold the split improvements of the given k-trees into the running per-variable totals
    for (DTree t : ktrees) {
      if (t == null) {
        continue;
      }
      final int innerNodes = t.len() - t.leaves; // Leaf nodes sit at the tail of the tree
      for (int n = 0; n < innerNodes; n++) {
        if (!(t.node(n) instanceof DecidedNode)) {
          continue;
        }
        Split split = t.decided(n)._split;
        // A decided node that could not be split carries column -1; it contributes no
        // improvement and must not be used as an array index.
        if (split._col == -1) {
          continue;
        }
        _improvPerVar[split._col] += split.improvement(); // least squares improvement
      }
    }
    // Compute variable importance for all trees in model
    float[] varimp = new float[model.nfeatures()];

    int ntreesTotal = model.ntrees() * model.nclasses();
    int maxVar = 0;
    for (int var = 0; var < _improvPerVar.length; var++) {
      varimp[var] = _improvPerVar[var] / ntreesTotal;
      if (varimp[var] > varimp[maxVar]) {
        maxVar = var;
      }
    }
    // Scale so that the most important variable has importance 1
    if (scale && varimp.length > 0) {
      float maxVal = varimp[maxVar];
      // With no improvement recorded at all, dividing would turn every entry into NaN (0/0);
      // leave the zeros untouched instead.
      if (maxVal != 0f) {
        for (int var = 0; var < varimp.length; var++) {
          varimp[var] /= maxVal;
        }
      }
    }

    return new VarImp(varimp);
  }
Example #3
0
 // Helper for debugging: writes each non-null tree to its own numbered file
 // ("/tmp/h2o-3.tree<counter>.txt", UTF-8) and echoes the same text to stdout.
 // A failure to open the file is reported via printStackTrace and does not stop
 // the remaining trees from being printed.
 @SuppressWarnings("unused")
 protected static void printGenerateTrees(DTree[] trees) {
   for (DTree dtree : trees) {
     if (dtree == null) {
       continue;
     }
     // try-with-resources guarantees the writer is closed even if rendering the tree throws
     try (PrintWriter writer =
         new PrintWriter("/tmp/h2o-3.tree" + ++counter + ".txt", "UTF-8")) {
       writer.println(dtree.root().toString2(new StringBuilder(), 0));
     } catch (FileNotFoundException | UnsupportedEncodingException e) {
       e.printStackTrace();
     }
     System.out.println(dtree.root().toString2(new StringBuilder(), 0));
   }
 }
 private DirDiffElement(
     DTree parent,
     @Nullable DiffElement source,
     @Nullable DiffElement target,
     DType type,
     String name) {
   myParent = parent.getParent();
   myNode = parent;
   myType = type;
   mySource = source;
   mySourceLength = source == null || source.isContainer() ? -1 : source.getSize();
   myTarget = target;
   myTargetLength = target == null || target.isContainer() ? -1 : target.getSize();
   myName = name;
   if (type == DType.ERROR) {
     myDefaultOperation = NONE;
   } else if (isSource()) {
     myDefaultOperation = COPY_TO;
   } else if (isTarget()) {
     myDefaultOperation = COPY_FROM;
   } else if (type == DType.EQUAL) {
     myDefaultOperation = EQUAL;
   } else if (type == DType.CHANGED) {
     assert source != null;
     myDefaultOperation = MERGE;
   }
 }
Example #5
0
 /**
  * For every class tree, routes each row of this chunk set to its final leaf and accumulates
  * two per-leaf sums: the residual sum ({@code _rss}) and the denominator sum ({@code _gss}).
  *
  * <p>Both arrays are indexed by {@code leafnid - _leafs[k]}. The denominator term is
  * {@code |res| * (1 - |res|)} when there is more than one class and {@code 1} otherwise. The
  * node-id chunk is overwritten with the leaf id of each processed row.
  *
  * @param chks chunks of the working frame for one row range
  */
 @Override
 public void map(Chunk[] chks) {
   _gss = new double[_nclass][];
   _rss = new double[_nclass][];

   for (int k = 0; k < _nclass; k++) {
     final DTree tree = _trees[k];
     final int firstLeaf = _leafs[k];
     if (tree == null) {
       continue; // Empty class is ignored
     }

     // Per-leaf accumulators, biased by the id of the first leaf
     final int leafCount = tree._len - firstLeaf;
     final double[] denomSums = new double[leafCount];
     _gss[k] = denomSums;
     final double[] residSums = new double[leafCount];
     _rss[k] = residSums;

     final Chunk nodeIds = chk_nids(chks, k); // Node-ids  for this tree/class
     final Chunk residuals = chk_work(chks, k); // Residuals for this tree/class

     // With an all-constant response even the root is not split; the residuals
     // should then be zero and there is nothing to accumulate.
     if (tree.root() instanceof LeafNode) {
       continue;
     }

     for (int row = 0; row < nodeIds._len; row++) {
       int nid = (int) nodeIds.at80(row);
       if (nid < 0) {
         continue; // Missing response
       }
       // A row parked on an undecided node falls back to that node's parent
       if (tree.node(nid) instanceof UndecidedNode) {
         nid = tree.node(nid)._pid;
       }
       DecidedNode decision = tree.decided(nid);
       // Likewise, a node that could not decide defers to its parent
       if (decision._split._col == -1) {
         decision = tree.decided(decision._pid);
       }

       final int leafnid = decision.ns(chks, row); // Decide down to a leafnode
       assert firstLeaf <= leafnid && leafnid < tree._len;
       assert tree.node(leafnid) instanceof LeafNode;

       // Only the leaf/region matters here, not the prediction stored in the tree: the
       // prediction is derived later from the per-leaf sums gathered below.
       nodeIds.set0(row, leafnid);
       assert !residuals.isNA0(row);

       final double res = residuals.at0(row);
       final double absRes = Math.abs(res);
       final double denomTerm;
       if (_nclass > 1) {
         denomTerm = absRes * (1 - absRes);
       } else {
         denomTerm = 1;
       }
       final int slot = leafnid - firstLeaf;
       denomSums[slot] += denomTerm;
       residSums[slot] += res;
     }
   }
 }
Example #6
0
  // --------------------------------------------------------------------------
  // Build the next k-trees, which is trying to correct the residual error from
  // the prior trees.  From ESL2, page 387.  Step 2b ii, iii.
  //
  // Phases, in order:
  //   1. create one root (undecided) node per class tree, skipping classes
  //      with a zero distribution entry and the 2nd tree of a 2-class problem;
  //   2. grow all trees one layer at a time, up to max_depth layers;
  //   3. append leaf nodes under every decided node that has a child which is
  //      missing, undecided, or unsplittable;
  //   4. run GammaPass and store the resulting per-leaf predictions;
  //   5. add each row's leaf prediction to the per-class tree column and reset
  //      the row's node id to 0;
  //   6. record the number of leaves of every tree.
  //
  // Returns the array of built trees (entries are null for skipped classes),
  // or null if the job stopped running while layers were being built.
  private DTree[] buildNextKTrees(Frame fr) {
    // We're going to build K (nclass) trees - each focused on correcting
    // errors for a single class.
    final DTree[] ktrees = new DTree[_nclass];

    // Initial set of histograms.  All trees; one leaf per tree (the root
    // leaf); all columns
    DHistogram hcs[][][] = new DHistogram[_nclass][1 /*just root leaf*/][_ncols];

    for (int k = 0; k < _nclass; k++) {
      // Initially setup as-if an empty-split had just happened
      if (_distribution == null || _distribution[k] != 0) {
        // The Boolean Optimization
        // This optimization assumes the 2nd tree of a 2-class system is the
        // inverse of the first.  This is false for DRF (and true for GBM) -
        // DRF picks a random different set of columns for the 2nd tree.
        if (k == 1 && _nclass == 2) continue;
        ktrees[k] = new DTree(fr._names, _ncols, (char) nbins, (char) _nclass, min_rows);
        // The constructed node is not kept in a local; presumably the constructor
        // registers it with ktrees[k] - verify against GBMUndecidedNode.
        new GBMUndecidedNode(
            ktrees[k],
            -1,
            DHistogram.initialHist(fr, _ncols, nbins, hcs[k][0], false)); // The "root" node
      }
    }
    int[] leafs = new int[_nclass]; // Define a "working set" of leaf splits, from here to tree._len

    // ----
    // ESL2, page 387.  Step 2b ii.
    // One Big Loop till the ktrees are of proper depth.
    // Adds a layer to the trees each pass.
    int depth = 0;
    for (; depth < max_depth; depth++) {
      // Bail out with no trees at all if the job is no longer running
      if (!Job.isRunning(self())) return null;

      hcs = buildLayer(fr, ktrees, leafs, hcs, false, false);

      // If we did not make any new splits, then the tree is split-to-death
      if (hcs == null) break;
    }

    // Each tree bottomed-out in a DecidedNode; go 1 more level and insert
    // LeafNodes to hold predictions.
    for (int k = 0; k < _nclass; k++) {
      DTree tree = ktrees[k];
      if (tree == null) continue;
      // Remember where the leaves start; only nodes below this index are scanned
      int leaf = leafs[k] = tree.len();
      for (int nid = 0; nid < leaf; nid++) {
        if (tree.node(nid) instanceof DecidedNode) {
          DecidedNode dn = tree.decided(nid);
          for (int i = 0; i < dn._nids.length; i++) {
            int cnid = dn._nids[i];
            if (cnid == -1
                || // Bottomed out (predictors or responses known constant)
                tree.node(cnid) instanceof UndecidedNode
                || // Or chopped off for depth
                (tree.node(cnid) instanceof DecidedNode
                    && // Or not possible to split
                    ((DecidedNode) tree.node(cnid))._split.col() == -1))
              dn._nids[i] = new GBMLeafNode(tree, nid).nid(); // Mark a leaf here
          }
          // Handle the trivial non-splitting tree
          if (nid == 0 && dn._split.col() == -1) new GBMLeafNode(tree, -1, 0);
        }
      }
    } // -- k-trees are done

    // ----
    // ESL2, page 387.  Step 2b iii.  Compute the gammas, and store them back
    // into the tree leaves.  Includes learn_rate.
    //    gamma_i_k = (nclass-1)/nclass * (sum res_i / sum (|res_i|*(1-|res_i|)))
    // For regression:
    //    gamma_i_k = sum res_i / count(res_i)
    GammaPass gp = new GammaPass(ktrees, leafs).doAll(fr);
    double m1class = _nclass > 1 ? (double) (_nclass - 1) / _nclass : 1.0; // K-1/K
    for (int k = 0; k < _nclass; k++) {
      final DTree tree = ktrees[k];
      if (tree == null) continue;
      // gp._gss / gp._rss are indexed relative to the first leaf, leafs[k]
      for (int i = 0; i < tree._len - leafs[k]; i++) {
        double g =
            gp._gss[k][i] == 0 // Constant response?
                ? (gp._rss[k][i] == 0
                    ? 0
                    : 1000) // Cap (exponential) learn, instead of dealing with Inf
                : learn_rate * m1class * gp._rss[k][i] / gp._gss[k][i];
        assert !Double.isNaN(g);
        ((LeafNode) tree.node(leafs[k] + i))._pred = g;
      }
    }

    // ----
    // ESL2, page 387.  Step 2b iv.  Cache the sum of all the trees, plus the
    // new tree, in the 'tree' columns.  Also, zap the NIDs for next pass.
    // Tree <== f(Tree)
    // Nids <== 0
    new MRTask2() {
      @Override
      public void map(Chunk chks[]) {
        // For all tree/klasses
        for (int k = 0; k < _nclass; k++) {
          final DTree tree = ktrees[k];
          if (tree == null) continue;
          final Chunk nids = chk_nids(chks, k);
          final Chunk ct = chk_tree(chks, k);
          for (int row = 0; row < nids._len; row++) {
            int nid = (int) nids.at80(row);
            // Rows with a negative node id are left untouched (neither summed nor reset)
            if (nid < 0) continue;
            ct.set0(row, (float) (ct.at0(row) + ((LeafNode) tree.node(nid))._pred));
            nids.set0(row, 0);
          }
        }
      }
    }.doAll(fr);

    // Collect leaves stats
    for (int i = 0; i < ktrees.length; i++)
      if (ktrees[i] != null) ktrees[i].leaves = ktrees[i].len() - leafs[i];
    // DEBUG: Print the generated K trees
    // printGenerateTrees(ktrees);

    return ktrees;
  }
Example #7
0
      /**
       * Scores the out-of-bag rows of this chunk set against the current k-trees.
       *
       * <p>For each out-of-bag row and each non-null class tree, the row is pushed down to a leaf
       * and that leaf's prediction is added to the class's working column; the node-id column is
       * reset to 0 for every row and class. The out-of-bag counter is incremented once per
       * out-of-bag row. When {@code importance} is set, rows with a non-missing response also
       * update {@code rightVotes} (classification) or {@code sse} (regression) and
       * {@code allRows}.
       *
       * @param chks chunks of the working frame for one row range
       */
      @Override
      public void map(Chunk[] chks) {
        // Scratch state needed only for importance tracking; stays null otherwise
        final Chunk response;
        final double[] rowPreds;
        final double[] rowBuffer;
        if (importance) {
          response = chk_resp(chks); // Response
          rowPreds = new double[1 + _nclass]; // Row prediction
          rowBuffer = new double[_ncols]; // Pre-allocated row data
        } else {
          response = null;
          rowPreds = null;
          rowBuffer = null;
        }
        final Chunk oobCounter = chk_oobt(chks); // Out-of-bag rows counter over all trees

        for (int row = 0; row < oobCounter._len; row++) {
          // The out-of-bag flag is read from the node-id column of class 0
          final boolean oobRow =
              ScoreBuildHistogram.isOOBRow((int) chk_nids(chks, 0).at8(row));

          // For all trees (i.e., k-classes)
          for (int k = 0; k < _nclass; k++) {
            final DTree tree = _trees[k];
            if (tree == null) {
              continue; // Empty class is ignored
            }
            final Chunk nodeIds = chk_nids(chks, k); // Node-ids for this tree/class
            final int storedNid = (int) nodeIds.at8(row);

            // Only out-of-bag rows get an on-the-fly prediction from this tree
            if (oobRow) {
              final Chunk votes = chk_tree(chks, k); // Working column for class k
              int nid = ScoreBuildHistogram.oob2Nid(storedNid);
              // A row parked on an undecided node falls back to that node's parent
              if (tree.node(nid) instanceof UndecidedNode) {
                nid = tree.node(nid).pid();
              }

              final int leafNid;
              if (tree.root() instanceof LeafNode) {
                leafNid = 0;
              } else {
                DecidedNode decision = tree.decided(nid); // Must have a decision point
                // A node that could not decide defers to its parent
                if (decision._split.col() == -1) {
                  decision = tree.decided(tree.node(nid).pid());
                }
                leafNid = decision.ns(chks, row); // Decide down to a leafnode
              }

              // Classification: cumulative votes for this row.
              // Regression: cumulative sum of tree predictions, to be normalized by tree count.
              final double prediction = ((LeafNode) tree.node(leafNid)).pred();
              if (importance) {
                rowPreds[1 + k] = (float) prediction; // for both regression and classification
              }
              votes.set(row, (float) (votes.atd(row) + prediction));
            }

            // Reset the helper column for this row and this k-class
            nodeIds.set(row, 0);
          }

          if (!oobRow) {
            continue;
          }
          // This tree voted for the row: track the number of trees
          oobCounter.set(row, oobCounter.atd(row) + 1);

          if (!importance || response.isNA(row)) {
            continue;
          }
          if (isClassifier()) {
            final int predicted =
                getPrediction(
                    rowPreds,
                    _model._output._priorClassDist,
                    data_row(chks, row, rowBuffer),
                    _threshold);
            final int actual = (int) response.at8(row);
            if (predicted == actual) {
              rightVotes++; // No miss !
            }
          } else { // regression
            final double predicted = rowPreds[1];
            final double actual = response.atd(row);
            sse += (actual - predicted) * (actual - predicted);
          }
          allRows++;
        }
      }
Example #8
0
    // --------------------------------------------------------------------------
    // Build the next random k-trees representing tid-th tree.
    //
    // Phases, in order:
    //   1. create one tree with a root (undecided) node per class, skipping
    //      classes with a zero distribution entry and, when the model's binomial
    //      optimization is on, the 2nd tree of a 2-class problem;
    //   2. sample rows for every tree;
    //   3. grow all trees one layer at a time, up to _parms._max_depth layers;
    //   4. append leaf nodes holding predictions;
    //   5. run CollectPreds and record its out-of-bag measures;
    //   6. add the k-trees to the model output.
    //
    // Returns early, without adding trees to the model, if the job stops running
    // while layers are being built.  The tid parameter is not read here.
    private void buildNextKTrees(Frame fr, int mtrys, float sample_rate, Random rand, int tid) {
      // One tree per class
      final DTree[] ktrees = new DTree[_nclass];

      // Initial set of histograms.  All trees; one leaf per tree (the root
      // leaf); all columns
      DHistogram[][][] hcs = new DHistogram[_nclass][1 /*just root leaf*/][_ncols];

      // Adjust real bins for the top-levels
      final int adj_nbins = Math.max(_parms._nbins_top_level, _parms._nbins);

      final double[] distribution = _model._output._distribution;
      // Use for all k-trees the same seed. NOTE: this is only to make a fair
      // view for all k-trees
      final long rseed = rand.nextLong();

      // Initially setup as-if an empty-split had just happened
      for (int k = 0; k < _nclass; k++) {
        if (distribution[k] == 0) {
          continue; // Ignore missing classes
        }
        // The Boolean Optimization
        // This optimization assumes the 2nd tree of a 2-class system is the
        // inverse of the first (and that the same columns were picked)
        if (k == 1 && _nclass == 2 && _model.binomialOpt()) {
          continue;
        }
        ktrees[k] =
            new DRFTree(
                fr,
                _ncols,
                (char) _parms._nbins,
                (char) _parms._nbins_cats,
                (char) _nclass,
                _parms._min_rows,
                mtrys,
                rseed);
        // The constructed node is not kept in a local; presumably the constructor
        // registers it with ktrees[k] - verify against DRFUndecidedNode.
        new DRFUndecidedNode(
            ktrees[k],
            -1,
            DHistogram.initialHist(
                fr, _ncols, adj_nbins, _parms._nbins_cats, hcs[k][0])); // The "root" node
      }

      // Sample - mark the lines by putting 'OUT_OF_BAG' into nid(<klass>) vector
      Timer t_1 = new Timer();
      Sample[] ss = new Sample[_nclass];
      // Fork all sampling tasks first, then wait for each of them
      for (int k = 0; k < _nclass; k++) {
        if (ktrees[k] != null) {
          ss[k] =
              new Sample((DRFTree) ktrees[k], sample_rate)
                  .dfork(0, new Frame(vec_nids(fr, k), vec_resp(fr)), _parms._build_tree_one_node);
        }
      }
      for (int k = 0; k < _nclass; k++) {
        if (ss[k] != null) {
          ss[k].getResult();
        }
      }
      Log.debug("Sampling took: " + t_1);

      // Define a "working set" of leaf splits, from leafs[i] to tree._len for each tree i
      int[] leafs = new int[_nclass];

      // ----
      // One Big Loop till the ktrees are of proper depth.
      // Adds a layer to the trees each pass.
      Timer t_2 = new Timer();
      for (int depth = 0; depth < _parms._max_depth; depth++) {
        if (!isRunning()) {
          return;
        }
        hcs =
            buildLayer(
                fr,
                _parms._nbins,
                _parms._nbins_cats,
                ktrees,
                leafs,
                hcs,
                true,
                _parms._build_tree_one_node);
        // If we did not make any new splits, then the tree is split-to-death
        if (hcs == null) {
          break;
        }
      }
      Log.debug("Tree build took: " + t_2);

      // Each tree bottomed-out in a DecidedNode; go 1 more level and insert
      // LeafNodes to hold predictions.
      Timer t_3 = new Timer();
      for (int k = 0; k < _nclass; k++) {
        DTree tree = ktrees[k];
        if (tree == null) {
          continue;
        }
        // Remember where the leaves start; only nodes below this index are scanned
        int leaf = leafs[k] = tree.len();
        for (int nid = 0; nid < leaf; nid++) {
          if (!(tree.node(nid) instanceof DecidedNode)) {
            continue;
          }
          DecidedNode dn = tree.decided(nid);
          if (dn._split._col == -1) { // No decision here, no row should have this NID now
            if (nid == 0) { // Handle the trivial non-splitting tree
              LeafNode ln = new DRFLeafNode(tree, -1, 0);
              ln._pred =
                  (float)
                      (isClassifier() ? _model._output._priorClassDist[k] : _initialPrediction);
            }
            continue;
          }
          for (int i = 0; i < dn._nids.length; i++) {
            int cnid = dn._nids[i];
            if (cnid == -1
                || // Bottomed out (predictors or responses known constant)
                tree.node(cnid) instanceof UndecidedNode
                || // Or chopped off for depth
                (tree.node(cnid) instanceof DecidedNode
                    && // Or not possible to split
                    ((DecidedNode) tree.node(cnid))._split.col() == -1)) {
              LeafNode ln = new DRFLeafNode(tree, nid);
              ln._pred = (float) dn.pred(i); // Set prediction into the leaf
              dn._nids[i] = ln.nid(); // Mark a leaf here
            }
          }
        }
      } // -- k-trees are done
      Log.debug("Nodes propagation: " + t_3);

      // ----
      // Move rows into the final leaf rows
      Timer t_4 = new Timer();
      CollectPreds cp =
          new CollectPreds(ktrees, leafs, _model.defaultThreshold())
              .doAll(fr, _parms._build_tree_one_node);

      if (isClassifier()) {
        // Track right votes over OOB rows for this tree
        asVotes(_treeMeasuresOnOOB).append(cp.rightVotes, cp.allRows);
      } else {
        // Regression: track the squared error over the counted rows
        asSSE(_treeMeasuresOnOOB).append(cp.sse, cp.allRows);
      }
      Log.debug("CollectPreds done: " + t_4);

      // Grow the model by K-trees
      _model._output.addKTrees(ktrees);
    }