Beispiel #1
0
  @Override
  protected DRFModel buildModel(
      DRFModel model, final Frame fr, String names[], String domains[][], final Timer t_build) {
    // Append number of trees participating in on-the-fly scoring
    fr.add("OUT_BAG_TREES", response.makeZero());

    // The RNG used to pick split columns
    Random rand = createRNG(_seed);

    // Prepare working columns
    new SetWrkTask().doAll(fr);

    int tid;
    DTree[] ktrees = null;
    // Prepare tree statistics
    TreeStats tstats = new TreeStats();
    // Build trees until we hit the limit
    for (tid = 0; tid < ntrees; tid++) { // Building tid-tree
      model =
          doScoring(
              model, fr, ktrees, tid, tstats, tid == 0, !hasValidation(), build_tree_one_node);
      // At each iteration build K trees (K = nclass = response column domain size)

      // TODO: parallelize more? build more than k trees at each time, we need to care about
      // temporary data
      // Idea: launch more DRF at once.
      Timer kb_timer = new Timer();
      ktrees = buildNextKTrees(fr, _mtry, sample_rate, rand, tid);
      Log.info(Sys.DRF__, (tid + 1) + ". tree was built " + kb_timer.toString());
      if (!Job.isRunning(self())) break; // If canceled during building, do not bulkscore

      // Check latest predictions
      tstats.updateBy(ktrees);
    }

    model = doScoring(model, fr, ktrees, tid, tstats, true, !hasValidation(), build_tree_one_node);
    // Make sure that we did not miss any votes
    assert !importance
            || _treeMeasuresOnOOB.npredictors() == _treeMeasuresOnSOOB[0 /*variable*/].npredictors()
        : "Missing some tree votes in variable importance voting?!";

    return model;
  }
Beispiel #2
0
  // Start by splitting all the data according to some criteria (minimize
  // variance at the leaves).  Record on each row which split it goes to, and
  // assign a split number to it (for next pass).  On *this* pass, use the
  // split-number to build a per-split histogram, with a per-histogram-bucket
  // variance.
  @Override
  protected GBMModel buildModel(
      GBMModel model, final Frame fr, String names[], String domains[][], Timer t_build) {
    // Tag out rows missing the response column
    new ExcludeNAResponse().doAll(fr);

    // Build trees until we hit the limit
    int tid;
    DTree[] ktrees = null; // Trees
    TreeStats tstats = new TreeStats(); // Tree stats
    for (tid = 0; tid < ntrees; tid++) {
      // During first iteration model contains 0 trees, then 0-trees, then 1-tree,...
      // BUT if validation is not specified model does not participate in voting
      // but on-the-fly computed data are used
      model = doScoring(model, fr, ktrees, tid, tstats, false, false, false);
      // ESL2, page 387
      // Step 2a: Compute prediction (prob distribution) from prior tree results:
      //   Work <== f(Tree)
      new ComputeProb().doAll(fr);

      // ESL2, page 387
      // Step 2b i: Compute residuals from the prediction (probability distribution)
      //   Work <== f(Work)
      new ComputeRes().doAll(fr);

      // ESL2, page 387, Step 2b ii, iii, iv
      Timer kb_timer = new Timer();
      ktrees = buildNextKTrees(fr);
      Log.info(Sys.GBM__, (tid + 1) + ". tree was built in " + kb_timer.toString());
      if (!Job.isRunning(self())) break; // If canceled during building, do not bulkscore

      // Check latest predictions
      tstats.updateBy(ktrees);
    }
    // Final scoring
    model = doScoring(model, fr, ktrees, tid, tstats, true, false, false);

    return model;
  }
Beispiel #3
0
 /**
  * Creates a new ValueArray with classes. New ValueArray is not aligned with source one
  * unfortunately so have to send results to each chunk owner using Atomic.
  */
 @Override
 public void map(Key key) {
   assert key.home();
   if (Job.isRunning(_job.self())) {
     ValueArray va = DKV.get(_arykey).get();
     AutoBuffer bits = va.getChunk(key);
     long startRow = va.startRow(ValueArray.getChunkIndex(key));
     int rows = va.rpc(ValueArray.getChunkIndex(key));
     int rpc = (int) (ValueArray.CHUNK_SZ / ROW_SIZE);
     long chunk = ValueArray.chknum(startRow, va.numRows(), ROW_SIZE);
     long updatedChk = chunk;
     long updatedRow = startRow;
     double[] values = new double[_cols.length - 1];
     ClusterDist cd = new ClusterDist();
     int[] clusters = new int[rows];
     int count = 0;
     for (int row = 0; row < rows; row++) {
       KMeans.datad(va, bits, row, _cols, _normalized, values);
       KMeans.closest(_clusters, values, cd);
       chunk = ValueArray.chknum(startRow + row, va.numRows(), ROW_SIZE);
       if (chunk != updatedChk) {
         updateClusters(clusters, count, updatedChk, va.numRows(), rpc, updatedRow);
         updatedChk = chunk;
         updatedRow = startRow + row;
         count = 0;
       }
       clusters[count++] = cd._cluster;
     }
     if (count > 0) updateClusters(clusters, count, chunk, va.numRows(), rpc, updatedRow);
     _job.updateProgress(1);
   }
   _job = null;
   _arykey = null;
   _cols = null;
   _clusters = null;
 }
Beispiel #4
0
  // --------------------------------------------------------------------------
  // Build the next k-trees, which is trying to correct the residual error from
  // the prior trees.  From LSE2, page 387.  Step 2b ii, iii.
  private DTree[] buildNextKTrees(Frame fr) {
    // We're going to build K (nclass) trees - each focused on correcting
    // errors for a single class.
    final DTree[] ktrees = new DTree[_nclass];

    // Initial set of histograms.  All trees; one leaf per tree (the root
    // leaf); all columns
    DHistogram hcs[][][] = new DHistogram[_nclass][1 /*just root leaf*/][_ncols];

    for (int k = 0; k < _nclass; k++) {
      // Initially setup as-if an empty-split had just happened
      if (_distribution == null || _distribution[k] != 0) {
        // The Boolean Optimization
        // This optimization assumes the 2nd tree of a 2-class system is the
        // inverse of the first.  This is false for DRF (and true for GBM) -
        // DRF picks a random different set of columns for the 2nd tree.
        if (k == 1 && _nclass == 2) continue;
        ktrees[k] = new DTree(fr._names, _ncols, (char) nbins, (char) _nclass, min_rows);
        new GBMUndecidedNode(
            ktrees[k],
            -1,
            DHistogram.initialHist(fr, _ncols, nbins, hcs[k][0], false)); // The "root" node
      }
    }
    int[] leafs = new int[_nclass]; // Define a "working set" of leaf splits, from here to tree._len

    // ----
    // ESL2, page 387.  Step 2b ii.
    // One Big Loop till the ktrees are of proper depth.
    // Adds a layer to the trees each pass.
    int depth = 0;
    for (; depth < max_depth; depth++) {
      if (!Job.isRunning(self())) return null;

      hcs = buildLayer(fr, ktrees, leafs, hcs, false, false);

      // If we did not make any new splits, then the tree is split-to-death
      if (hcs == null) break;
    }

    // Each tree bottomed-out in a DecidedNode; go 1 more level and insert
    // LeafNodes to hold predictions.
    for (int k = 0; k < _nclass; k++) {
      DTree tree = ktrees[k];
      if (tree == null) continue;
      int leaf = leafs[k] = tree.len();
      for (int nid = 0; nid < leaf; nid++) {
        if (tree.node(nid) instanceof DecidedNode) {
          DecidedNode dn = tree.decided(nid);
          for (int i = 0; i < dn._nids.length; i++) {
            int cnid = dn._nids[i];
            if (cnid == -1
                || // Bottomed out (predictors or responses known constant)
                tree.node(cnid) instanceof UndecidedNode
                || // Or chopped off for depth
                (tree.node(cnid) instanceof DecidedNode
                    && // Or not possible to split
                    ((DecidedNode) tree.node(cnid))._split.col() == -1))
              dn._nids[i] = new GBMLeafNode(tree, nid).nid(); // Mark a leaf here
          }
          // Handle the trivial non-splitting tree
          if (nid == 0 && dn._split.col() == -1) new GBMLeafNode(tree, -1, 0);
        }
      }
    } // -- k-trees are done

    // ----
    // ESL2, page 387.  Step 2b iii.  Compute the gammas, and store them back
    // into the tree leaves.  Includes learn_rate.
    //    gamma_i_k = (nclass-1)/nclass * (sum res_i / sum (|res_i|*(1-|res_i|)))
    // For regression:
    //    gamma_i_k = sum res_i / count(res_i)
    GammaPass gp = new GammaPass(ktrees, leafs).doAll(fr);
    double m1class = _nclass > 1 ? (double) (_nclass - 1) / _nclass : 1.0; // K-1/K
    for (int k = 0; k < _nclass; k++) {
      final DTree tree = ktrees[k];
      if (tree == null) continue;
      for (int i = 0; i < tree._len - leafs[k]; i++) {
        double g =
            gp._gss[k][i] == 0 // Constant response?
                ? (gp._rss[k][i] == 0
                    ? 0
                    : 1000) // Cap (exponential) learn, instead of dealing with Inf
                : learn_rate * m1class * gp._rss[k][i] / gp._gss[k][i];
        assert !Double.isNaN(g);
        ((LeafNode) tree.node(leafs[k] + i))._pred = g;
      }
    }

    // ----
    // ESL2, page 387.  Step 2b iv.  Cache the sum of all the trees, plus the
    // new tree, in the 'tree' columns.  Also, zap the NIDs for next pass.
    // Tree <== f(Tree)
    // Nids <== 0
    new MRTask2() {
      @Override
      public void map(Chunk chks[]) {
        // For all tree/klasses
        for (int k = 0; k < _nclass; k++) {
          final DTree tree = ktrees[k];
          if (tree == null) continue;
          final Chunk nids = chk_nids(chks, k);
          final Chunk ct = chk_tree(chks, k);
          for (int row = 0; row < nids._len; row++) {
            int nid = (int) nids.at80(row);
            if (nid < 0) continue;
            ct.set0(row, (float) (ct.at0(row) + ((LeafNode) tree.node(nid))._pred));
            nids.set0(row, 0);
          }
        }
      }
    }.doAll(fr);

    // Collect leaves stats
    for (int i = 0; i < ktrees.length; i++)
      if (ktrees[i] != null) ktrees[i].leaves = ktrees[i].len() - leafs[i];
    // DEBUG: Print the generated K trees
    // printGenerateTrees(ktrees);

    return ktrees;
  }
Beispiel #5
0
  // --------------------------------------------------------------------------
  // Build the next random k-trees represeint tid-th tree
  private DTree[] buildNextKTrees(Frame fr, int mtrys, float sample_rate, Random rand, int tid) {
    // We're going to build K (nclass) trees - each focused on correcting
    // errors for a single class.
    final DTree[] ktrees = new DTree[_nclass];

    // Initial set of histograms.  All trees; one leaf per tree (the root
    // leaf); all columns
    DHistogram hcs[][][] = new DHistogram[_nclass][1 /*just root leaf*/][_ncols];

    // Use for all k-trees the same seed. NOTE: this is only to make a fair
    // view for all k-trees
    long rseed = rand.nextLong();
    // Initially setup as-if an empty-split had just happened
    for (int k = 0; k < _nclass; k++) {
      assert (_distribution != null && classification)
          || (_distribution == null && !classification);
      if (_distribution == null || _distribution[k] != 0) { // Ignore missing classes
        // The Boolean Optimization
        // This optimization assumes the 2nd tree of a 2-class system is the
        // inverse of the first.  This is false for DRF (and true for GBM) -
        // DRF picks a random different set of columns for the 2nd tree.
        // if( k==1 && _nclass==2 ) continue;
        ktrees[k] = new DRFTree(fr, _ncols, (char) nbins, (char) _nclass, min_rows, mtrys, rseed);
        boolean isBinom = classification;
        new DRFUndecidedNode(
            ktrees[k],
            -1,
            DHistogram.initialHist(fr, _ncols, nbins, hcs[k][0], isBinom)); // The "root" node
      }
    }

    // Sample - mark the lines by putting 'OUT_OF_BAG' into nid(<klass>) vector
    Timer t_1 = new Timer();
    Sample ss[] = new Sample[_nclass];
    for (int k = 0; k < _nclass; k++)
      if (ktrees[k] != null)
        ss[k] =
            new Sample((DRFTree) ktrees[k], sample_rate)
                .dfork(0, new Frame(vec_nids(fr, k), vec_resp(fr, k)), build_tree_one_node);
    for (int k = 0; k < _nclass; k++) if (ss[k] != null) ss[k].getResult();
    Log.debug(Sys.DRF__, "Sampling took: + " + t_1);

    int[] leafs =
        new int
            [_nclass]; // Define a "working set" of leaf splits, from leafs[i] to tree._len for each
                       // tree i

    // ----
    // One Big Loop till the ktrees are of proper depth.
    // Adds a layer to the trees each pass.
    Timer t_2 = new Timer();
    int depth = 0;
    for (; depth < max_depth; depth++) {
      if (!Job.isRunning(self())) return null;

      hcs = buildLayer(fr, ktrees, leafs, hcs, true, build_tree_one_node);

      // If we did not make any new splits, then the tree is split-to-death
      if (hcs == null) break;
    }
    Log.debug(Sys.DRF__, "Tree build took: " + t_2);

    // Each tree bottomed-out in a DecidedNode; go 1 more level and insert
    // LeafNodes to hold predictions.
    Timer t_3 = new Timer();
    for (int k = 0; k < _nclass; k++) {
      DTree tree = ktrees[k];
      if (tree == null) continue;
      int leaf = leafs[k] = tree.len();
      for (int nid = 0; nid < leaf; nid++) {
        if (tree.node(nid) instanceof DecidedNode) {
          DecidedNode dn = tree.decided(nid);
          for (int i = 0; i < dn._nids.length; i++) {
            int cnid = dn._nids[i];
            if (cnid == -1
                || // Bottomed out (predictors or responses known constant)
                tree.node(cnid) instanceof UndecidedNode
                || // Or chopped off for depth
                (tree.node(cnid) instanceof DecidedNode
                    && // Or not possible to split
                    ((DecidedNode) tree.node(cnid))._split.col() == -1)) {
              LeafNode ln = new DRFLeafNode(tree, nid);
              ln._pred = dn.pred(i); // Set prediction into the leaf
              dn._nids[i] = ln.nid(); // Mark a leaf here
            }
          }
          // Handle the trivial non-splitting tree
          if (nid == 0 && dn._split.col() == -1) new DRFLeafNode(tree, -1, 0);
        }
      }
    } // -- k-trees are done
    Log.debug(Sys.DRF__, "Nodes propagation: " + t_3);

    // ----
    // Move rows into the final leaf rows
    Timer t_4 = new Timer();
    CollectPreds cp = new CollectPreds(ktrees, leafs).doAll(fr, build_tree_one_node);
    if (importance) {
      if (classification)
        asVotes(_treeMeasuresOnOOB)
            .append(cp.rightVotes, cp.allRows); // Track right votes over OOB rows for this tree
      else /* regression */ asSSE(_treeMeasuresOnOOB).append(cp.sse, cp.allRows);
    }
    Log.debug(Sys.DRF__, "CollectPreds done: " + t_4);

    // Collect leaves stats
    for (int i = 0; i < ktrees.length; i++)
      if (ktrees[i] != null) ktrees[i].leaves = ktrees[i].len() - leafs[i];
    // DEBUG: Print the generated K trees
    // printGenerateTrees(ktrees);

    return ktrees;
  }
Beispiel #6
0
 /**
  * Check if given job is running.
  *
  * @param job_key job key
  * @return true if job is still running else returns false.
  */
 public static boolean isRunning(Key job_key) {
   Job j = UKV.get(job_key);
   assert j != null : "Job should be always in DKV!";
   return j.isRunning();
 }