Esempio n. 1
0
  /**
   * Grows the forest iteration by iteration. The model is scored before every iteration and once
   * more after the loop has ended.
   *
   * <p>Before the loop an extra zero column named {@code OUT_BAG_TREES} is appended to {@code fr},
   * the random generator is created from {@code _seed}, and {@code SetWrkTask} is run over the
   * frame.
   *
   * <p>{@code names}, {@code domains} and {@code t_build} are not read by this implementation.
   *
   * @param model model handed to, and replaced by the result of, each {@code doScoring} call
   * @param fr working frame; it is modified (a column is added, tasks are run over it)
   * @return the model returned by the last {@code doScoring} call
   */
  @Override
  protected DRFModel buildModel(
      DRFModel model, final Frame fr, String names[], String domains[][], final Timer t_build) {
    // Column counting the trees that take part in on-the-fly scoring
    fr.add("OUT_BAG_TREES", response.makeZero());

    // Random generator used for picking split columns; passed to every buildNextKTrees call
    final Random splitRng = createRNG(_seed);

    // Set up the working columns
    new SetWrkTask().doAll(fr);

    final TreeStats stats = new TreeStats();
    DTree[] latestTrees = null; // stays null until the first iteration has built its trees
    int treeIdx = 0;
    while (treeIdx < ntrees) {
      final boolean firstIteration = treeIdx == 0;
      model =
          doScoring(
              model,
              fr,
              latestTrees,
              treeIdx,
              stats,
              firstIteration,
              !hasValidation(),
              build_tree_one_node);

      // Each iteration builds K trees (K = nclass = response column domain size).
      // TODO: parallelize more? Building more than K trees at a time requires care about the
      // temporary data. Idea: launch more DRF at once.
      final Timer iterationTimer = new Timer();
      latestTrees = buildNextKTrees(fr, _mtry, sample_rate, splitRng, treeIdx);
      Log.info(Sys.DRF__, (treeIdx + 1) + ". tree was built " + iterationTimer.toString());

      // Canceled while building: leave the loop without updating the statistics or the index.
      // NOTE(review): the final doScoring below still runs after this break - confirm that this
      // matches the "do not bulkscore" intent.
      if (!Job.isRunning(self())) {
        break;
      }

      // Fold the freshly built trees into the statistics
      stats.updateBy(latestTrees);
      treeIdx++;
    }

    model =
        doScoring(
            model, fr, latestTrees, treeIdx, stats, true, !hasValidation(), build_tree_one_node);

    // With variable importance enabled, both vote holders must report the same predictor count
    assert !(importance
            && _treeMeasuresOnOOB.npredictors()
                != _treeMeasuresOnSOOB[0 /*variable*/].npredictors())
        : "Missing some tree votes in variable importance voting?!";

    return model;
  }
Esempio n. 2
0
  /**
   * Builds the boosted model one round at a time.
   *
   * <p>Start by splitting all the data according to some criteria (minimize variance at the
   * leaves). Record on each row which split it goes to, and assign a split number to it (for the
   * next pass). On <em>this</em> pass, use the split number to build a per-split histogram, with a
   * per-histogram-bucket variance.
   *
   * <p>{@code names}, {@code domains} and {@code t_build} are not read by this implementation.
   *
   * @param model model handed to, and replaced by the result of, each {@code doScoring} call
   * @param fr working frame; the tasks below are run over it
   * @return the model returned by the final {@code doScoring} call
   */
  @Override
  protected GBMModel buildModel(
      GBMModel model, final Frame fr, String names[], String domains[][], Timer t_build) {
    // Tag out rows missing the response column
    new ExcludeNAResponse().doAll(fr);

    final TreeStats stats = new TreeStats();
    DTree[] latestTrees = null; // stays null until the first round has built its trees
    int round = 0;
    while (round < ntrees) {
      // In the first round the model contains 0 trees, then 0 trees, then 1 tree, ...
      // BUT if validation is not specified the model does not participate in voting and the
      // on-the-fly computed data are used instead.
      model = doScoring(model, fr, latestTrees, round, stats, false, false, false);

      // ESL2, page 387, step 2a: compute the prediction (probability distribution) from prior
      // tree results:  Work <== f(Tree)
      new ComputeProb().doAll(fr);

      // ESL2, page 387, step 2b i: compute residuals from the prediction:  Work <== f(Work)
      new ComputeRes().doAll(fr);

      // ESL2, page 387, steps 2b ii, iii, iv
      final Timer roundTimer = new Timer();
      latestTrees = buildNextKTrees(fr);
      Log.info(Sys.GBM__, (round + 1) + ". tree was built in " + roundTimer.toString());

      // Canceled while building: leave the loop without updating the statistics or the counter.
      // NOTE(review): the final doScoring below still runs after this break - confirm that this
      // matches the "do not bulkscore" intent.
      if (!Job.isRunning(self())) {
        break;
      }

      // Fold the freshly built trees into the statistics
      stats.updateBy(latestTrees);
      round++;
    }

    // Final scoring
    return doScoring(model, fr, latestTrees, round, stats, true, false, false);
  }
Esempio n. 3
0
    /**
     * Drives the forest build: resolves {@code _mtry}, prepares the working columns on
     * {@code _train}, optionally replays out-of-bag scoring for a checkpointed model, and then
     * alternates scoring and tree building for up to {@code _parms._ntrees} iterations.
     *
     * <p>The method returns early, without the closing {@code doScoringAndSaveModel} call, when
     * the value returned by scoring reaches {@code _parms._r2_stopping} or when
     * {@code isRunning()} is false after an iteration.
     *
     * @throws IllegalArgumentException if the resolved {@code _mtry} is outside
     *     {@code [1, _ncols]}
     */
    @Override
    protected void buildModel() {
      // FIXME: Test/Investigate starting with the class distribution as the null model, i.e. for
      // _nclass >= 2 run an MRTask over vec_tree(_train, c) for every class c that sets each row
      // of the tree chunk to _model._output._priorClassDist[c].

      // _mtries == -1 selects the default: sqrt(_ncols) for classification, _ncols / 3 for
      // regression, in both cases at least 1. Any other value is taken as given.
      _mtry =
          (_parms._mtries == -1)
              ? (isClassifier() ? Math.max((int) Math.sqrt(_ncols), 1) : Math.max(_ncols / 3, 1))
              : _parms._mtries;
      if (!(1 <= _mtry && _mtry <= _ncols)) {
        throw new IllegalArgumentException(
            "Computed mtry should be in interval <1,#cols> but it is " + _mtry);
      }

      // Initialize TreeVotes for classification, MSE arrays for regression
      initTreeMeasurements();
      // Append number of trees participating in on-the-fly scoring
      _train.add("OUT_BAG_TREES", _response.makeZero());
      // Prepare working columns
      new SetWrkTask().doAll(_train);

      // With a checkpoint, recompute the tree_<_> and oob columns from the predictions of the
      // previously built trees.
      // NOTE(review): the original comment said this happens only when OOB validation is
      // requested, but no such condition is checked here - confirm.
      if (_parms._checkpoint) {
        Timer t = new Timer();
        // Compute oob votes for each output level
        new OOBScorer(
                _ncols,
                _nclass,
                (hasWeights() ? 1 : 0) + (hasOffset() ? 1 : 0),
                _parms._sample_rate,
                _model._output._treeKeys)
            .doAll(_train);
        Log.info("Reconstructing oob stats from checkpointed model took " + t);
      }

      // The RNG used to pick split columns
      Random rand = createRNG(_parms._seed);
      // To stay deterministic, draw one number per previously built tree so the generator ends up
      // in the same state as if those trees had been built in this run
      for (int i = 0; i < _ntreesFromCheckpoint; i++) {
        rand.nextLong();
      }

      // Build trees until we hit the limit
      for (int tid = 0; tid < _parms._ntrees; tid++) { // Building tid-tree
        // Do not make the initial scoring if a model already exists (checkpoint restart)
        if (tid != 0 || !_parms._checkpoint) {
          double training_r2 = doScoringAndSaveModel(false, true, _parms._build_tree_one_node);
          if (training_r2 >= _parms._r2_stopping) {
            return; // Stop when approaching round-off error
          }
        }

        // At each iteration build K trees (K = nclass = response column domain size).
        // TODO: parallelize more? Building more than K trees at a time requires care about the
        // temporary data. Idea: launch more DRF at once.
        Timer kb_timer = new Timer();
        buildNextKTrees(_train, _mtry, _parms._sample_rate, rand, tid);
        Log.info((tid + 1) + ". tree was built " + kb_timer.toString());
        DRF.this.update(1);
        if (!isRunning()) {
          return; // If canceled during building, do not bulkscore
        }
      }
      doScoringAndSaveModel(true, true, _parms._build_tree_one_node);
    }