protected double doScoringAndSaveModel( boolean finalScoring, boolean oob, boolean build_tree_one_node) { double training_r2 = Double.NaN; // Training R^2 value, if computed long now = System.currentTimeMillis(); if (_firstScore == 0) _firstScore = now; long sinceLastScore = now - _timeLastScoreStart; boolean updated = false; new ProgressUpdate( "Built " + _model._output._ntrees + " trees so far (out of " + _parms._ntrees + ").") .fork(_progressKey); // Now model already contains tid-trees in serialized form if (_parms._score_each_iteration || finalScoring || (now - _firstScore < 4000) || // Score every time for 4 secs // Throttle scoring to keep the cost sane; limit to a 10% duty cycle & every 4 secs (sinceLastScore > 4000 && // Limit scoring updates to every 4sec (double) (_timeLastScoreEnd - _timeLastScoreStart) / sinceLastScore < 0.1)) { // 10% duty cycle checkMemoryFootPrint(); // If validation is specified we use a model for scoring, so we need to // update it! First we save model with trees (i.e., make them available // for scoring) and then update it with resulting error _model.update(_key); updated = true; Log.info("============================================================== "); SharedTreeModel.SharedTreeOutput out = _model._output; _timeLastScoreStart = now; // Score on training data new ProgressUpdate("Scoring the model.").fork(_progressKey); Score sc = new Score(this, true, oob, _model._output.getModelCategory()) .doAll(train(), build_tree_one_node); ModelMetrics mm = sc.makeModelMetrics(_model, _parms.train()); out._training_metrics = mm; if (oob) out._training_metrics._description = "Metrics reported on Out-Of-Bag training samples"; out._scored_train[out._ntrees].fillFrom(mm); if (out._ntrees > 0) Log.info("Training " + out._scored_train[out._ntrees].toString()); // Score again on validation data if (_parms._valid != null) { Score scv = new Score(this, false, false, _model._output.getModelCategory()) .doAll(valid(), build_tree_one_node); ModelMetrics mmv = scv.makeModelMetrics(_model, _parms.valid()); out._validation_metrics = mmv; out._scored_valid[out._ntrees].fillFrom(mmv); if (out._ntrees > 0) Log.info("Validation " + out._scored_valid[out._ntrees].toString()); } if (out._ntrees > 0) { // Compute variable importances out._model_summary = createModelSummaryTable(out); out._scoring_history = createScoringHistoryTable(out); out._varimp = new hex.VarImp(_improvPerVar, out._names); out._variable_importances = hex.ModelMetrics.calcVarImp(out._varimp); Log.info(out._model_summary.toString()); // For Debugging: // Log.info(out._scoring_history.toString()); // Log.info(out._variable_importances.toString()); } ConfusionMatrix cm = mm.cm(); if (cm != null) { if (cm._cm.length <= _parms._max_confusion_matrix_size) { Log.info(cm.toASCII()); } else { Log.info( "Confusion Matrix is too large (max_confusion_matrix_size=" + _parms._max_confusion_matrix_size + "): " + _nclass + " classes."); } } _timeLastScoreEnd = System.currentTimeMillis(); } // Double update - after either scoring or variable importance if (updated) _model.update(_key); return training_r2; }
// Main worker thread @Override protected void compute2() { KMeansModel model = null; try { init(true); // Do lock even before checking the errors, since this block is finalized by unlock // (not the best solution, but the code is more readable) _parms.read_lock_frames(KMeans.this); // Fetch & read-lock input frames // Something goes wrong if (error_count() > 0) throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(KMeans.this); // The model to be built model = new KMeansModel(dest(), _parms, new KMeansModel.KMeansOutput(KMeans.this)); model.delete_and_lock(_key); // final Vec vecs[] = _train.vecs(); // mults & means for standardization final double[] means = _train.means(); // means are used to impute NAs final double[] mults = _parms._standardize ? _train.mults() : null; final int[] impute_cat = new int[vecs.length]; for (int i = 0; i < vecs.length; i++) impute_cat[i] = vecs[i].isNumeric() ? -1 : DataInfo.imputeCat(vecs[i]); model._output._normSub = means; model._output._normMul = mults; // Initialize cluster centers and standardize if requested double[][] centers = initial_centers(model, vecs, means, mults, impute_cat); if (centers == null) return; // Stopped/cancelled during center-finding double[][] oldCenters = null; // --- // Run the main KMeans Clustering loop // Stop after enough iterations or average_change < TOLERANCE model._output._iterations = 0; // Loop ends only when iterations > max_iterations with strict inequality while (!isDone(model, centers, oldCenters)) { Lloyds task = new Lloyds(centers, means, mults, impute_cat, _isCats, _parms._k, hasWeightCol()) .doAll(vecs); // Pick the max categorical level for cluster center max_cats(task._cMeans, task._cats, _isCats); // Handle the case where some centers go dry. Rescue only 1 cluster // per iteration ('cause we only tracked the 1 worst row) if (cleanupBadClusters(task, vecs, centers, means, mults, impute_cat)) continue; // Compute model stats; update standardized cluster centers oldCenters = centers; centers = computeStatsFillModel(task, model, vecs, means, mults, impute_cat); model.update(_key); // Update model in K/V store update(1); // One unit of work if (model._parms._score_each_iteration) Log.info(model._output._model_summary); } Log.info(model._output._model_summary); // Log.info(model._output._scoring_history); // // Log.info(((ModelMetricsClustering)model._output._training_metrics).createCentroidStatsTable().toString()); // At the end: validation scoring (no need to gather scoring history) if (_valid != null) { model.score(_parms.valid()).delete(); // this appends a ModelMetrics on the validation set model._output._validation_metrics = ModelMetrics.getFromDKV(model, _parms.valid()); model.update(_key); // Update model in K/V store } done(); // Job done! } catch (Throwable t) { Job thisJob = DKV.getGet(_key); if (thisJob._state == JobState.CANCELLED) { Log.info("Job cancelled by user."); } else { t.printStackTrace(); failed(t); throw t; } } finally { updateModelOutput(); if (model != null) model.unlock(_key); _parms.read_unlock_frames(KMeans.this); } tryComplete(); }
@Override protected void compute2() { _model = null; // Resulting model! try { Scope.enter(); // Cleanup temp keys init(true); // Do any expensive tests & conversions now // Do lock even before checking the errors, since this block is finalized by unlock // (not the best solution, but the code is more readable) _parms.read_lock_frames(SharedTree.this); // Fetch & read-lock input frames if (error_count() > 0) throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(SharedTree.this); // New Model? Or continuing from a checkpoint? if (_parms._checkpoint && DKV.get(_parms._model_id) != null) { _model = DKV.get(_dest).get(); _model.write_lock(_key); // do not delete previous model; we are extending it } else { // New Model // Compute the zero-tree error - guessing only the class distribution. // MSE is stddev squared when guessing for regression. // For classification, guess the largest class. _model = makeModel( _dest, _parms, initial_MSE(_response, _response), _valid == null ? Double.NaN : initial_MSE(_response, _vresponse)); // Make a fresh model _model.delete_and_lock(_key); // and clear & write-lock it (smashing any prior) _model._output._init_f = _initialPrediction; } // Compute the response domain; makes for nicer printouts String[] domain = _response.domain(); assert (_nclass > 1 && domain != null) || (_nclass == 1 && domain == null); if (_nclass == 1) domain = new String[] {"r"}; // For regression, give a name to class 0 // Compute class distribution, used to for initial guesses and to // upsample minority classes (if asked for). if (_nclass > 1) { // Classification? // Handle imbalanced classes by stratified over/under-sampling. // initWorkFrame sets the modeled class distribution, and // model.score() corrects the probabilities back using the // distribution ratios if (_model._output.isClassifier() && _parms._balance_classes) { float[] trainSamplingFactors = new float [_train .lastVec() .domain() .length]; // leave initialized to 0 -> will be filled up below if (_parms._class_sampling_factors != null) { if (_parms._class_sampling_factors.length != _train.lastVec().domain().length) throw new IllegalArgumentException( "class_sampling_factors must have " + _train.lastVec().domain().length + " elements"); trainSamplingFactors = _parms._class_sampling_factors.clone(); // clone: don't modify the original } Frame stratified = water.util.MRUtils.sampleFrameStratified( _train, _train.lastVec(), _train.vec(_model._output.weightsName()), trainSamplingFactors, (long) (_parms._max_after_balance_size * _train.numRows()), _parms._seed, true, false); if (stratified != _train) { _train = stratified; _response = stratified.vec(_parms._response_column); _weights = stratified.vec(_parms._weights_column); // Recompute distribution since the input frame was modified MRUtils.ClassDist cdmt2 = _weights != null ? new MRUtils.ClassDist(_nclass).doAll(_response, _weights) : new MRUtils.ClassDist(_nclass).doAll(_response); _model._output._distribution = cdmt2.dist(); _model._output._modelClassDist = cdmt2.rel_dist(); } } Log.info("Prior class distribution: " + Arrays.toString(_model._output._priorClassDist)); Log.info("Model class distribution: " + Arrays.toString(_model._output._modelClassDist)); } // Also add to the basic working Frame these sets: // nclass Vecs of current forest results (sum across all trees) // nclass Vecs of working/temp data // nclass Vecs of NIDs, allowing 1 tree per class // Current forest values: results of summing the prior M trees for (int i = 0; i < _nclass; i++) _train.add("Tree_" + domain[i], _response.makeZero()); // Initial work columns. Set-before-use in the algos. for (int i = 0; i < _nclass; i++) _train.add("Work_" + domain[i], _response.makeZero()); // One Tree per class, each tree needs a NIDs. For empty classes use a -1 // NID signifying an empty regression tree. for (int i = 0; i < _nclass; i++) _train.add( "NIDs_" + domain[i], _response.makeCon( _model._output._distribution == null ? 0 : (_model._output._distribution[i] == 0 ? -1 : 0))); // Tag out rows missing the response column new ExcludeNAResponse().doAll(_train); // Variable importance: squared-error-improvement-per-variable-per-split _improvPerVar = new float[_ncols]; // Sub-class tree-model-builder specific build code buildModel(); done(); // Job done! } catch (Throwable t) { Job thisJob = DKV.getGet(_key); if (thisJob._state == JobState.CANCELLED) { Log.info("Job cancelled by user."); } else { t.printStackTrace(); failed(t); throw t; } } finally { if (_model != null) _model.unlock(_key); _parms.read_unlock_frames(SharedTree.this); if (_model == null) Scope.exit(); else { Scope.exit( _model._key, ModelMetrics.buildKey(_model, _parms.train()), ModelMetrics.buildKey(_model, _parms.valid())); } } tryComplete(); }