// On failure: drop the destination key, record the error on the job's progress object (if any), and cancel.
public void onException(Throwable ex) {
  UKV.remove(dest());
  Value v = DKV.get(progressKey());
  if (v != null) {
    ChunkProgress p = v.get();
    p = p.error(ex.getMessage());
    DKV.put(progressKey(), p);
  }
  cancel(ex);
}
private static void addFolder(FileSystem fs, Path p, JsonArray succeeded, JsonArray failed) {
  try {
    if (fs == null) return;
    for (FileStatus file : fs.listStatus(p)) {
      Path pfs = file.getPath();
      if (file.isDir()) {
        addFolder(fs, pfs, succeeded, failed);
      } else {
        Key k = Key.make(pfs.toString());
        long size = file.getLen();
        Value val = null;
        if (pfs.getName().endsWith(Extensions.JSON)) {
          JsonParser parser = new JsonParser();
          JsonObject json = parser.parse(new InputStreamReader(fs.open(pfs))).getAsJsonObject();
          JsonElement v = json.get(Constants.VERSION);
          if (v == null) throw new RuntimeException("Missing version");
          JsonElement type = json.get(Constants.TYPE);
          if (type == null) throw new RuntimeException("Missing type");
          Class c = Class.forName(type.getAsString());
          OldModel model = (OldModel) c.newInstance();
          model.fromJson(json);
        } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
          FSDataInputStream s = fs.open(pfs);
          int sz = (int) Math.min(1L << 20, size); // Read up to the 1st meg
          byte[] mem = MemoryManager.malloc1(sz);
          s.readFully(mem);
          // Convert to a ValueArray (hope it fits in 1Meg!)
          ValueArray ary = new ValueArray(k, 0).read(new AutoBuffer(mem));
          val = new Value(k, ary, Value.HDFS);
        } else if (size >= 2 * ValueArray.CHUNK_SZ) {
          val = new Value(k, new ValueArray(k, size), Value.HDFS); // ValueArray byte wrapper over a large file
        } else {
          val = new Value(k, (int) size, Value.HDFS); // Plain Value
          val.setdsk();
        }
        DKV.put(k, val);
        Log.info("PersistHdfs: DKV.put(" + k + ")");
        JsonObject o = new JsonObject();
        o.addProperty(Constants.KEY, k.toString());
        o.addProperty(Constants.FILE, pfs.toString());
        o.addProperty(Constants.VALUE_SIZE, file.getLen());
        succeeded.add(o);
      }
    }
  } catch (Exception e) {
    Log.err(e);
    JsonObject o = new JsonObject();
    o.addProperty(Constants.FILE, p.toString());
    o.addProperty(Constants.ERROR, e.getMessage());
    failed.add(o);
  }
}
/**
 * Initialize the ModelBuilder, validating all arguments and preparing the training frame. This
 * call is expected to be overridden in the subclasses and each subclass will start with
 * "super.init();". This call is made by the front-end whenever the GUI is clicked, and needs to
 * be fast; heavy-weight prep needs to wait for the trainModel() call.
 *
 * <p>Validate the requested ntrees; precompute actual ntrees. Validate the number of classes to
 * predict on; validate a checkpoint.
 */
@Override
public void init(boolean expensive) {
  super.init(expensive);
  if (H2O.ARGS.client && _parms._build_tree_one_node)
    error("_build_tree_one_node", "Cannot run on a single node in client mode");
  if (_vresponse != null) _vresponse_key = _vresponse._key;
  if (_response != null) _response_key = _response._key;
  if (_nclass > SharedTreeModel.SharedTreeParameters.MAX_SUPPORTED_LEVELS)
    error("_nclass", "Too many levels in response column!");
  if (_parms._min_rows < 0) error("_min_rows", "Requested min_rows must be greater than 0");
  if (_parms._ntrees < 0 || _parms._ntrees > 100000)
    error("_ntrees", "Requested ntrees must be between 1 and 100000");
  _ntrees = _parms._ntrees; // Total trees in final model
  if (_parms._checkpoint) { // Asking to continue from checkpoint?
    Value cv = DKV.get(_parms._model_id);
    if (cv != null) { // Look for prior model
      M checkpointModel = cv.get();
      if (_parms._ntrees < checkpointModel._output._ntrees + 1)
        error(
            "_ntrees",
            "Requested ntrees must be between "
                + (checkpointModel._output._ntrees + 1)
                + " and 100000");
      _ntrees = _parms._ntrees - checkpointModel._output._ntrees; // Needed trees
    }
  }
  if (_parms._nbins <= 1) error("_nbins", "_nbins must be > 1.");
  if (_parms._nbins >= 1 << 16) error("_nbins", "_nbins must be < " + (1 << 16));
  if (_parms._nbins_cats <= 1) error("_nbins_cats", "_nbins_cats must be > 1.");
  if (_parms._nbins_cats >= 1 << 16) error("_nbins_cats", "_nbins_cats must be < " + (1 << 16));
  if (_parms._max_depth <= 0) error("_max_depth", "_max_depth must be > 0.");
  if (_parms._min_rows <= 0) error("_min_rows", "_min_rows must be > 0.");
  if (_parms._distribution == Distributions.Family.tweedie) {
    _parms._distribution.tweedie.p = _parms._tweedie_power;
  }
  if (_train != null) {
    double sumWeights =
        _train.numRows() * (hasWeightCol() ? _train.vec(_parms._weights_column).mean() : 1);
    if (sumWeights < 2 * _parms._min_rows) // Need at least 2*min_rows weighted rows to split even once
      error(
          "_min_rows",
          "The dataset size is too small to split for min_rows="
              + _parms._min_rows
              + ": must have at least "
              + 2 * _parms._min_rows
              + " (weighted) rows, but have only "
              + sumWeights
              + ".");
  }
  if (_train != null) _ncols = _train.numCols() - 1 - numSpecialCols();
}
/**
 * Initialize the ModelBuilder, validating all arguments and preparing the training frame. This
 * call is expected to be overridden in the subclasses and each subclass will start with
 * "super.init();". This call is made by the front-end whenever the GUI is clicked, and needs to
 * be fast; heavy-weight prep needs to wait for the trainModel() call.
 *
 * <p>Validate the requested ntrees; precompute actual ntrees. Validate the number of classes to
 * predict on; validate a checkpoint.
 */
@Override
public void init(boolean expensive) {
  super.init(expensive);
  if (H2O.ARGS.client && _parms._build_tree_one_node)
    error("_build_tree_one_node", "Cannot run on a single node in client mode");
  if (_vresponse != null) _vresponse_key = _vresponse._key;
  if (_response != null) _response_key = _response._key;
  if (_parms._min_rows < 0) error("_min_rows", "Requested min_rows must be greater than 0");
  if (_parms._ntrees < 0 || _parms._ntrees > MAX_NTREES)
    error("_ntrees", "Requested ntrees must be between 1 and " + MAX_NTREES);
  _ntrees = _parms._ntrees; // Total trees in final model
  if (_parms.hasCheckpoint()) { // Asking to continue from checkpoint?
    Value cv = DKV.get(_parms._checkpoint);
    if (cv != null) { // Look for prior model
      M checkpointModel = cv.get();
      try {
        _parms.validateWithCheckpoint(checkpointModel._parms);
      } catch (H2OIllegalArgumentException e) {
        error(e.values.get("argument").toString(), e.values.get("value").toString());
      }
      if (_parms._ntrees < checkpointModel._output._ntrees + 1)
        error(
            "_ntrees",
            "If checkpoint is specified then requested ntrees must be higher than "
                + (checkpointModel._output._ntrees + 1));
      // Compute number of trees to build for this checkpoint
      _ntrees = _parms._ntrees - checkpointModel._output._ntrees; // Needed trees
    }
  }
  if (_parms._nbins <= 1) error("_nbins", "_nbins must be > 1.");
  if (_parms._nbins >= 1 << 16) error("_nbins", "_nbins must be < " + (1 << 16));
  if (_parms._nbins_cats <= 1) error("_nbins_cats", "_nbins_cats must be > 1.");
  if (_parms._nbins_cats >= 1 << 16) error("_nbins_cats", "_nbins_cats must be < " + (1 << 16));
  if (_parms._max_depth <= 0) error("_max_depth", "_max_depth must be > 0.");
  if (_parms._min_rows <= 0) error("_min_rows", "_min_rows must be > 0.");
  if (_train != null) {
    double sumWeights =
        _train.numRows() * (hasWeightCol() ? _train.vec(_parms._weights_column).mean() : 1);
    if (sumWeights < 2 * _parms._min_rows) // Need at least 2*min_rows weighted rows to split even once
      error(
          "_min_rows",
          "The dataset size is too small to split for min_rows="
              + _parms._min_rows
              + ": must have at least "
              + 2 * _parms._min_rows
              + " (weighted) rows, but have only "
              + sumWeights
              + ".");
  }
  if (_train != null) _ncols = _train.numCols() - 1 - numSpecialCols();
}
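// The checkpoint branch above only computes how many *additional* trees the continued run must
// build. A minimal, self-contained sketch (not from the H2O sources) of that arithmetic; the
// numbers and names are hypothetical.
public class CheckpointNtreesExample {
  public static void main(String[] args) {
    int requestedNtrees = 100; // plays the role of _parms._ntrees
    int checkpointNtrees = 40; // plays the role of checkpointModel._output._ntrees

    // init() rejects a request that would not add at least one tree on top of the checkpoint ...
    if (requestedNtrees < checkpointNtrees + 1)
      throw new IllegalArgumentException(
          "Requested ntrees must be higher than " + (checkpointNtrees + 1));

    // ... and schedules only the difference for this run (the _ntrees field).
    int treesToBuild = requestedNtrees - checkpointNtrees;
    System.out.println("Additional trees to build: " + treesToBuild); // prints 60
  }
}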
/**
 * TimeAveraging as part of Elastic Averaging Algorithm. Cf. equation 6 of arXiv:1412.6651v5
 *
 * @param nodeAverageModel current average of per-node models
 * @return Time-average of node-averages (consensus model, "the" model)
 */
public static DeepLearningModelInfo timeAverage(DeepLearningModelInfo nodeAverageModel) {
  float pa = (float) nodeAverageModel.get_params()._elastic_averaging_moving_rate;
  assert (pa > 0 && pa <= 1);
  DeepLearningModelInfo elasticAverage =
      DKV.getGet(nodeAverageModel.elasticAverageModelInfoKey()); // get latest version from DKV
  if (elasticAverage == null || pa == 1) {
    elasticAverage = nodeAverageModel.deep_clone();
  } else {
    nodeAverageModel.mult(pa);
    elasticAverage.mult(1 - pa);
    elasticAverage.add(nodeAverageModel); // ignore processed local value set here
    elasticAverage.set_processed_global(nodeAverageModel.get_processed_global());
  }
  elasticAverage.set_processed_local(0);
  DKV.put(elasticAverage.elasticAverageModelInfoKey(), elasticAverage);
  // nodeAverageModel.computeStats();
  // elasticAverage.computeStats();
  // Log.info("Local Model :\n" + nodeAverageModel.toString());
  // Log.info("Elastic Average:\n" + elasticAverage.toString());
  return elasticAverage;
}
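// The mult/add calls above implement an exponential moving average of the consensus ("elastic
// average") model toward the latest node average: elastic = (1 - pa) * elastic + pa * nodeAverage
// (cf. equation 6 of arXiv:1412.6651). A minimal sketch (not from the H2O sources) of the same
// blend on plain weight arrays; names and the moving rate below are illustrative.
public class ElasticAverageSketch {
  /** Blends the consensus weights toward the node-average weights in place. */
  static void timeAverage(float[] consensus, float[] nodeAverage, float movingRate) {
    assert movingRate > 0 && movingRate <= 1; // mirrors the assert on pa above
    for (int i = 0; i < consensus.length; i++)
      consensus[i] = (1 - movingRate) * consensus[i] + movingRate * nodeAverage[i];
  }

  public static void main(String[] args) {
    float[] consensus = {0.0f, 1.0f};
    float[] nodeAverage = {1.0f, 0.0f};
    timeAverage(consensus, nodeAverage, 0.9f); // illustrative moving rate pa = 0.9
    System.out.println(java.util.Arrays.toString(consensus)); // approximately [0.9, 0.1]
  }
}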
@Override
protected void setupLocal() {
  _model_mem_size = 0;
  for (int i = 0; i < trees_so_far; ++i) {
    Key<CompressedTree>[] per_class = _treeKeys[i];
    for (int j = 0; j < per_class.length; ++j) {
      if (per_class[j] == null) continue;
      if (!per_class[j].home()) continue; // only look at homed tree keys
      _model_mem_size += DKV.get(per_class[j])._max;
    }
  }
}
// GLRM scoring is data imputation based on feature domains using reconstructed XY (see Udell
// (2015), Section 5.3)
private Frame reconstruct(
    Frame orig,
    Frame adaptedFr,
    Key destination_key,
    boolean save_imputed,
    boolean reverse_transform) {
  final int ncols = _output._names.length;
  assert ncols == adaptedFr.numCols();
  String prefix = "reconstr_";

  // Need [A,X,P] where A = adaptedFr, X = loading frame, P = imputed frame
  // Note: A is adapted to original training frame, P has columns shuffled so cats come before nums!
  Frame fullFrm = new Frame(adaptedFr);
  Frame loadingFrm = DKV.get(_output._representation_key).get();
  fullFrm.add(loadingFrm);
  String[][] adaptedDomme = adaptedFr.domains();
  for (int i = 0; i < ncols; i++) {
    Vec v = fullFrm.anyVec().makeZero();
    v.setDomain(adaptedDomme[i]);
    fullFrm.add(prefix + _output._names[i], v);
  }
  GLRMScore gs = new GLRMScore(ncols, _parms._k, save_imputed, reverse_transform).doAll(fullFrm);

  // Return the imputed training frame
  int x = ncols + _parms._k, y = fullFrm.numCols();
  Frame f =
      fullFrm.extractFrame(x, y); // this will call vec_impl() and we cannot call the delete() below just yet
  f = new Frame((null == destination_key ? Key.make() : destination_key), f.names(), f.vecs());
  DKV.put(f);
  gs._mb.makeModelMetrics(GLRMModel.this, orig, null, null); // save error metrics based on imputed data
  return f;
}
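// GLRMScore imputes each row from the low-rank product XY (Udell 2015, Section 5.3). A minimal
// numeric-only sketch (not from the H2O sources) of that per-row reconstruction; categorical
// expansion and reverse transforms are omitted, and the names/values are illustrative.
public class GlrmReconstructSketch {
  /** Returns the imputed row x * Y for a row embedding x (length k) and archetypes Y (k x ncols). */
  static double[] reconstructRow(double[] x, double[][] y) {
    int k = x.length, ncols = y[0].length;
    double[] imputed = new double[ncols];
    for (int j = 0; j < ncols; j++)
      for (int r = 0; r < k; r++)
        imputed[j] += x[r] * y[r][j]; // imputed a_ij = x_i . y_j
    return imputed;
  }

  public static void main(String[] args) {
    double[] x = {1.0, 2.0};                  // one row of the representation frame X
    double[][] y = {{0.5, 0.0}, {0.0, 0.5}};  // hypothetical 2 x 2 archetype matrix Y
    System.out.println(java.util.Arrays.toString(reconstructRow(x, y))); // [0.5, 1.0]
  }
}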
@Override
public byte[] load(final Value v) {
  final byte[] b = MemoryManager.malloc1(v._max);
  long skip = 0;
  Key k = v._key;
  final Path p;
  if (_iceRoot != null) {
    p = new Path(_iceRoot, getIceName(v));
  } else {
    // Convert an arraylet chunk into a long-offset from the base file.
    if (k._kb[0] == Key.ARRAYLET_CHUNK) {
      skip = ValueArray.getChunkOffset(k); // The offset
      k = ValueArray.getArrayKey(k); // From the base file key
      if (k.toString().endsWith(Extensions.HEX)) { // Hex file?
        int value_len = DKV.get(k).memOrLoad().length; // How long is the ValueArray header?
        skip += value_len;
      }
    }
    p = new Path(k.toString());
  }
  final long skip_ = skip;
  run(
      new Callable() {
        @Override
        public Object call() throws Exception {
          FileSystem fs = FileSystem.get(p.toUri(), CONF);
          FSDataInputStream s = null;
          try {
            s = fs.open(p);
            // NOTE: The following line degrades performance of HDFS load from S3 API:
            //   s.readFully(skip, b, 0, b.length);
            // Google API's simple seek has better performance.
            // Load of 300MB file via Google API ~ 14sec, via s.readFully ~ 5min (under the same condition)
            ByteStreams.skipFully(s, skip_);
            ByteStreams.readFully(s, b);
            assert v.isPersisted();
          } finally {
            Utils.close(s);
          }
          return null;
        }
      },
      true,
      v._max);
  return b;
}
/**
 * Helper to perform the generic part of cross-validation. Expected to be called from each
 * specific instance's crossValidate method.
 *
 * @param splits Frames containing train/test splits
 * @param offsets Array to store the offsets of starting row indices for each cross-validation run
 * @param i Which fold of cross-validation to perform
 */
protected final void genericCrossValidation(Frame[] splits, long[] offsets, int i) {
  int respidx = source.find(_responseName);
  assert (respidx != -1) : "response is not found in source!";
  job_key = Key.make(job_key.toString() + "_xval" + i); // make a new Job for CV
  assert (xval_models != null);
  destination_key = xval_models[i];
  source = splits[0];
  validation = splits[1];
  response = source.vecs()[respidx];
  n_folds = 0;
  state = Job.JobState.CREATED; // Hack to allow this job to run
  DKV.put(self(), this); // Needed to pass the Job.isRunning(cvdl.self()) check in FrameTask
  offsets[i + 1] = offsets[i] + validation.numRows();
  _cv = true; // Hack to allow init() to pass for ColumnsJob (allow cols/ignored_cols to co-exist)
  invoke();
}
/**
 * Performs a deep clone of the given model.
 *
 * <p>FIXME: fetch all data to the caller node
 */
protected M getModelDeepClone(M model) {
  M newModel = IcedUtils.clone(model, _dest);
  // Do not clone model metrics
  newModel._output._model_metrics = new Key[0];
  newModel._output._training_metrics = null;
  newModel._output._validation_metrics = null;
  // Clone trees
  Key[][] treeKeys = newModel._output._treeKeys;
  for (int i = 0; i < treeKeys.length; i++) {
    for (int j = 0; j < treeKeys[i].length; j++) {
      if (treeKeys[i][j] == null) continue;
      CompressedTree ct = DKV.get(treeKeys[i][j]).get();
      CompressedTree newCt = IcedUtils.clone(ct, CompressedTree.makeTreeKey(i, j), true);
      treeKeys[i][j] = newCt._key;
    }
  }
  return newModel;
}
// Accumulate per-cluster row counts and total distances over this chunk's rows.
@Override
public void map(Key key) {
  _rows = new long[_clusters.length];
  _dist = new double[_clusters.length];
  assert key.home();
  ValueArray va = DKV.get(_arykey).get();
  AutoBuffer bits = va.getChunk(key);
  int rows = va.rpc(ValueArray.getChunkIndex(key));
  double[] values = new double[_cols.length - 1];
  ClusterDist cd = new ClusterDist();
  for (int row = 0; row < rows; row++) {
    KMeans.datad(va, bits, row, _cols, _normalized, values);
    KMeans.closest(_clusters, values, cd);
    _rows[cd._cluster]++;
    _dist[cd._cluster] += cd._dist;
  }
  _arykey = null;
  _cols = null;
  _clusters = null;
}
public ModelMetricsGLRM scoreMetricsOnly(Frame frame) {
  final int ncols = _output._names.length;

  // Need [A,X] where A = adapted test frame, X = loading frame
  // Note: A is adapted to original training frame
  Frame adaptedFr = new Frame(frame);
  adaptTestForTrain(adaptedFr, true, false);
  assert ncols == adaptedFr.numCols();

  // Append loading frame X for calculating XY
  Frame fullFrm = new Frame(adaptedFr);
  Frame loadingFrm = DKV.get(_output._representation_key).get();
  fullFrm.add(loadingFrm);

  GLRMScore gs = new GLRMScore(ncols, _parms._k, false).doAll(fullFrm);
  ModelMetrics mm =
      gs._mb.makeModelMetrics(GLRMModel.this, adaptedFr, null, null); // save error metrics based on imputed data
  return (ModelMetricsGLRM) mm;
}
private void cancel(final String msg, JobState resultingState) {
  if (resultingState == JobState.CANCELLED) {
    Log.info("Job " + self() + "(" + description + ") was cancelled.");
  } else {
    Log.err("Job " + self() + "(" + description + ") failed.");
    Log.err(msg);
  }
  exception = msg;
  state = resultingState;
  // replace finished job by a job handle
  replaceByJobHandle();
  DKV.write_barrier();
  final Job job = this;
  H2O.submitTask(
      new H2OCountedCompleter() {
        @Override
        public void compute2() {
          job.onCancelled();
        }
      });
}
/**
 * Creates a new ValueArray with classes. The new ValueArray is unfortunately not aligned with the
 * source one, so results have to be sent to each chunk owner using Atomic.
 */
@Override
public void map(Key key) {
  assert key.home();
  if (Job.isRunning(_job.self())) {
    ValueArray va = DKV.get(_arykey).get();
    AutoBuffer bits = va.getChunk(key);
    long startRow = va.startRow(ValueArray.getChunkIndex(key));
    int rows = va.rpc(ValueArray.getChunkIndex(key));
    int rpc = (int) (ValueArray.CHUNK_SZ / ROW_SIZE);
    long chunk = ValueArray.chknum(startRow, va.numRows(), ROW_SIZE);
    long updatedChk = chunk;
    long updatedRow = startRow;
    double[] values = new double[_cols.length - 1];
    ClusterDist cd = new ClusterDist();
    int[] clusters = new int[rows];
    int count = 0;
    for (int row = 0; row < rows; row++) {
      KMeans.datad(va, bits, row, _cols, _normalized, values);
      KMeans.closest(_clusters, values, cd);
      chunk = ValueArray.chknum(startRow + row, va.numRows(), ROW_SIZE);
      if (chunk != updatedChk) {
        updateClusters(clusters, count, updatedChk, va.numRows(), rpc, updatedRow);
        updatedChk = chunk;
        updatedRow = startRow + row;
        count = 0;
      }
      clusters[count++] = cd._cluster;
    }
    if (count > 0) updateClusters(clusters, count, chunk, va.numRows(), rpc, updatedRow);
    _job.updateProgress(1);
  }
  _job = null;
  _arykey = null;
  _cols = null;
  _clusters = null;
}
@Override
protected void compute2() {
  CoxPHModel model = null;
  try {
    Scope.enter();
    _parms.read_lock_frames(CoxPH.this);
    init(true);
    applyScoringFrameSideEffects();

    // The model to be built
    model = new CoxPHModel(dest(), _parms, new CoxPHModel.CoxPHOutput(CoxPH.this));
    model.delete_and_lock(_key);

    applyTrainingFrameSideEffects();

    int nResponses = 1;
    boolean useAllFactorLevels = false;
    final DataInfo dinfo =
        new DataInfo(
            Key.make(),
            _modelBuilderTrain,
            null,
            nResponses,
            useAllFactorLevels,
            DataInfo.TransformType.DEMEAN,
            TransformType.NONE,
            true,
            false,
            false,
            false,
            false,
            false);
    initStats(model, dinfo);

    final int n_offsets =
        (model._parms.offset_columns == null) ? 0 : model._parms.offset_columns.length;
    final int n_coef = dinfo.fullN() - n_offsets;
    final double[] step = MemoryManager.malloc8d(n_coef);
    final double[] oldCoef = MemoryManager.malloc8d(n_coef);
    final double[] newCoef = MemoryManager.malloc8d(n_coef);
    Arrays.fill(step, Double.NaN);
    Arrays.fill(oldCoef, Double.NaN);
    for (int j = 0; j < n_coef; ++j) newCoef[j] = model._parms.init;
    double oldLoglik = -Double.MAX_VALUE;
    final int n_time = (int) (model._output.max_time - model._output.min_time + 1);
    final boolean has_start_column = (model._parms.start_column != null);
    final boolean has_weights_column = (model._parms.weights_column != null);

    for (int i = 0; i <= model._parms.iter_max; ++i) {
      model._output.iter = i;

      final CoxPHTask coxMR =
          new CoxPHTask(
                  self(),
                  dinfo,
                  newCoef,
                  model._output.min_time,
                  n_time,
                  n_offsets,
                  has_start_column,
                  has_weights_column)
              .doAll(dinfo._adaptedFrame);

      final double newLoglik = calcLoglik(model, coxMR);
      if (newLoglik > oldLoglik) {
        if (i == 0) calcCounts(model, coxMR);

        calcModelStats(model, newCoef, newLoglik);
        calcCumhaz_0(model, coxMR);

        if (newLoglik == 0)
          model._output.lre = -Math.log10(Math.abs(oldLoglik - newLoglik));
        else
          model._output.lre = -Math.log10(Math.abs((oldLoglik - newLoglik) / newLoglik));
        if (model._output.lre >= model._parms.lre_min) break;

        Arrays.fill(step, 0);
        for (int j = 0; j < n_coef; ++j)
          for (int k = 0; k < n_coef; ++k)
            step[j] -= model._output.var_coef[j][k] * model._output.gradient[k];
        for (int j = 0; j < n_coef; ++j)
          if (Double.isNaN(step[j]) || Double.isInfinite(step[j])) break;

        oldLoglik = newLoglik;
        System.arraycopy(newCoef, 0, oldCoef, 0, oldCoef.length);
      } else {
        for (int j = 0; j < n_coef; ++j) step[j] /= 2;
      }

      for (int j = 0; j < n_coef; ++j) newCoef[j] = oldCoef[j] - step[j];
    }

    model.update(_key);
  } catch (Throwable t) {
    Job thisJob = DKV.getGet(_key);
    if (thisJob._state == JobState.CANCELLED) {
      Log.info("Job cancelled by user.");
    } else {
      t.printStackTrace();
      failed(t);
      throw t;
    }
  } finally {
    updateModelOutput();
    _parms.read_unlock_frames(CoxPH.this);
    Scope.exit();
    done(); // Job done!
  }
  tryComplete();
}
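// The loop above is a Newton iteration with step-halving: when the new log-likelihood does not
// improve, the previous step is halved and retried; convergence is declared once the log relative
// error reaches lre_min. A minimal 1-D sketch (not from the H2O sources) of the same control flow,
// maximizing a simple concave function; all names and constants are illustrative.
public class NewtonStepHalvingSketch {
  public static void main(String[] args) {
    // Maximize f(x) = -(x - 3)^2; gradient g = -2(x - 3), Hessian H = -2, variance V = -1/H = 0.5.
    double newCoef = 0.0, oldCoef = Double.NaN, step = Double.NaN;
    double oldObj = -Double.MAX_VALUE;
    for (int i = 0; i <= 20; ++i) {
      double newObj = -(newCoef - 3) * (newCoef - 3);
      if (newObj > oldObj) { // improvement: accept the point and take a fresh Newton step
        double lre = (newObj == 0)
            ? -Math.log10(Math.abs(oldObj - newObj))
            : -Math.log10(Math.abs((oldObj - newObj) / newObj));
        if (lre >= 9) break; // analogous to model._output.lre >= model._parms.lre_min
        double gradient = -2 * (newCoef - 3);
        step = -0.5 * gradient; // step = -V * g, so newCoef = oldCoef - step is the Newton update
        oldObj = newObj;
        oldCoef = newCoef;
      } else { // no improvement: halve the previous step and retry from the last accepted point
        step /= 2;
      }
      newCoef = oldCoef - step;
    }
    System.out.println("Estimated maximizer: " + newCoef); // converges to 3.0
  }
}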
// Main worker thread
@Override
protected void compute2() {
  KMeansModel model = null;
  try {
    init(true);
    // Do lock even before checking the errors, since this block is finalized by unlock
    // (not the best solution, but the code is more readable)
    _parms.read_lock_frames(KMeans.this); // Fetch & read-lock input frames
    // Something goes wrong
    if (error_count() > 0)
      throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(KMeans.this);
    // The model to be built
    model = new KMeansModel(dest(), _parms, new KMeansModel.KMeansOutput(KMeans.this));
    model.delete_and_lock(_key);

    final Vec vecs[] = _train.vecs();
    // mults & means for standardization
    final double[] means = _train.means(); // means are used to impute NAs
    final double[] mults = _parms._standardize ? _train.mults() : null;
    final int[] impute_cat = new int[vecs.length];
    for (int i = 0; i < vecs.length; i++)
      impute_cat[i] = vecs[i].isNumeric() ? -1 : DataInfo.imputeCat(vecs[i]);
    model._output._normSub = means;
    model._output._normMul = mults;

    // Initialize cluster centers and standardize if requested
    double[][] centers = initial_centers(model, vecs, means, mults, impute_cat);
    if (centers == null) return; // Stopped/cancelled during center-finding
    double[][] oldCenters = null;

    // ---
    // Run the main KMeans Clustering loop
    // Stop after enough iterations or average_change < TOLERANCE
    model._output._iterations = 0; // Loop ends only when iterations > max_iterations with strict inequality
    while (!isDone(model, centers, oldCenters)) {
      Lloyds task =
          new Lloyds(centers, means, mults, impute_cat, _isCats, _parms._k, hasWeightCol())
              .doAll(vecs);
      // Pick the max categorical level for cluster center
      max_cats(task._cMeans, task._cats, _isCats);

      // Handle the case where some centers go dry. Rescue only 1 cluster
      // per iteration ('cause we only tracked the 1 worst row)
      if (cleanupBadClusters(task, vecs, centers, means, mults, impute_cat)) continue;

      // Compute model stats; update standardized cluster centers
      oldCenters = centers;
      centers = computeStatsFillModel(task, model, vecs, means, mults, impute_cat);

      model.update(_key); // Update model in K/V store
      update(1); // One unit of work
      if (model._parms._score_each_iteration) Log.info(model._output._model_summary);
    }

    Log.info(model._output._model_summary);
    // Log.info(model._output._scoring_history);
    // Log.info(((ModelMetricsClustering)model._output._training_metrics).createCentroidStatsTable().toString());

    // At the end: validation scoring (no need to gather scoring history)
    if (_valid != null) {
      model.score(_parms.valid()).delete(); // this appends a ModelMetrics on the validation set
      model._output._validation_metrics = ModelMetrics.getFromDKV(model, _parms.valid());
      model.update(_key); // Update model in K/V store
    }
    done(); // Job done!
  } catch (Throwable t) {
    Job thisJob = DKV.getGet(_key);
    if (thisJob._state == JobState.CANCELLED) {
      Log.info("Job cancelled by user.");
    } else {
      t.printStackTrace();
      failed(t);
      throw t;
    }
  } finally {
    updateModelOutput();
    if (model != null) model.unlock(_key);
    _parms.read_unlock_frames(KMeans.this);
  }
  tryComplete();
}
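// Each Lloyds pass above assigns every row to its closest (standardized) center and accumulates
// per-cluster sums, from which the new centers are computed. A minimal single-node sketch (not
// from the H2O sources) of one such iteration on a small in-memory dataset; standardization,
// categorical handling, and the empty-cluster rescue are omitted.
public class LloydsIterationSketch {
  /** One Lloyd's iteration: new centers are the means of the rows assigned to each center. */
  static double[][] lloydsStep(double[][] rows, double[][] centers) {
    int k = centers.length, dim = centers[0].length;
    double[][] sums = new double[k][dim];
    int[] counts = new int[k];
    for (double[] row : rows) {
      int best = 0;
      double bestDist = Double.MAX_VALUE;
      for (int c = 0; c < k; c++) { // find the closest center by squared Euclidean distance
        double d = 0;
        for (int j = 0; j < dim; j++) d += (row[j] - centers[c][j]) * (row[j] - centers[c][j]);
        if (d < bestDist) { bestDist = d; best = c; }
      }
      counts[best]++;
      for (int j = 0; j < dim; j++) sums[best][j] += row[j];
    }
    double[][] newCenters = new double[k][dim];
    for (int c = 0; c < k; c++)
      for (int j = 0; j < dim; j++)
        newCenters[c][j] = counts[c] == 0 ? centers[c][j] : sums[c][j] / counts[c];
    return newCenters;
  }

  public static void main(String[] args) {
    double[][] rows = {{0, 0}, {0, 1}, {10, 10}, {10, 11}};
    double[][] centers = {{0, 0}, {10, 10}};
    centers = lloydsStep(rows, centers);
    System.out.println(java.util.Arrays.deepToString(centers)); // [[0.0, 0.5], [10.0, 10.5]]
  }
}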
// Lazily fetch the validation response Vec; fall back to the training response if none is set.
@Override
public Vec vresponse() {
  if (_vresponse_key == null) return response();
  return _vresponse != null ? _vresponse : (_vresponse = DKV.getGet(_vresponse_key));
}
// Lazily fetch the training response Vec from the DKV.
@Override
public Vec response() {
  return _response == null ? (_response = DKV.getGet(_response_key)) : _response;
}
@Override
protected void compute2() {
  _model = null; // Resulting model!
  try {
    Scope.enter(); // Cleanup temp keys
    init(true); // Do any expensive tests & conversions now
    // Do lock even before checking the errors, since this block is finalized by unlock
    // (not the best solution, but the code is more readable)
    _parms.read_lock_frames(SharedTree.this); // Fetch & read-lock input frames
    if (error_count() > 0)
      throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(SharedTree.this);

    // New Model? Or continuing from a checkpoint?
    if (_parms._checkpoint && DKV.get(_parms._model_id) != null) {
      _model = DKV.get(_dest).get();
      _model.write_lock(_key); // do not delete previous model; we are extending it
    } else { // New Model
      // Compute the zero-tree error - guessing only the class distribution.
      // MSE is stddev squared when guessing for regression.
      // For classification, guess the largest class.
      _model =
          makeModel(
              _dest,
              _parms,
              initial_MSE(_response, _response),
              _valid == null ? Double.NaN : initial_MSE(_response, _vresponse)); // Make a fresh model
      _model.delete_and_lock(_key); // and clear & write-lock it (smashing any prior)
      _model._output._init_f = _initialPrediction;
    }

    // Compute the response domain; makes for nicer printouts
    String[] domain = _response.domain();
    assert (_nclass > 1 && domain != null) || (_nclass == 1 && domain == null);
    if (_nclass == 1) domain = new String[] {"r"}; // For regression, give a name to class 0

    // Compute class distribution, used for initial guesses and to
    // upsample minority classes (if asked for).
    if (_nclass > 1) { // Classification?
      // Handle imbalanced classes by stratified over/under-sampling.
      // initWorkFrame sets the modeled class distribution, and
      // model.score() corrects the probabilities back using the
      // distribution ratios
      if (_model._output.isClassifier() && _parms._balance_classes) {
        float[] trainSamplingFactors =
            new float[_train.lastVec().domain().length]; // leave initialized to 0 -> will be filled up below
        if (_parms._class_sampling_factors != null) {
          if (_parms._class_sampling_factors.length != _train.lastVec().domain().length)
            throw new IllegalArgumentException(
                "class_sampling_factors must have "
                    + _train.lastVec().domain().length
                    + " elements");
          trainSamplingFactors = _parms._class_sampling_factors.clone(); // clone: don't modify the original
        }

        Frame stratified =
            water.util.MRUtils.sampleFrameStratified(
                _train,
                _train.lastVec(),
                _train.vec(_model._output.weightsName()),
                trainSamplingFactors,
                (long) (_parms._max_after_balance_size * _train.numRows()),
                _parms._seed,
                true,
                false);
        if (stratified != _train) {
          _train = stratified;
          _response = stratified.vec(_parms._response_column);
          _weights = stratified.vec(_parms._weights_column);
          // Recompute distribution since the input frame was modified
          MRUtils.ClassDist cdmt2 =
              _weights != null
                  ? new MRUtils.ClassDist(_nclass).doAll(_response, _weights)
                  : new MRUtils.ClassDist(_nclass).doAll(_response);
          _model._output._distribution = cdmt2.dist();
          _model._output._modelClassDist = cdmt2.rel_dist();
        }
      }
      Log.info("Prior class distribution: " + Arrays.toString(_model._output._priorClassDist));
      Log.info("Model class distribution: " + Arrays.toString(_model._output._modelClassDist));
    }

    // Also add to the basic working Frame these sets:
    //   nclass Vecs of current forest results (sum across all trees)
    //   nclass Vecs of working/temp data
    //   nclass Vecs of NIDs, allowing 1 tree per class

    // Current forest values: results of summing the prior M trees
    for (int i = 0; i < _nclass; i++) _train.add("Tree_" + domain[i], _response.makeZero());

    // Initial work columns. Set-before-use in the algos.
    for (int i = 0; i < _nclass; i++) _train.add("Work_" + domain[i], _response.makeZero());

    // One Tree per class; each tree needs its own NIDs column. For empty classes use a -1
    // NID signifying an empty regression tree.
    for (int i = 0; i < _nclass; i++)
      _train.add(
          "NIDs_" + domain[i],
          _response.makeCon(
              _model._output._distribution == null
                  ? 0
                  : (_model._output._distribution[i] == 0 ? -1 : 0)));

    // Tag out rows missing the response column
    new ExcludeNAResponse().doAll(_train);

    // Variable importance: squared-error-improvement-per-variable-per-split
    _improvPerVar = new float[_ncols];

    // Sub-class tree-model-builder specific build code
    buildModel();
    done(); // Job done!
  } catch (Throwable t) {
    Job thisJob = DKV.getGet(_key);
    if (thisJob._state == JobState.CANCELLED) {
      Log.info("Job cancelled by user.");
    } else {
      t.printStackTrace();
      failed(t);
      throw t;
    }
  } finally {
    if (_model != null) _model.unlock(_key);
    _parms.read_unlock_frames(SharedTree.this);
    if (_model == null) Scope.exit();
    else {
      Scope.exit(
          _model._key,
          ModelMetrics.buildKey(_model, _parms.train()),
          ModelMetrics.buildKey(_model, _parms.valid()));
    }
  }
  tryComplete();
}