@Override
protected Response serve() {
  try {
    // pull everything local
    Log.info("ExportFiles processing (" + path + ")");
    if (DKV.get(src_key) == null)
      throw new IllegalArgumentException(src_key.toString() + " not found.");
    Object value = DKV.get(src_key).get();
    // create a stream to read the entire VA or Frame
    if (!(value instanceof ValueArray) && !(value instanceof Frame))
      throw new UnsupportedOperationException("Can only export Frames or ValueArrays.");
    InputStream csv =
        value instanceof ValueArray
            ? new ValueArray.CsvVAStream((ValueArray) value, null)
            : ((Frame) value).toCSV(true);
    String p2 = path.toLowerCase();
    if (p2.startsWith("hdfs://")) serveHdfs(csv);
    else if (p2.startsWith("s3n://")) serveHdfs(csv);
    else serveLocalDisk(csv);
    return RequestBuilders.Response.done(this);
  } catch (Throwable t) {
    return RequestBuilders.Response.error(t);
  }
}
static Frame exec_str(String str, String id) {
  Val val = Exec.exec(str);
  switch (val.type()) {
    case Val.FRM:
      Frame fr = val.getFrame();
      Key k = Key.make(id);
      // Smart delete any prior top-level result
      Iced i = DKV.getGet(k);
      if (i instanceof Lockable) ((Lockable) i).delete();
      else if (i instanceof Keyed) ((Keyed) i).remove();
      else if (i != null)
        throw new IllegalArgumentException("Attempting to overwrite an unexpected key");
      DKV.put(fr = new Frame(k, fr._names, fr.vecs()));
      System.out.println(fr);
      checkSaneFrame();
      return fr;
    case Val.NUM:
      System.out.println("num= " + val.getNum());
      assert id == null;
      checkSaneFrame();
      return null;
    case Val.STR:
      System.out.println("str= " + val.getStr());
      assert id == null;
      checkSaneFrame();
      return null;
    default:
      throw water.H2O.fail();
  }
}
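// A minimal sketch of the "smart delete any prior result" idiom from exec_str() above, pulled
// out as a standalone helper for illustration. It relies only on DKV.getGet, Lockable.delete
// and Keyed.remove as used above; the helper name deletePrior is hypothetical.
static void deletePrior(Key k) {
  Iced i = DKV.getGet(k);                               // null if nothing is stored under k
  if (i instanceof Lockable) ((Lockable) i).delete();   // locked objects clean themselves up
  else if (i instanceof Keyed) ((Keyed) i).remove();    // plain keyed objects are simply removed
  else if (i != null)
    throw new IllegalArgumentException("Attempting to overwrite an unexpected key");
}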
/**
 * Score a frame with the given model and return just the metrics.
 *
 * <p>NOTE: ModelMetrics are now always being created by model.score...
 */
@SuppressWarnings("unused") // called through reflection by RequestServer
public ModelMetricsListSchemaV3 score(int version, ModelMetricsListSchemaV3 s) {
  // parameter checking:
  if (null == s.model) throw new H2OIllegalArgumentException("model", "predict", s.model);
  if (null == DKV.get(s.model.name))
    throw new H2OKeyNotFoundArgumentException("model", "predict", s.model.name);
  if (null == s.frame) throw new H2OIllegalArgumentException("frame", "predict", s.frame);
  if (null == DKV.get(s.frame.name))
    throw new H2OKeyNotFoundArgumentException("frame", "predict", s.frame.name);

  ModelMetricsList parms = s.createAndFillImpl();
  parms
      ._model
      .score(parms._frame, parms._predictions_name)
      .remove(); // throw away predictions, keep metrics as a side-effect
  ModelMetricsListSchemaV3 mm = this.fetch(version, s);

  // TODO: for now only binary predictors write an MM object.
  // For the others cons one up here to return the predictions frame.
  if (null == mm) mm = new ModelMetricsListSchemaV3();
  if (null == mm.model_metrics || 0 == mm.model_metrics.length) {
    Log.warn(
        "Score() did not return a ModelMetrics for model: " + s.model + " on frame: " + s.frame);
  }
  return mm;
}
public GLMModelV3 make_model(int version, MakeGLMModelV3 args) {
  GLMModel model = DKV.getGet(args.model.key());
  if (model == null) throw new IllegalArgumentException("missing source model " + args.model);
  String[] names = model._output.coefficientNames();
  Map<String, Double> coefs = model.coefficients();
  for (int i = 0; i < args.names.length; ++i) coefs.put(args.names[i], args.beta[i]);
  double[] beta = model.beta().clone();
  for (int i = 0; i < beta.length; ++i) beta[i] = coefs.get(names[i]);
  GLMModel m =
      new GLMModel(
          args.dest != null ? args.dest.key() : Key.make(),
          model._parms,
          null,
          new double[] {.5},
          Double.NaN,
          Double.NaN,
          -1);
  DataInfo dinfo = model.dinfo();
  dinfo.setPredictorTransform(TransformType.NONE);
  // GLMOutput(DataInfo dinfo, String[] column_names, String[][] domains,
  //           String[] coefficient_names, boolean binomial)
  m._output =
      new GLMOutput(
          model.dinfo(),
          model._output._names,
          model._output._domains,
          model._output.coefficientNames(),
          model._output._binomial,
          beta);
  DKV.put(m._key, m);
  GLMModelV3 res = new GLMModelV3();
  res.fillFromImpl(m);
  return res;
}
@Override
public float progress() {
  if (DKV.get(dest()) == null) return 0;
  GLMModel m = DKV.get(dest()).get();
  float progress = (float) m.iteration() / (float) max_iter; // TODO, do something smarter here
  return progress;
}
/** Score a frame with the given model and return the metrics AND the prediction frame. */
@SuppressWarnings("unused") // called through reflection by RequestServer
public JobV3 predict2(int version, final ModelMetricsListSchemaV3 s) {
  // parameter checking:
  if (null == s.model) throw new H2OIllegalArgumentException("model", "predict", s.model);
  if (null == DKV.get(s.model.name))
    throw new H2OKeyNotFoundArgumentException("model", "predict", s.model.name);
  if (null == s.frame) throw new H2OIllegalArgumentException("frame", "predict", s.frame);
  if (null == DKV.get(s.frame.name))
    throw new H2OKeyNotFoundArgumentException("frame", "predict", s.frame.name);

  final ModelMetricsList parms = s.createAndFillImpl();

  // predict2 does not return ModelMetrics, so it cannot handle DeepLearning reconstruction_error
  // (anomaly) or GLRM reconstruct and archetypes.
  // predict2 can handle DeepLearning deepfeatures and predict.
  if (s.deep_features_hidden_layer > 0) {
    if (null == parms._predictions_name)
      parms._predictions_name =
          "deep_features"
              + Key.make().toString().substring(0, 5)
              + "_"
              + parms._model._key.toString()
              + "_on_"
              + parms._frame._key.toString();
  } else if (null == parms._predictions_name)
    parms._predictions_name =
        "predictions"
            + Key.make().toString().substring(0, 5)
            + "_"
            + parms._model._key.toString()
            + "_on_"
            + parms._frame._key.toString();

  final Job<Frame> j =
      new Job(Key.make(parms._predictions_name), Frame.class.getName(), "prediction");
  H2O.H2OCountedCompleter work =
      new H2O.H2OCountedCompleter() {
        @Override
        public void compute2() {
          if (s.deep_features_hidden_layer < 0) {
            parms._model.score(parms._frame, parms._predictions_name, j);
          } else {
            Frame predictions =
                ((Model.DeepFeatures) parms._model)
                    .scoreDeepFeatures(parms._frame, s.deep_features_hidden_layer, j);
            predictions =
                new Frame(
                    Key.make(parms._predictions_name), predictions.names(), predictions.vecs());
            DKV.put(predictions._key, predictions);
          }
          tryComplete();
        }
      };
  j.start(work, parms._frame.anyVec().nChunks());
  return new JobV3().fillFromImpl(j);
}
public Vec replace(int col, Vec nv) {
  assert col < _names.length;
  Vec rv = vecs()[col];
  assert rv.group().equals(nv.group());
  _vecs[col] = nv;
  _keys[col] = nv._key;
  if (DKV.get(nv._key) == null) // If not already in KV, put it there
    DKV.put(nv._key, nv);
  return rv;
}
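// A minimal usage sketch of replace(int, Vec), following the same column-conversion pattern as
// the categorical tests later in this file: swap a column for its categorical version, drop the
// displaced Vec, and republish the Frame so the change is visible through the DKV. The helper
// name convertColumnToCategorical is hypothetical.
static void convertColumnToCategorical(Frame fr, int col) {
  Vec old = fr.replace(col, fr.vec(col).toCategoricalVec()); // returns the displaced Vec
  old.remove();          // free the old Vec's storage
  DKV.put(fr._key, fr);  // republish the updated Frame header
}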
public static Response redirect(JsonObject fromPageResponse, Key rfModelKey) {
  RFModel rfModel = DKV.get(rfModelKey).get();
  ValueArray data = DKV.get(rfModel._dataKey).get();
  return redirect(
      fromPageResponse,
      null,
      rfModelKey,
      rfModel._dataKey,
      rfModel._totalTrees,
      data.numCols() - 1,
      null,
      true,
      false);
}
/**
 * Creates the value header based on the calculated columns.
 *
 * <p>Also stores the header to its appropriate key. This will be the VA header of the parsed
 * dataset.
 */
private void createValueArrayHeader() {
  assert (_phase == Pass.TWO);
  Column[] cols = new Column[_ncolumns];
  int off = 0;
  for (int i = 0; i < cols.length; ++i) {
    cols[i] = new Column();
    cols[i]._n = _numRows - _invalidValues[i];
    cols[i]._base = _bases[i];
    assert (char) pow10i(-_scale[i]) == pow10i(-_scale[i])
        : "scale out of bounds!, col = " + i + ", scale = " + _scale[i];
    cols[i]._scale = (char) pow10i(-_scale[i]);
    cols[i]._off = (char) off;
    cols[i]._size = (byte) COL_SIZES[_colTypes[i]];
    cols[i]._domain = _colDomains[i];
    cols[i]._max = _max[i];
    cols[i]._min = _min[i];
    cols[i]._mean = _mean[i];
    cols[i]._sigma = _sigma[i];
    cols[i]._name = _colNames[i];
    off += Math.abs(cols[i]._size);
  }
  // let any pending progress reports finish
  DKV.write_barrier();
  // finally make the value array header
  ValueArray ary = new ValueArray(_resultKey, _numRows, off, cols);
  UKV.put(_resultKey, ary.value());
}
// ------------------------------------------------------------------------
// Zipped file; no parallel decompression; decompress into local chunks,
// parse local chunks; distribute chunks later.
ParseWriter streamParseZip(final InputStream is, final StreamParseWriter dout, InputStream bvs)
    throws IOException {
  // All output into a fresh pile of NewChunks, one per column
  if (!_setup._parse_type._parallelParseSupported) throw H2O.unimpl();
  StreamData din = new StreamData(is);
  int cidx = 0;
  StreamParseWriter nextChunk = dout;
  int zidx = bvs.read(null, 0, 0); // Back-channel read of chunk index
  assert zidx == 1;
  while (is.available() > 0) {
    int xidx = bvs.read(null, 0, 0); // Back-channel read of chunk index
    if (xidx > zidx) { // Advanced chunk index of underlying ByteVec stream?
      zidx = xidx; // Record advancing of chunk
      nextChunk.close(); // Match output chunks to input zipfile chunks
      if (dout != nextChunk) {
        dout.reduce(nextChunk);
        if (_jobKey != null && ((Job) DKV.getGet(_jobKey)).isCancelledOrCrashed()) break;
      }
      nextChunk = nextChunk.nextChunk();
    }
    parseChunk(cidx++, din, nextChunk);
  }
  parseChunk(cidx, din, nextChunk); // Parse the remaining partial 32K buffer
  nextChunk.close();
  if (dout != nextChunk) dout.reduce(nextChunk);
  return dout;
}
@Override
protected Frame predictScoreImpl(Frame orig, Frame adaptedFr, String destination_key) {
  Frame adaptFrm = new Frame(adaptedFr);
  for (int i = 0; i < _parms._k; i++)
    adaptFrm.add("PC" + String.valueOf(i + 1), adaptFrm.anyVec().makeZero());

  new MRTask() {
    @Override
    public void map(Chunk chks[]) {
      double tmp[] = new double[_output._names.length];
      double preds[] = new double[_parms._k];
      for (int row = 0; row < chks[0]._len; row++) {
        double p[] = score0(chks, row, tmp, preds);
        for (int c = 0; c < preds.length; c++) chks[_output._names.length + c].set(row, p[c]);
      }
    }
  }.doAll(adaptFrm);

  // Return the projection into principal component space
  int x = _output._names.length, y = adaptFrm.numCols();
  Frame f =
      adaptFrm.extractFrame(
          x, y); // this will call vec_impl() and we cannot call the delete() below just yet
  f =
      new Frame(
          (null == destination_key ? Key.make() : Key.make(destination_key)), f.names(), f.vecs());
  DKV.put(f);
  makeMetricBuilder(null).makeModelMetrics(this, orig);
  return f;
}
// Delete the metrics that match model and/or frame
ModelMetricsList delete() {
  ModelMetricsList matches = fetch();
  for (ModelMetrics mm : matches._model_metrics) DKV.remove(mm._key);
  return matches;
}
// Fetch all metrics that match model and/or frame
ModelMetricsList fetch() {
  final Key[] modelMetricsKeys =
      KeySnapshot.globalSnapshot()
          .filter(
              new KeySnapshot.KVFilter() {
                @Override
                public boolean filter(KeySnapshot.KeyInfo k) {
                  try {
                    if (!Value.isSubclassOf(k._type, ModelMetrics.class))
                      return false; // Fast-path cutout
                    ModelMetrics mm = DKV.getGet(k._key);
                    // If we're filtering by model filter by Model. :-)
                    if (_model != null && !mm.isForModel((Model) DKV.getGet(_model._key)))
                      return false;
                    // If we're filtering by frame filter by Frame. :-)
                    if (_frame != null && !mm.isForFrame((Frame) DKV.getGet(_frame._key)))
                      return false;
                  } catch (NullPointerException | ClassCastException ex) {
                    return false; // Handle all kinds of broken racey key updates
                  }
                  return true;
                }
              })
          .keys();

  _model_metrics = new ModelMetrics[modelMetricsKeys.length];
  for (int i = 0; i < modelMetricsKeys.length; i++)
    _model_metrics[i] = DKV.getGet(modelMetricsKeys[i]);
  return this; // Flow coding
}
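// A minimal sketch of the same KeySnapshot filtering idiom used in fetch() above, reduced to
// "list every Frame currently in the DKV". Only calls already visible in fetch() are used
// (globalSnapshot, KVFilter, Value.isSubclassOf, DKV.getGet); the helper name listFrames is
// hypothetical.
static Frame[] listFrames() {
  Key[] keys =
      KeySnapshot.globalSnapshot()
          .filter(
              new KeySnapshot.KVFilter() {
                @Override
                public boolean filter(KeySnapshot.KeyInfo k) {
                  return Value.isSubclassOf(k._type, Frame.class); // cheap type check, no fetch
                }
              })
          .keys();
  Frame[] frames = new Frame[keys.length];
  for (int i = 0; i < keys.length; i++) frames[i] = DKV.getGet(keys[i]);
  return frames;
}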
// Convert a chunk# into a chunk - does lazy-chunk creation. As chunks are
// asked-for the first time, we make the Key and an empty backing DVec.
// Touching the DVec will force the file load.
@Override
public Value chunkIdx(int cidx) {
  final long nchk = nChunks();
  assert 0 <= cidx && cidx < nchk;
  Key dkey = chunkKey(cidx);
  Value val1 = DKV.get(dkey); // Check for an existing one... will fetch data as needed
  if (val1 != null) return val1; // Found an existing one?
  // Lazily create a DVec for this chunk
  int len = (int) (cidx < nchk - 1 ? ValueArray.CHUNK_SZ : (_len - chunk2StartElem(cidx)));
  // DVec is just the raw file data with a null-compression scheme
  Value val2 = new Value(dkey, len, null, TypeMap.C1CHUNK, Value.NFS);
  val2.setdsk(); // It is already on disk.
  // Atomically insert: fails on a race, but then return the old version
  Value val3 = DKV.DputIfMatch(dkey, val2, null, null);
  return val3 == null ? val2 : val3;
}
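// The last three lines of chunkIdx() are a reusable "insert if absent, keep the winner on a
// race" idiom for the DKV. A minimal sketch of it as a standalone helper, relying only on
// DKV.DputIfMatch with the same (key, newValue, expectedOld, futures) arguments used above;
// the helper name putIfAbsent is hypothetical.
static Value putIfAbsent(Key key, Value fresh) {
  // As in chunkIdx(): a null return means our value was installed; a non-null return is the
  // value that won the race and should be used instead.
  Value raced = DKV.DputIfMatch(key, fresh, null, null);
  return raced == null ? fresh : raced;
}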
@Override
public void modifyParmsForCrossValidationMainModel(ModelBuilder[] cvModelBuilders) {
  _parms._overwrite_with_best_model = false;

  if (_parms._stopping_rounds == 0 && _parms._max_runtime_secs == 0)
    return; // No exciting changes to stopping conditions

  // Extract stopping conditions from each CV model, and compute the best stopping answer
  _parms._stopping_rounds = 0;
  _parms._max_runtime_secs = 0;
  double sum = 0;
  for (ModelBuilder cvmb : cvModelBuilders)
    sum += ((DeepLearningModel) DKV.getGet(cvmb.dest())).last_scored().epoch_counter;
  _parms._epochs = sum / cvModelBuilders.length;
  if (!_parms._quiet_mode) {
    warn(
        "_epochs",
        "Setting optimal _epochs to "
            + _parms._epochs
            + " for cross-validation main model based on early stopping of cross-validation models.");
    warn(
        "_stopping_rounds",
        "Disabling convergence-based early stopping for cross-validation main model.");
    warn(
        "_max_runtime_secs",
        "Disabling maximum allowed runtime for cross-validation main model.");
  }
}
@Override
public Response serve() {
  Frame fr = DKV.get(data_key.value()).get();
  if (fr == null) return RequestServer._http404.serve();
  // Build a frame with the selected Vecs
  Frame fr2 = new Frame(new String[0], new Vec[0]);
  int[] idxs = vecs.value();
  for (int idx : idxs) // The selected frame columns
    fr2.add(fr._names[idx], fr._vecs[idx]);
  // Add the class-vec last
  Vec cvec = class_vec.value();
  fr2.add(fr._names[class_vec._colIdx.get()], cvec);
  domain = cvec.domain(); // Class/enum/factor names
  mtrys = features.value() == null ? (int) (Math.sqrt(idxs.length) + 0.5) : features.value();
  DRF drf =
      DRF.start(
          DRF.makeKey(),
          fr2,
          depth.value(),
          ntrees.value(),
          mtrys,
          sample_rate.value(),
          seed.value());
  drf.get(); // Block for result
  cm = drf.cm(); // Get CM result
  return new Response(Response.Status.done, this, -1, -1, null);
}
@Test
public void testDomains() {
  Frame frame = parse_test_file("smalldata/junit/weather.csv");
  for (String s : new String[] {"MaxWindSpeed", "RelHumid9am", "Cloud9am"}) {
    Vec v = frame.vec(s);
    Vec newV = v.toCategoricalVec();
    frame.remove(s);
    frame.add(s, newV);
    v.remove();
  }
  DKV.put(frame);
  AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
  parms._train = frame._key;
  parms._radius_scale = 10;
  AggregatorModel agg = new Aggregator(parms).trainModel().get();
  Frame output = agg._output._output_frame.get();
  Assert.assertTrue(output.numRows() < 0.5 * frame.numRows());
  boolean same = true;
  for (int i = 0; i < frame.numCols(); ++i) {
    if (frame.vec(i).isCategorical()) {
      same = (frame.domains()[i].length == output.domains()[i].length);
      if (!same) break;
    }
  }
  frame.remove();
  output.remove();
  agg.remove();
  Assert.assertFalse(same);
}
public Frame(String[] names, Vec[] vecs) {
  // assert names == null || names.length == vecs.length
  //     : "Number of columns does not match the number of column names.";
  _names = names;
  _vecs = vecs;
  _keys = new Key[vecs.length];
  for (int i = 0; i < vecs.length; i++) {
    Key k = _keys[i] = vecs[i]._key;
    if (DKV.get(k) == null) // If not already in KV, put it there
      DKV.put(k, vecs[i]);
  }
  Vec v0 = anyVec();
  if (v0 == null) return;
  VectorGroup grp = v0.group();
  for (int i = 0; i < vecs.length; i++) assert grp.equals(vecs[i].group());
}
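// A minimal sketch of building a small Frame from scratch and publishing it to the DKV,
// combining the constructor above with the Vec.makeCon / keyed-Frame / DKV.put calls used
// elsewhere in this file. The key name "example.hex" and the column values are illustrative
// only, and the helper name makeExampleFrame is hypothetical.
static Frame makeExampleFrame() {
  Vec x = Vec.makeCon(null, new double[] {1, 2, 3}); // small constant-backed Vecs
  Vec y = Vec.makeCon(null, new double[] {4, 5, 6});
  Frame fr = new Frame(Key.make("example.hex"), new String[] {"x", "y"}, new Vec[] {x, y});
  DKV.put(fr); // publish the Frame header so other nodes (and REST calls) can find it
  return fr;
}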
protected void testExecFail(String expr, int errorPos) {
  DKV.write_barrier();
  int keys = H2O.store_size();
  try {
    int i = UNIQUE.getAndIncrement();
    System.err.println("result" + (new Integer(i).toString()) + ": " + expr);
    Key key = Exec.exec(expr, "result" + (new Integer(i).toString()));
    UKV.remove(key);
    assertTrue("An exception should have been thrown.", false);
  } catch (ParserException e) {
    assertTrue(false);
  } catch (EvaluationException e) {
    if (errorPos != -1) assertEquals(errorPos, e._pos);
  }
  DKV.write_barrier();
  assertEquals("Keys were not properly deleted for expression " + expr, keys, H2O.store_size());
}
public JobsV3 cancel(int version, JobsV3 c) {
  Job j = DKV.getGet(c.job_id.key());
  if (j == null) {
    throw new IllegalArgumentException("No job with key " + c.job_id.key());
  }
  j.stop(); // Request Job stop
  return c;
}
public static ValueArray loadAndParseKey(Key okey, String path) {
  FileIntegrityChecker c = FileIntegrityChecker.check(new File(path), false);
  Key k = c.syncDirectory(null, null, null, null);
  ParseDataset.forkParseDataset(okey, new Key[] {k}, null).get();
  UKV.remove(k);
  ValueArray res = DKV.get(okey).get();
  return res;
}
@Override
public byte[] atomic(byte[] bits1) {
  byte[] mem = DKV.get(_key).get();
  int len = Math.max(_dst_off + mem.length, bits1 == null ? 0 : bits1.length);
  byte[] bits2 = MemoryManager.malloc1(len);
  if (bits1 != null) System.arraycopy(bits1, 0, bits2, 0, bits1.length);
  System.arraycopy(mem, 0, bits2, _dst_off, mem.length);
  return bits2;
}
private FrameTask(Key jobKey, Key dinfoKey, int[] activeCols, long seed, int iteration) {
  super(null);
  assert dinfoKey == null || DKV.get(dinfoKey) != null;
  _jobKey = jobKey;
  _dinfoKey = dinfoKey;
  _activeCols = activeCols;
  _seed = seed;
  _iteration = iteration;
}
/** Actually remove/delete all Vecs from memory, not just from the Frame. */
public void remove(Futures fs) {
  if (_vecs.length > 0) {
    VectorGroup vg = _vecs[0].group();
    for (Vec v : _vecs) UKV.remove(v._key, fs);
    DKV.remove(vg._key);
  }
  _names = new String[0];
  _vecs = new Vec[0];
}
@Test
public void testCategoricalProstate() throws InterruptedException, ExecutionException {
  GLRM job = null;
  GLRMModel model = null;
  Frame train = null;
  final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS
  try {
    Scope.enter();
    train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
    for (int i = 0; i < cats.length; i++)
      Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
    train.remove("ID").remove();
    DKV.put(train._key, train);

    GLRMParameters parms = new GLRMParameters();
    parms._train = train._key;
    parms._k = 8;
    parms._gamma_x = parms._gamma_y = 0.1;
    parms._regularization_x = GLRMModel.GLRMParameters.Regularizer.Quadratic;
    parms._regularization_y = GLRMModel.GLRMParameters.Regularizer.Quadratic;
    parms._init = GLRM.Initialization.PlusPlus;
    parms._transform = DataInfo.TransformType.STANDARDIZE;
    parms._recover_svd = false;
    parms._max_iterations = 200;

    try {
      job = new GLRM(parms);
      model = job.trainModel().get();
      Log.info(
          "Iteration "
              + model._output._iterations
              + ": Objective value = "
              + model._output._objective);
      model.score(train).delete();
      ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
      Log.info(
          "Numeric Sum of Squared Error = "
              + mm._numerr
              + "\tCategorical Misclassification Error = "
              + mm._caterr);
    } catch (Throwable t) {
      t.printStackTrace();
      throw new RuntimeException(t);
    } finally {
      job.remove();
    }
  } catch (Throwable t) {
    t.printStackTrace();
    throw new RuntimeException(t);
  } finally {
    if (train != null) train.delete();
    if (model != null) model.delete();
    Scope.exit();
  }
}
private static void addFolder(FileSystem fs, Path p, JsonArray succeeded, JsonArray failed) {
  try {
    if (fs == null) return;
    for (FileStatus file : fs.listStatus(p)) {
      Path pfs = file.getPath();
      if (file.isDir()) {
        addFolder(fs, pfs, succeeded, failed);
      } else {
        Key k = Key.make(pfs.toString());
        long size = file.getLen();
        Value val = null;
        if (pfs.getName().endsWith(Extensions.JSON)) {
          JsonParser parser = new JsonParser();
          JsonObject json = parser.parse(new InputStreamReader(fs.open(pfs))).getAsJsonObject();
          JsonElement v = json.get(Constants.VERSION);
          if (v == null) throw new RuntimeException("Missing version");
          JsonElement type = json.get(Constants.TYPE);
          if (type == null) throw new RuntimeException("Missing type");
          Class c = Class.forName(type.getAsString());
          OldModel model = (OldModel) c.newInstance();
          model.fromJson(json);
        } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
          FSDataInputStream s = fs.open(pfs);
          int sz = (int) Math.min(1L << 20, size); // Read up to the 1st meg
          byte[] mem = MemoryManager.malloc1(sz);
          s.readFully(mem);
          // Convert to a ValueArray (hope it fits in 1Meg!)
          ValueArray ary = new ValueArray(k, 0).read(new AutoBuffer(mem));
          val = new Value(k, ary, Value.HDFS);
        } else if (size >= 2 * ValueArray.CHUNK_SZ) {
          val =
              new Value(
                  k,
                  new ValueArray(k, size),
                  Value.HDFS); // ValueArray byte wrapper over a large file
        } else {
          val = new Value(k, (int) size, Value.HDFS); // Plain Value
          val.setdsk();
        }
        DKV.put(k, val);
        Log.info("PersistHdfs: DKV.put(" + k + ")");
        JsonObject o = new JsonObject();
        o.addProperty(Constants.KEY, k.toString());
        o.addProperty(Constants.FILE, pfs.toString());
        o.addProperty(Constants.VALUE_SIZE, file.getLen());
        succeeded.add(o);
      }
    }
  } catch (Exception e) {
    Log.err(e);
    JsonObject o = new JsonObject();
    o.addProperty(Constants.FILE, p.toString());
    o.addProperty(Constants.ERROR, e.getMessage());
    failed.add(o);
  }
}
// Close all AppendableVec
public Futures closeAppendables(Futures fs) {
  _col0 = null; // Reset cache
  int len = vecs().length;
  for (int i = 0; i < len; i++) {
    Vec v = _vecs[i];
    if (v instanceof AppendableVec)
      DKV.put(_keys[i], _vecs[i] = ((AppendableVec) v).close(fs), fs);
  }
  return fs;
}
/**
 * Initialize the ModelBuilder, validating all arguments and preparing the training frame. This
 * call is expected to be overridden in the subclasses and each subclass will start with
 * "super.init();". This call is made by the front-end whenever the GUI is clicked, and needs to
 * be fast; heavy-weight prep needs to wait for the trainModel() call.
 *
 * <p>Validate the requested ntrees; precompute actual ntrees. Validate the number of classes to
 * predict on; validate a checkpoint.
 */
@Override
public void init(boolean expensive) {
  super.init(expensive);
  if (H2O.ARGS.client && _parms._build_tree_one_node)
    error("_build_tree_one_node", "Cannot run on a single node in client mode");
  if (_vresponse != null) _vresponse_key = _vresponse._key;
  if (_response != null) _response_key = _response._key;
  if (_nclass > SharedTreeModel.SharedTreeParameters.MAX_SUPPORTED_LEVELS)
    error("_nclass", "Too many levels in response column!");

  if (_parms._min_rows < 0) error("_min_rows", "Requested min_rows must be greater than 0");

  if (_parms._ntrees < 0 || _parms._ntrees > 100000)
    error("_ntrees", "Requested ntrees must be between 1 and 100000");
  _ntrees = _parms._ntrees; // Total trees in final model
  if (_parms._checkpoint) { // Asking to continue from checkpoint?
    Value cv = DKV.get(_parms._model_id);
    if (cv != null) { // Look for prior model
      M checkpointModel = cv.get();
      if (_parms._ntrees < checkpointModel._output._ntrees + 1)
        error(
            "_ntrees",
            "Requested ntrees must be between "
                + checkpointModel._output._ntrees
                + 1
                + " and 100000");
      _ntrees = _parms._ntrees - checkpointModel._output._ntrees; // Needed trees
    }
  }
  if (_parms._nbins <= 1) error("_nbins", "_nbins must be > 1.");
  if (_parms._nbins >= 1 << 16) error("_nbins", "_nbins must be < " + (1 << 16));
  if (_parms._nbins_cats <= 1) error("_nbins_cats", "_nbins_cats must be > 1.");
  if (_parms._nbins_cats >= 1 << 16) error("_nbins_cats", "_nbins_cats must be < " + (1 << 16));
  if (_parms._max_depth <= 0) error("_max_depth", "_max_depth must be > 0.");
  if (_parms._min_rows <= 0) error("_min_rows", "_min_rows must be > 0.");
  if (_parms._distribution == Distributions.Family.tweedie) {
    _parms._distribution.tweedie.p = _parms._tweedie_power;
  }
  if (_train != null) {
    double sumWeights =
        _train.numRows() * (hasWeightCol() ? _train.vec(_parms._weights_column).mean() : 1);
    if (sumWeights < 2 * _parms._min_rows) // Need at least 2*min_rows weighted rows to split even once
      error(
          "_min_rows",
          "The dataset size is too small to split for min_rows="
              + _parms._min_rows
              + ": must have at least "
              + 2 * _parms._min_rows
              + " (weighted) rows, but have only "
              + sumWeights
              + ".");
  }
  if (_train != null) _ncols = _train.numCols() - 1 - numSpecialCols();
}
/**
 * Initialize the ModelBuilder, validating all arguments and preparing the training frame. This
 * call is expected to be overridden in the subclasses and each subclass will start with
 * "super.init();". This call is made by the front-end whenever the GUI is clicked, and needs to
 * be fast; heavy-weight prep needs to wait for the trainModel() call.
 *
 * <p>Validate the requested ntrees; precompute actual ntrees. Validate the number of classes to
 * predict on; validate a checkpoint.
 */
@Override
public void init(boolean expensive) {
  super.init(expensive);
  if (H2O.ARGS.client && _parms._build_tree_one_node)
    error("_build_tree_one_node", "Cannot run on a single node in client mode");
  if (_vresponse != null) _vresponse_key = _vresponse._key;
  if (_response != null) _response_key = _response._key;
  if (_parms._min_rows < 0) error("_min_rows", "Requested min_rows must be greater than 0");

  if (_parms._ntrees < 0 || _parms._ntrees > MAX_NTREES)
    error("_ntrees", "Requested ntrees must be between 1 and " + MAX_NTREES);
  _ntrees = _parms._ntrees; // Total trees in final model
  if (_parms.hasCheckpoint()) { // Asking to continue from checkpoint?
    Value cv = DKV.get(_parms._checkpoint);
    if (cv != null) { // Look for prior model
      M checkpointModel = cv.get();
      try {
        _parms.validateWithCheckpoint(checkpointModel._parms);
      } catch (H2OIllegalArgumentException e) {
        error(e.values.get("argument").toString(), e.values.get("value").toString());
      }
      if (_parms._ntrees < checkpointModel._output._ntrees + 1)
        error(
            "_ntrees",
            "If checkpoint is specified then requested ntrees must be higher than "
                + (checkpointModel._output._ntrees + 1));
      // Compute number of trees to build for this checkpoint
      _ntrees = _parms._ntrees - checkpointModel._output._ntrees; // Needed trees
    }
  }
  if (_parms._nbins <= 1) error("_nbins", "_nbins must be > 1.");
  if (_parms._nbins >= 1 << 16) error("_nbins", "_nbins must be < " + (1 << 16));
  if (_parms._nbins_cats <= 1) error("_nbins_cats", "_nbins_cats must be > 1.");
  if (_parms._nbins_cats >= 1 << 16) error("_nbins_cats", "_nbins_cats must be < " + (1 << 16));
  if (_parms._max_depth <= 0) error("_max_depth", "_max_depth must be > 0.");
  if (_parms._min_rows <= 0) error("_min_rows", "_min_rows must be > 0.");
  if (_train != null) {
    double sumWeights =
        _train.numRows() * (hasWeightCol() ? _train.vec(_parms._weights_column).mean() : 1);
    if (sumWeights < 2 * _parms._min_rows) // Need at least 2*min_rows weighted rows to split even once
      error(
          "_min_rows",
          "The dataset size is too small to split for min_rows="
              + _parms._min_rows
              + ": must have at least "
              + 2 * _parms._min_rows
              + " (weighted) rows, but have only "
              + sumWeights
              + ".");
  }
  if (_train != null) _ncols = _train.numCols() - 1 - numSpecialCols();
}
@Override
Val apply(Env env, Env.StackHelp stk, AST asts[]) {
  QuantileModel.QuantileParameters parms = new QuantileModel.QuantileParameters();
  Frame fr = stk.track(asts[1].exec(env)).getFrame();
  Frame fr_wkey = new Frame(fr); // Force a bogus Key for Quantiles ModelBuilder
  DKV.put(fr_wkey);
  parms._train = fr_wkey._key;
  parms._probs = ((ASTNumList) asts[2]).expand();
  for (double d : parms._probs)
    if (d < 0 || d > 1)
      throw new IllegalArgumentException("Probability must be between 0 and 1: " + d);
  String inter = asts[3].exec(env).getStr();
  parms._combine_method = QuantileModel.CombineMethod.valueOf(inter.toUpperCase());
  parms._weights_column = asts[4].str().equals("_") ? null : asts[4].str();

  // Compute Quantiles
  QuantileModel q = new Quantile(parms).trainModel().get();

  // Remove bogus Key
  DKV.remove(fr_wkey._key);

  // Reshape all outputs as a Frame, with probs in col 0 and the
  // quantiles in cols 1 thru fr.numCols() - except the optional weights vec
  int ncols = fr.numCols();
  if (parms._weights_column != null) ncols--;
  Vec[] vecs = new Vec[1 /*1 more for the probs themselves*/ + ncols];
  String[] names = new String[vecs.length];
  vecs[0] = Vec.makeCon(null, parms._probs);
  names[0] = "Probs";
  int w = 0;
  for (int i = 0; i < vecs.length - 1; ++i) {
    if (fr._names[i].equals(parms._weights_column)) w = 1;
    assert (w == 0 || w == 1);
    vecs[i + 1] = Vec.makeCon(null, q._output._quantiles[i]);
    names[i + 1] = fr._names[w + i] + "Quantiles";
  }
  q.delete();
  return new ValFrame(new Frame(names, vecs));
}