@SuppressWarnings("unused") // called through reflection by RequestServer public RemoveAllV3 remove(int version, RemoveAllV3 u) { Log.info("Removing all objects"); Futures fs = new Futures(); for (Job j : Job.jobs()) { j.cancel(); j.remove(fs); } fs.blockForPending(); // Bulk brainless key removal. Completely wipes all Keys without regard. new MRTask() { @Override public byte priority() { return H2O.GUI_PRIORITY; } @Override public void setupLocal() { H2O.raw_clear(); water.fvec.Vec.ESPC.clear(); } }.doAllNodes(); Log.info("Finished removing objects"); return u; }
@Override protected Response serve() {
  init();
  link = family.defaultLink; // TODO
  tweedie_link_power = 1 - tweedie_variance_power; // TODO
  _glm = new GLMParams(family, tweedie_variance_power, link, tweedie_link_power);
  if (alpha.length > 1) { // grid search
    if (destination_key == null) destination_key = Key.make("GLMGridResults_" + Key.make());
    if (job_key == null) job_key = Key.make((byte) 0, Key.JOB, H2O.SELF);
    Job j = gridSearch(self(), destination_key, _dinfo, _glm, lambda, lambda_search, alpha,
        higher_accuracy, n_folds);
    return GLMGridView.redirect(this, j.dest());
  } else {
    if (destination_key == null) destination_key = Key.make("GLMModel_" + Key.make());
    if (job_key == null) job_key = Key.make("GLM2Job_" + Key.make());
    fork();
    return GLMProgress.redirect(this, job_key, dest());
  }
}
/** Score a frame with the given model and return the metrics AND the prediction frame. */
@SuppressWarnings("unused") // called through reflection by RequestServer
public JobV3 predict2(int version, final ModelMetricsListSchemaV3 s) {
  // Parameter checking:
  if (null == s.model) throw new H2OIllegalArgumentException("model", "predict", s.model);
  if (null == DKV.get(s.model.name))
    throw new H2OKeyNotFoundArgumentException("model", "predict", s.model.name);
  if (null == s.frame) throw new H2OIllegalArgumentException("frame", "predict", s.frame);
  if (null == DKV.get(s.frame.name))
    throw new H2OKeyNotFoundArgumentException("frame", "predict", s.frame.name);

  final ModelMetricsList parms = s.createAndFillImpl();

  // predict2 does not return model metrics, so it cannot handle DeepLearning:
  // reconstruction_error (anomaly) or GLRM: reconstruct and archetypes.
  // predict2 can handle DeepLearning: deepfeatures and predict.
  if (s.deep_features_hidden_layer > 0) {
    if (null == parms._predictions_name)
      parms._predictions_name = "deep_features" + Key.make().toString().substring(0, 5) + "_"
          + parms._model._key.toString() + "_on_" + parms._frame._key.toString();
  } else if (null == parms._predictions_name) {
    parms._predictions_name = "predictions" + Key.make().toString().substring(0, 5) + "_"
        + parms._model._key.toString() + "_on_" + parms._frame._key.toString();
  }

  final Job<Frame> j =
      new Job(Key.make(parms._predictions_name), Frame.class.getName(), "prediction");

  H2O.H2OCountedCompleter work = new H2O.H2OCountedCompleter() {
    @Override public void compute2() {
      if (s.deep_features_hidden_layer < 0) {
        parms._model.score(parms._frame, parms._predictions_name, j);
      } else {
        Frame predictions = ((Model.DeepFeatures) parms._model)
            .scoreDeepFeatures(parms._frame, s.deep_features_hidden_layer, j);
        predictions = new Frame(
            Key.make(parms._predictions_name), predictions.names(), predictions.vecs());
        DKV.put(predictions._key, predictions);
      }
      tryComplete();
    }
  };
  j.start(work, parms._frame.anyVec().nChunks());
  return new JobV3().fillFromImpl(j);
}
public JobsV3 cancel(int version, JobsV3 c) {
  Job j = DKV.getGet(c.job_id.key());
  if (j == null) {
    throw new IllegalArgumentException("No job with key " + c.job_id.key());
  }
  j.stop(); // Request Job stop
  return c;
}
@Override public boolean toHTML(StringBuilder sb) {
  Job jjob = Job.findJob(job_key);
  DRFModel m = UKV.get(jjob.dest());
  if (m != null) m.generateHTML("DRF Model", sb);
  else DocGen.HTML.paragraph(sb, "Pending...");
  return true;
}
/** Finds a job with the given dest key, or returns null. */
public static final Job findJobByDest(final Key destKey) {
  Job job = null;
  for (Job current : Job.all()) {
    if (current.dest().equals(destKey)) {
      job = current;
      break;
    }
  }
  return job;
}
/** Finds a job with the given key, or returns null. */
public static final Job findJob(final Key key) {
  Job job = null;
  for (Job current : Job.all()) {
    if (current.self().equals(key)) {
      job = current;
      break;
    }
  }
  return job;
}
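// Hypothetical usage sketch for the two lookups above (the helper and key name
// are assumptions, not taken from the surrounding code): try to resolve a key
// as a job's own key first, then fall back to treating it as a destination key.
static Job resolveJob(Key someKey) {
  Job bySelf = Job.findJob(someKey);                           // matches on job.self()
  return bySelf != null ? bySelf : Job.findJobByDest(someKey); // matches on job.dest()
}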
// Expand grid search related argument sets
@Override protected NanoHTTPD.Response serveGrid(NanoHTTPD server, Properties parms, RequestType type) {
  String[][] values = new String[_arguments.size()][];
  boolean gridSearch = false;
  for (int i = 0; i < _arguments.size(); i++) {
    Argument arg = _arguments.get(i);
    if (arg._gridable) {
      String value = _parms.getProperty(arg._name);
      if (value != null) {
        // Skip the grid if the argument is an array, unless it is an imbricated
        // expression. A little hackish; waiting for a real language.
        boolean imbricated = value.contains("(");
        if (!arg._field.getType().isArray() || imbricated) {
          values[i] = split(value);
          if (values[i] != null && values[i].length > 1) gridSearch = true;
        } else if (arg._field.getType().isArray() && !imbricated) { // Copy values which are arrays
          values[i] = new String[] { value };
        }
      }
    }
  }
  if (!gridSearch) return superServeGrid(server, parms, type);

  // Ignore the destination key so that each job gets its own
  _parms.remove("destination_key");
  for (int i = 0; i < _arguments.size(); i++)
    if (_arguments.get(i)._name.equals("destination_key")) values[i] = null;

  // Iterate over all argument combinations
  int[] counters = new int[values.length];
  ArrayList<Job> jobs = new ArrayList<Job>();
  for (;;) {
    Job job = (Job) create(_parms);
    Properties combination = new Properties();
    for (int i = 0; i < values.length; i++) {
      if (values[i] != null) {
        String value = values[i][counters[i]].trim();
        combination.setProperty(_arguments.get(i)._name, value);
        _arguments.get(i).reset();
        _arguments.get(i).check(job, value);
      }
    }
    job._parms = combination;
    jobs.add(job);
    if (!increment(counters, values)) break;
  }
  GridSearch grid = new GridSearch();
  grid.jobs = jobs.toArray(new Job[jobs.size()]);
  return grid.superServeGrid(server, parms, type);
}
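// The cross-product loop above relies on an increment(...) helper that is not
// shown in these snippets. A minimal odometer-style sketch of what it
// presumably does (assumed behavior, not the actual H2O implementation):
// advance the lowest counter, carrying into the next position until one digit
// does not overflow; a full wrap-around means every combination was visited.
static boolean increment(int[] counters, String[][] values) {
  for (int i = 0; i < counters.length; i++) {
    if (values[i] == null) continue;                    // dimension not part of the grid
    if (++counters[i] < values[i].length) return true;  // advanced without overflow
    counters[i] = 0;                                    // overflow: reset and carry
  }
  return false;                                         // wrapped around: enumeration done
}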
// @Ignore("PUBDEV-1643") @Test public void testDuplicatesCarsGrid() { Grid grid = null; Frame fr = null; Vec old = null; try { fr = parse_test_file("smalldata/junit/cars_20mpg.csv"); fr.remove("name").remove(); // Remove unique id old = fr.remove("economy"); fr.add("economy", old); // response to last column DKV.put(fr); // Setup random hyperparameter search space HashMap<String, Object[]> hyperParms = new HashMap<String, Object[]>() { { put("_ntrees", new Integer[] {5, 5}); put("_max_depth", new Integer[] {2, 2}); put("_mtries", new Integer[] {-1, -1}); put("_sample_rate", new Double[] {.1, .1}); } }; // Fire off a grid search DRFModel.DRFParameters params = new DRFModel.DRFParameters(); params._train = fr._key; params._response_column = "economy"; // Get the Grid for this modeling class and frame Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms); grid = gs.get(); // Check that duplicate model have not been constructed Model[] models = grid.getModels(); assertTrue("Number of returned models has to be > 0", models.length > 0); // But all off them should be same Key<Model> modelKey = models[0]._key; for (Model m : models) { assertTrue("Number of constructed models has to be equal to 1", modelKey == m._key); } } finally { if (old != null) { old.remove(); } if (fr != null) { fr.remove(); } if (grid != null) { grid.remove(); } } }
/**
 * Block synchronously, waiting for a job to end, successfully or not.
 *
 * @param jobkey Job to wait for.
 * @param pollingIntervalMillis Polling interval sleep time.
 */
public static void waitUntilJobEnded(Key jobkey, int pollingIntervalMillis) {
  while (true) {
    if (Job.isEnded(jobkey)) return;
    try {
      Thread.sleep(pollingIntervalMillis);
    } catch (Exception ignore) { /* keep polling */ }
  }
}
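// Hypothetical convenience wrapper (not part of the snippets above): block on a
// job via the helper defined above, polling once per second, then log that it
// ended. job.self() is the job's own key, as used elsewhere in these snippets.
static void awaitAndLog(Job job) {
  waitUntilJobEnded(job.self(), 1000);
  Log.info("Job " + job.self() + " has ended");
}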
@Override protected Response serve() {
  init();
  link = family.defaultLink; // TODO
  tweedie_link_power = 1 - tweedie_variance_power; // TODO
  Frame fr = DataInfo.prepareFrame(source, response, ignored_cols, family == Family.binomial, true);
  _dinfo = new DataInfo(fr, 1, standardize);
  _glm = new GLMParams(family, tweedie_variance_power, link, tweedie_link_power);
  if (alpha.length > 1) { // grid search
    if (destination_key == null) destination_key = Key.make("GLMGridModel_" + Key.make());
    if (job_key == null) job_key = Key.make("GLMGridJob_" + Key.make());
    Job j = gridSearch(self(), destination_key, _dinfo, _glm, lambda, alpha, n_folds);
    return GLMGridView.redirect(this, j.dest());
  } else {
    if (destination_key == null) destination_key = Key.make("GLMModel_" + Key.make());
    if (job_key == null) job_key = Key.make("GLM2Job_" + Key.make());
    fork();
    return GLMProgress.redirect(this, job_key, dest());
  }
}
@Override public void lcompute() {
  // Optional: cancel all jobs
  // for (Job job : Job.all()) {
  //   job.cancel();
  //   Job.waitUntilJobEnded(job.self());
  // }
  final Set<Key> keySet = H2O.globalKeySet(null);
  for (Key key : keySet) {
    if (!key.home()) continue; // only unlock local keys
    final Value val = DKV.get(key);
    if (val == null) continue;
    if (val.rawPOJO() == null) continue; // need to have a POJO to be locked
    if (!val.isLockable()) continue;
    final Object obj = val.rawPOJO();
    assert (obj instanceof Lockable<?>);
    final Lockable<?> lockable = (Lockable<?>) (obj);
    final Key[] lockers = ((Lockable) obj)._lockers;
    if (lockers != null) {
      // check that none of the locking jobs is still running
      for (Key locker : lockers) {
        if (locker != null && locker.type() == Key.JOB) {
          final Job job = UKV.get(locker);
          if (job != null && job.isRunning())
            throw new UnsupportedOperationException(
                "Cannot unlock all keys since locking jobs are still running.");
        }
      }
      lockable.unlock_all();
      Log.info("Unlocked key '" + key + "' from " + lockers.length + " lockers.");
    }
  }
  Log.info("All keys are now unlocked.");
  tryComplete();
}
// Start by splitting all the data according to some criteria (minimize
// variance at the leaves). Record on each row which split it goes to, and
// assign a split number to it (for next pass). On *this* pass, use the
// split-number to build a per-split histogram, with a per-histogram-bucket
// variance.
@Override protected GBMModel buildModel(
    GBMModel model, final Frame fr, String names[], String domains[][],
    String[] cmDomain, Timer t_build) {
  // Tag out rows missing the response column
  new ExcludeNAResponse().doAll(fr);

  // Build trees until we hit the limit
  int tid;
  DTree[] ktrees = null;              // Trees
  TreeStats tstats = new TreeStats(); // Tree stats
  for (tid = 0; tid < ntrees; tid++) {
    // During the first iteration the model contains 0 trees, then 1 tree, ...
    // BUT if validation is not specified, the model does not participate in
    // voting; on-the-fly computed data are used instead
    model = doScoring(model, fr, ktrees, tid, cmDomain, tstats, false, false, false);

    // ESL2, page 387
    // Step 2a: Compute prediction (prob distribution) from prior tree results:
    //   Work <== f(Tree)
    new ComputeProb().doAll(fr);

    // ESL2, page 387
    // Step 2b i: Compute residuals from the prediction (probability distribution)
    //   Work <== f(Work)
    new ComputeRes().doAll(fr);

    // ESL2, page 387, Step 2b ii, iii, iv
    Timer kb_timer = new Timer();
    ktrees = buildNextKTrees(fr);
    Log.info(Sys.GBM__, (tid + 1) + ". tree was built in " + kb_timer.toString());
    if (!Job.isRunning(self())) break; // If canceled during building, do not bulkscore

    // Check latest predictions
    tstats.updateBy(ktrees);
  }
  // Final scoring
  model = doScoring(model, fr, ktrees, tid, cmDomain, tstats, true, false, false);
  return model;
}
@Override protected DRFModel buildModel(
    DRFModel model, final Frame fr, String names[], String domains[][], final Timer t_build) {
  // Append number of trees participating in on-the-fly scoring
  fr.add("OUT_BAG_TREES", response.makeZero());
  // The RNG used to pick split columns
  Random rand = createRNG(_seed);
  // Prepare working columns
  new SetWrkTask().doAll(fr);

  int tid;
  DTree[] ktrees = null;
  // Prepare tree statistics
  TreeStats tstats = new TreeStats();
  // Build trees until we hit the limit
  for (tid = 0; tid < ntrees; tid++) { // Building the tid-th tree
    model = doScoring(
        model, fr, ktrees, tid, tstats, tid == 0, !hasValidation(), build_tree_one_node);
    // At each iteration build K trees (K = nclass = response column domain size)
    // TODO: parallelize more? Building more than K trees at a time would require
    // care with the temporary data. Idea: launch more DRFs at once.
    Timer kb_timer = new Timer();
    ktrees = buildNextKTrees(fr, _mtry, sample_rate, rand, tid);
    Log.info(Sys.DRF__, (tid + 1) + ". tree was built in " + kb_timer.toString());
    if (!Job.isRunning(self())) break; // If canceled during building, do not bulkscore

    // Check latest predictions
    tstats.updateBy(ktrees);
  }

  model = doScoring(model, fr, ktrees, tid, tstats, true, !hasValidation(), build_tree_one_node);
  // Make sure that we did not miss any votes
  assert !importance
      || _treeMeasuresOnOOB.npredictors() == _treeMeasuresOnSOOB[0 /*variable*/].npredictors()
      : "Missing some tree votes in variable importance voting?!";
  return model;
}
/** List all Jobs known to the cluster, translating each into its schema. */
@SuppressWarnings("unused") // called through reflection by RequestServer
public JobsV3 list(int version, JobsV3 s) {
  Job[] jobs = Job.jobs();
  // Jobs is the impl class for a collection of jobs; it is only used in the API
  // to make it easier to cons up the jobs array via the magic of
  // PojoUtils.copyProperties:
  // Jobs j = new Jobs();
  // j._jobs = Job.jobs();
  // PojoUtils.copyProperties(s, j, PojoUtils.FieldNaming.ORIGIN_HAS_UNDERSCORES);
  s.jobs = new JobV3[jobs.length];
  int i = 0;
  for (Job j : jobs) {
    try {
      s.jobs[i] = (JobV3) Schema.schema(version, j).fillFromImpl(j);
    } catch (H2ONotFoundArgumentException e) {
      // no special schema for this job subclass, so fall back to JobV3
      s.jobs[i] = new JobV3().fillFromImpl(j);
    }
    i++; // Java does the increment before the function call which throws?!
  }
  return s;
}
/**
 * Creates a new ValueArray with classes. The new ValueArray is unfortunately not
 * aligned with the source one, so results have to be sent to each chunk owner
 * using Atomic.
 */
@Override public void map(Key key) {
  assert key.home();
  if (Job.isRunning(_job.self())) {
    ValueArray va = DKV.get(_arykey).get();
    AutoBuffer bits = va.getChunk(key);
    long startRow = va.startRow(ValueArray.getChunkIndex(key));
    int rows = va.rpc(ValueArray.getChunkIndex(key));
    int rpc = (int) (ValueArray.CHUNK_SZ / ROW_SIZE);
    long chunk = ValueArray.chknum(startRow, va.numRows(), ROW_SIZE);
    long updatedChk = chunk;
    long updatedRow = startRow;
    double[] values = new double[_cols.length - 1];
    ClusterDist cd = new ClusterDist();
    int[] clusters = new int[rows];
    int count = 0;
    for (int row = 0; row < rows; row++) {
      KMeans.datad(va, bits, row, _cols, _normalized, values);
      KMeans.closest(_clusters, values, cd);
      chunk = ValueArray.chknum(startRow + row, va.numRows(), ROW_SIZE);
      if (chunk != updatedChk) {
        updateClusters(clusters, count, updatedChk, va.numRows(), rpc, updatedRow);
        updatedChk = chunk;
        updatedRow = startRow + row;
        count = 0;
      }
      clusters[count++] = cd._cluster;
    }
    if (count > 0) updateClusters(clusters, count, chunk, va.numRows(), rpc, updatedRow);
    _job.updateProgress(1);
  }
  _job = null;
  _arykey = null;
  _cols = null;
  _clusters = null;
}
@Override protected void onCancelled() {
  for (Job job : jobs) job.cancel();
}
// @Ignore("PUBDEV-1648") @Test public void testRandomCarsGrid() { Grid grid = null; DRFModel drfRebuilt = null; Frame fr = null; try { fr = parse_test_file("smalldata/junit/cars.csv"); fr.remove("name").remove(); Vec old = fr.remove("economy (mpg)"); fr.add("economy (mpg)", old); // response to last column DKV.put(fr); // Setup random hyperparameter search space HashMap<String, Object[]> hyperParms = new HashMap<>(); // Construct random grid search space long seed = System.nanoTime(); Random rng = new Random(seed); // Limit to 1-3 randomly, 4 times. Average total number of models is // 2^4, or 16. Max is 81 models. Integer ntreesDim = rng.nextInt(3) + 1; Integer maxDepthDim = rng.nextInt(3) + 1; Integer mtriesDim = rng.nextInt(3) + 1; Integer sampleRateDim = rng.nextInt(3) + 1; Integer[] ntreesArr = interval(1, 15); ArrayList<Integer> ntreesList = new ArrayList<>(Arrays.asList(ntreesArr)); Collections.shuffle(ntreesList); Integer[] ntreesSpace = new Integer[ntreesDim]; for (int i = 0; i < ntreesDim; i++) { ntreesSpace[i] = ntreesList.get(i); } Integer[] maxDepthArr = interval(1, 10); ArrayList<Integer> maxDepthList = new ArrayList<>(Arrays.asList(maxDepthArr)); Collections.shuffle(maxDepthList); Integer[] maxDepthSpace = new Integer[maxDepthDim]; for (int i = 0; i < maxDepthDim; i++) { maxDepthSpace[i] = maxDepthList.get(i); } Integer[] mtriesArr = interval(1, 5); ArrayList<Integer> mtriesList = new ArrayList<>(Arrays.asList(mtriesArr)); Collections.shuffle(mtriesList); Integer[] mtriesSpace = new Integer[mtriesDim]; for (int i = 0; i < mtriesDim; i++) { mtriesSpace[i] = mtriesList.get(i); } Double[] sampleRateArr = interval(0.01, 0.99, 0.01); ArrayList<Double> sampleRateList = new ArrayList<>(Arrays.asList(sampleRateArr)); Collections.shuffle(sampleRateList); Double[] sampleRateSpace = new Double[sampleRateDim]; for (int i = 0; i < sampleRateDim; i++) { sampleRateSpace[i] = sampleRateList.get(i); } hyperParms.put("_ntrees", ntreesSpace); hyperParms.put("_max_depth", maxDepthSpace); hyperParms.put("_mtries", mtriesSpace); hyperParms.put("_sample_rate", sampleRateSpace); // Fire off a grid search DRFModel.DRFParameters params = new DRFModel.DRFParameters(); params._train = fr._key; params._response_column = "economy (mpg)"; // Get the Grid for this modeling class and frame Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms); grid = gs.get(); System.out.println("Test seed: " + seed); System.out.println("ntrees search space: " + Arrays.toString(ntreesSpace)); System.out.println("max_depth search space: " + Arrays.toString(maxDepthSpace)); System.out.println("mtries search space: " + Arrays.toString(mtriesSpace)); System.out.println("sample_rate search space: " + Arrays.toString(sampleRateSpace)); // Check that cardinality of grid Model[] ms = grid.getModels(); int numModels = ms.length; System.out.println("Grid consists of " + numModels + " models"); assertEquals( "Number of models should match hyper space size", numModels, ntreesDim * maxDepthDim * sampleRateDim * mtriesDim + grid.getFailureCount()); // Pick a random model from the grid HashMap<String, Object[]> randomHyperParms = new HashMap<>(); Integer ntreeVal = ntreesSpace[rng.nextInt(ntreesSpace.length)]; randomHyperParms.put("_ntrees", new Integer[] {ntreeVal}); Integer maxDepthVal = maxDepthSpace[rng.nextInt(maxDepthSpace.length)]; randomHyperParms.put("_max_depth", maxDepthSpace); Integer mtriesVal = mtriesSpace[rng.nextInt(mtriesSpace.length)]; randomHyperParms.put("_max_depth", mtriesSpace); Double sampleRateVal = 
sampleRateSpace[rng.nextInt(sampleRateSpace.length)]; randomHyperParms.put("_sample_rate", sampleRateSpace); // TODO: DRFModel drfFromGrid = (DRFModel) g2.model(randomHyperParms).get(); // Rebuild it with it's parameters params._ntrees = ntreeVal; params._max_depth = maxDepthVal; params._mtries = mtriesVal; drfRebuilt = new DRF(params).trainModel().get(); // Make sure the MSE metrics match // double fromGridMSE = drfFromGrid._output._scored_train[drfFromGrid._output._ntrees]._mse; double rebuiltMSE = drfRebuilt._output._scored_train[drfRebuilt._output._ntrees]._mse; // System.out.println("The random grid model's MSE: " + fromGridMSE); System.out.println("The rebuilt model's MSE: " + rebuiltMSE); // assertEquals(fromGridMSE, rebuiltMSE); } finally { if (fr != null) { fr.remove(); } if (grid != null) { grid.remove(); } if (drfRebuilt != null) { drfRebuilt.remove(); } } }
/** Return {@link Response} for a finished job. */
@Override protected Response jobDone(final Job job, final Key dst) {
  // Note: args is built here but not passed on; only the redirect is returned.
  JsonObject args = new JsonObject();
  args.addProperty(MODEL_KEY, job.dest().toString());
  return DRFModelView.redirect(this, job.dest());
}
// --------------------------------------------------------------------------
// Build the next k-trees, which try to correct the residual error from
// the prior trees. From ESL2, page 387: Step 2b ii, iii.
private DTree[] buildNextKTrees(Frame fr) {
  // We're going to build K (nclass) trees - each focused on correcting
  // errors for a single class.
  final DTree[] ktrees = new DTree[_nclass];

  // Initial set of histograms. All trees; one leaf per tree (the root
  // leaf); all columns.
  DHistogram hcs[][][] = new DHistogram[_nclass][1 /*just root leaf*/][_ncols];

  for (int k = 0; k < _nclass; k++) {
    // Initially setup as-if an empty-split had just happened
    if (_distribution == null || _distribution[k] != 0) {
      // The Boolean Optimization
      // This optimization assumes the 2nd tree of a 2-class system is the
      // inverse of the first. This is false for DRF (and true for GBM) -
      // DRF picks a random different set of columns for the 2nd tree.
      if (k == 1 && _nclass == 2) continue;
      ktrees[k] = new DTree(fr._names, _ncols, (char) nbins, (char) _nclass, min_rows);
      new GBMUndecidedNode(
          ktrees[k], -1, DHistogram.initialHist(fr, _ncols, nbins, hcs[k][0], false)); // The "root" node
    }
  }
  int[] leafs = new int[_nclass]; // Define a "working set" of leaf splits, from here to tree._len

  // ----
  // ESL2, page 387. Step 2b ii.
  // One Big Loop till the ktrees are of proper depth.
  // Adds a layer to the trees each pass.
  int depth = 0;
  for (; depth < max_depth; depth++) {
    if (!Job.isRunning(self())) return null;
    hcs = buildLayer(fr, ktrees, leafs, hcs, false, false);
    // If we did not make any new splits, then the tree is split-to-death
    if (hcs == null) break;
  }

  // Each tree bottomed-out in a DecidedNode; go 1 more level and insert
  // LeafNodes to hold predictions.
  for (int k = 0; k < _nclass; k++) {
    DTree tree = ktrees[k];
    if (tree == null) continue;
    int leaf = leafs[k] = tree.len();
    for (int nid = 0; nid < leaf; nid++) {
      if (tree.node(nid) instanceof DecidedNode) {
        DecidedNode dn = tree.decided(nid);
        for (int i = 0; i < dn._nids.length; i++) {
          int cnid = dn._nids[i];
          if (cnid == -1 || // Bottomed out (predictors or responses known constant)
              tree.node(cnid) instanceof UndecidedNode || // Or chopped off for depth
              (tree.node(cnid) instanceof DecidedNode &&  // Or not possible to split
               ((DecidedNode) tree.node(cnid))._split.col() == -1))
            dn._nids[i] = new GBMLeafNode(tree, nid).nid(); // Mark a leaf here
        }
        // Handle the trivial non-splitting tree
        if (nid == 0 && dn._split.col() == -1) new GBMLeafNode(tree, -1, 0);
      }
    }
  } // -- k-trees are done

  // ----
  // ESL2, page 387. Step 2b iii. Compute the gammas, and store them back
  // into the tree leaves. Includes learn_rate.
  //   gamma_i_k = (nclass-1)/nclass * (sum res_i / sum (|res_i|*(1-|res_i|)))
  // For regression:
  //   gamma_i_k = sum res_i / count(res_i)
  GammaPass gp = new GammaPass(ktrees, leafs).doAll(fr);
  double m1class = _nclass > 1 ? (double) (_nclass - 1) / _nclass : 1.0; // (K-1)/K
  for (int k = 0; k < _nclass; k++) {
    final DTree tree = ktrees[k];
    if (tree == null) continue;
    for (int i = 0; i < tree._len - leafs[k]; i++) {
      double g = gp._gss[k][i] == 0 // Constant response?
          ? (gp._rss[k][i] == 0 ? 0 : 1000) // Cap (exponential) learn, instead of dealing with Inf
          : learn_rate * m1class * gp._rss[k][i] / gp._gss[k][i];
      assert !Double.isNaN(g);
      ((LeafNode) tree.node(leafs[k] + i))._pred = g;
    }
  }

  // ----
  // ESL2, page 387. Step 2b iv. Cache the sum of all the trees, plus the
  // new tree, in the 'tree' columns. Also, zap the NIDs for next pass.
  //   Tree <== f(Tree)
  //   Nids <== 0
  new MRTask2() {
    @Override public void map(Chunk chks[]) {
      // For all tree/klasses
      for (int k = 0; k < _nclass; k++) {
        final DTree tree = ktrees[k];
        if (tree == null) continue;
        final Chunk nids = chk_nids(chks, k);
        final Chunk ct = chk_tree(chks, k);
        for (int row = 0; row < nids._len; row++) {
          int nid = (int) nids.at80(row);
          if (nid < 0) continue;
          ct.set0(row, (float) (ct.at0(row) + ((LeafNode) tree.node(nid))._pred));
          nids.set0(row, 0);
        }
      }
    }
  }.doAll(fr);

  // Collect leaves stats
  for (int i = 0; i < ktrees.length; i++)
    if (ktrees[i] != null) ktrees[i].leaves = ktrees[i].len() - leafs[i];
  // DEBUG: Print the generated K trees
  // printGenerateTrees(ktrees);
  return ktrees;
}
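// Worked numeric example of the leaf-gamma formula quoted above (the residual
// values are made up for illustration). For K = 3 classes and a leaf holding
// residuals {0.2, -0.1, 0.4}:
//   rss = 0.2 - 0.1 + 0.4                          = 0.5
//   gss = 0.2*0.8 + 0.1*0.9 + 0.4*0.6              = 0.49
//   gamma = learn_rate * ((K-1)/K) * rss / gss     = learn_rate * (2/3) * 1.0204...
// matching the learn_rate * m1class * _rss / _gss expression in the loop above.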
/**
 * Extracts the values, applies regularization to numerics, adds appropriate offsets to
 * categoricals, and adapts the response according to the CaseMode/CaseValue if set.
 */
@Override public final void map(Chunk[] chunks, NewChunk[] outputs) {
  if (_job != null && _job.self() != null && !Job.isRunning(_job.self()))
    throw new JobCancelledException();
  final int nrows = chunks[0]._len;
  final long offset = chunks[0]._start;
  chunkInit();
  double[] nums = MemoryManager.malloc8d(_dinfo._nums);
  int[] cats = MemoryManager.malloc4(_dinfo._cats);
  double[] response = MemoryManager.malloc8d(_dinfo._responses);
  int start = 0;
  int end = nrows;

  boolean contiguous = false;
  Random skip_rng = null; // random generator for skipping rows
  if (_useFraction < 1.0) {
    skip_rng = water.util.Utils.getDeterRNG(new Random().nextLong());
    if (contiguous) {
      final int howmany = (int) Math.ceil(_useFraction * nrows);
      if (howmany > 0) {
        start = skip_rng.nextInt(nrows - howmany);
        end = start + howmany;
      }
      assert (start < nrows);
      assert (end <= nrows);
    }
  }

  long[] shuf_map = null;
  if (_shuffle) {
    shuf_map = new long[end - start];
    for (int i = 0; i < shuf_map.length; ++i) shuf_map[i] = start + i;
    Utils.shuffleArray(shuf_map, new Random().nextLong());
  }

  OUTER:
  for (int rr = start; rr < end; ++rr) {
    final int r = shuf_map != null ? (int) shuf_map[rr - start] : rr;
    if ((_dinfo._nfolds > 0 && (r % _dinfo._nfolds) == _dinfo._foldId)
        || (skip_rng != null && skip_rng.nextFloat() > _useFraction)) continue;
    for (Chunk c : chunks) if (c.isNA0(r)) continue OUTER; // skip rows with NAs!
    int i = 0, ncats = 0;
    for (; i < _dinfo._cats; ++i) {
      int c = (int) chunks[i].at80(r);
      if (c != 0) cats[ncats++] = c + _dinfo._catOffsets[i] - 1;
    }
    final int n = chunks.length - _dinfo._responses;
    for (; i < n; ++i) {
      double d = chunks[i].at0(r);
      if (_dinfo._normMul != null)
        d = (d - _dinfo._normSub[i - _dinfo._cats]) * _dinfo._normMul[i - _dinfo._cats];
      nums[i - _dinfo._cats] = d;
    }
    for (i = 0; i < _dinfo._responses; ++i) {
      response[i] = chunks[chunks.length - _dinfo._responses + i].at0(r);
      if (_dinfo._normRespMul != null)
        response[i] = (response[i] - _dinfo._normRespSub[i]) * _dinfo._normRespMul[i];
    }
    if (outputs != null && outputs.length > 0)
      processRow(offset + r, nums, ncats, cats, response, outputs);
    else
      processRow(offset + r, nums, ncats, cats, response);
  }
  chunkDone();
}
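// Note on the numeric normalization above: assuming _normSub holds the
// per-column means and _normMul the reciprocal standard deviations (suggested
// by the names, not stated in this snippet), the transform is the usual
// standardization d = (x - mu) / sigma, applied only when standardization was
// requested (_normMul != null).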
@Override public float progress() {
  double d = 0.1; // start above zero, presumably so the bar shows progress immediately
  for (Job job : jobs) d += job.progress();
  return Math.min(1f, (float) (d / jobs.length));
}
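// Worked example of the aggregation above (hypothetical values): with two
// sub-jobs at 50% and 100% progress, the result is
//   min(1, (0.1 + 0.5 + 1.0) / 2) = 0.8f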
@Override public void remove() {
  super.remove();
  UKV.remove(_progress);
}
// --------------------------------------------------------------------------
// Build the next random k-trees representing the tid-th tree
private DTree[] buildNextKTrees(Frame fr, int mtrys, float sample_rate, Random rand, int tid) {
  // We're going to build K (nclass) trees - each focused on correcting
  // errors for a single class.
  final DTree[] ktrees = new DTree[_nclass];

  // Initial set of histograms. All trees; one leaf per tree (the root
  // leaf); all columns.
  DHistogram hcs[][][] = new DHistogram[_nclass][1 /*just root leaf*/][_ncols];

  // Use the same seed for all k-trees. NOTE: this is only to make a fair
  // view for all k-trees.
  long rseed = rand.nextLong();

  // Initially setup as-if an empty-split had just happened
  for (int k = 0; k < _nclass; k++) {
    assert (_distribution != null && classification) || (_distribution == null && !classification);
    if (_distribution == null || _distribution[k] != 0) { // Ignore missing classes
      // The Boolean Optimization
      // This optimization assumes the 2nd tree of a 2-class system is the
      // inverse of the first. This is false for DRF (and true for GBM) -
      // DRF picks a random different set of columns for the 2nd tree.
      // if( k==1 && _nclass==2 ) continue;
      ktrees[k] = new DRFTree(fr, _ncols, (char) nbins, (char) _nclass, min_rows, mtrys, rseed);
      boolean isBinom = classification;
      new DRFUndecidedNode(
          ktrees[k], -1, DHistogram.initialHist(fr, _ncols, nbins, hcs[k][0], isBinom)); // The "root" node
    }
  }

  // Sample - mark the lines by putting 'OUT_OF_BAG' into the nid(<klass>) vector
  Timer t_1 = new Timer();
  Sample ss[] = new Sample[_nclass];
  for (int k = 0; k < _nclass; k++)
    if (ktrees[k] != null)
      ss[k] = new Sample((DRFTree) ktrees[k], sample_rate)
          .dfork(0, new Frame(vec_nids(fr, k), vec_resp(fr, k)), build_tree_one_node);
  for (int k = 0; k < _nclass; k++) if (ss[k] != null) ss[k].getResult();
  Log.debug(Sys.DRF__, "Sampling took: " + t_1);

  // Define a "working set" of leaf splits, from leafs[i] to tree._len for each tree i
  int[] leafs = new int[_nclass];

  // ----
  // One Big Loop till the ktrees are of proper depth.
  // Adds a layer to the trees each pass.
  Timer t_2 = new Timer();
  int depth = 0;
  for (; depth < max_depth; depth++) {
    if (!Job.isRunning(self())) return null;
    hcs = buildLayer(fr, ktrees, leafs, hcs, true, build_tree_one_node);
    // If we did not make any new splits, then the tree is split-to-death
    if (hcs == null) break;
  }
  Log.debug(Sys.DRF__, "Tree build took: " + t_2);

  // Each tree bottomed-out in a DecidedNode; go 1 more level and insert
  // LeafNodes to hold predictions.
  Timer t_3 = new Timer();
  for (int k = 0; k < _nclass; k++) {
    DTree tree = ktrees[k];
    if (tree == null) continue;
    int leaf = leafs[k] = tree.len();
    for (int nid = 0; nid < leaf; nid++) {
      if (tree.node(nid) instanceof DecidedNode) {
        DecidedNode dn = tree.decided(nid);
        for (int i = 0; i < dn._nids.length; i++) {
          int cnid = dn._nids[i];
          if (cnid == -1 || // Bottomed out (predictors or responses known constant)
              tree.node(cnid) instanceof UndecidedNode || // Or chopped off for depth
              (tree.node(cnid) instanceof DecidedNode &&  // Or not possible to split
               ((DecidedNode) tree.node(cnid))._split.col() == -1)) {
            LeafNode ln = new DRFLeafNode(tree, nid);
            ln._pred = dn.pred(i);  // Set prediction into the leaf
            dn._nids[i] = ln.nid(); // Mark a leaf here
          }
        }
        // Handle the trivial non-splitting tree
        if (nid == 0 && dn._split.col() == -1) new DRFLeafNode(tree, -1, 0);
      }
    }
  } // -- k-trees are done
  Log.debug(Sys.DRF__, "Nodes propagation: " + t_3);

  // ----
  // Move rows into the final leaf rows
  Timer t_4 = new Timer();
  CollectPreds cp = new CollectPreds(ktrees, leafs).doAll(fr, build_tree_one_node);
  if (importance) {
    if (classification)
      asVotes(_treeMeasuresOnOOB).append(cp.rightVotes, cp.allRows); // Track right votes over OOB rows for this tree
    else /* regression */
      asSSE(_treeMeasuresOnOOB).append(cp.sse, cp.allRows);
  }
  Log.debug(Sys.DRF__, "CollectPreds done: " + t_4);

  // Collect leaves stats
  for (int i = 0; i < ktrees.length; i++)
    if (ktrees[i] != null) ktrees[i].leaves = ktrees[i].len() - leafs[i];
  // DEBUG: Print the generated K trees
  // printGenerateTrees(ktrees);
  return ktrees;
}
/**
 * Extracts the values, applies regularization to numerics, adds appropriate offsets to
 * categoricals, and adapts the response according to the CaseMode/CaseValue if set.
 */
@Override public final void map(Chunk[] chunks, NewChunk[] outputs) {
  if (_jobKey != null && !Job.isRunning(_jobKey)) throw new JobCancelledException();
  final int nrows = chunks[0]._len;
  final long offset = chunks[0].start();
  boolean doWork = chunkInit();
  if (!doWork) return;
  final boolean obs_weights = _dinfo._weights && !_fr.vecs()[_dinfo.weightChunkId()].isConst();
  final double global_weight_sum =
      obs_weights ? _fr.vecs()[_dinfo.weightChunkId()].mean() * _fr.numRows() : 0;

  DataInfo.Row row = _dinfo.newDenseRow();
  double[] weight_map = null;
  double relative_chunk_weight = 1;
  // TODO: store node-local helper arrays in _dinfo -> avoid re-allocation and construction
  if (obs_weights) {
    weight_map = new double[nrows];
    double weight_sum = 0;
    for (int i = 0; i < nrows; ++i) {
      row = _dinfo.extractDenseRow(chunks, i, row);
      weight_sum += row.weight;
      weight_map[i] = weight_sum;
      assert (i == 0 || row.weight == 0 || weight_map[i] > weight_map[i - 1]);
    }
    if (weight_sum > 0) {
      ArrayUtils.div(weight_map, weight_sum); // normalize to 0...1
      relative_chunk_weight = global_weight_sum * nrows / _fr.numRows() / weight_sum;
    } else {
      return; // nothing to do here - all rows have 0 weight
    }
  }

  // Example:
  //   _useFraction = 0.8 -> 1 repeat with fraction = 0.8
  //   _useFraction = 1.0 -> 1 repeat with fraction = 1.0
  //   _useFraction = 1.1 -> 2 repeats with fraction = 0.55
  //   _useFraction = 2.1 -> 3 repeats with fraction = 0.7
  //   _useFraction = 3.0 -> 3 repeats with fraction = 1.0
  final int repeats = (int) Math.ceil(_useFraction * relative_chunk_weight);
  final float fraction = (float) (_useFraction * relative_chunk_weight) / repeats;
  assert (fraction <= 1.0);

  final boolean sample = (fraction < 0.999 || obs_weights || _shuffle);
  final Random skip_rng = sample
      ? RandomUtils.getRNG(
          (0x8734093502429734L + _seed + offset) * (_iteration + 0x9823423497823423L))
      : null;

  long num_processed_rows = 0;
  for (int rep = 0; rep < repeats; ++rep) {
    for (int row_idx = 0; row_idx < nrows; ++row_idx) {
      int r = sample ? -1 : 0;
      // only train with a given number of training samples (fraction*nrows)
      if (sample && !obs_weights && skip_rng.nextDouble() > fraction) continue;
      if (obs_weights && num_processed_rows % 2 == 0) {
        // every second row is randomly sampled -> that way we won't "forget" rare rows
        // importance sampling based on inverse of cumulative distribution
        double key = skip_rng.nextDouble();
        r = Arrays.binarySearch(weight_map, 0, nrows, key);
        // Log.info(Arrays.toString(weight_map));
        // Log.info("key: " + key + " idx: " + (r >= 0 ? r : (-r - 1)));
        if (r < 0) r = -r - 1;
        assert (r == 0 || weight_map[r] > weight_map[r - 1]);
      } else if (r == -1) {
        do {
          r = skip_rng.nextInt(nrows); // random sampling (with replacement)
        }
        // if we have weights, and we did the %2 skipping above, then we need to find
        // an alternate row with non-zero weight
        while (obs_weights
            && ((r == 0 && weight_map[0] == 0) || (r > 0 && weight_map[r] == weight_map[r - 1])));
      } else {
        assert (!obs_weights);
        r = row_idx; // linear scan - slightly faster
      }
      assert (r >= 0 && r <= nrows);

      row = _dinfo.extractDenseRow(chunks, r, row);
      if (!row.bad) {
        assert (row.weight > 0); // check that we never process a row that was held out via row.weight = 0
        long seed = offset + rep * nrows + r;
        if (outputs != null && outputs.length > 0) processRow(seed++, row, outputs);
        else processRow(seed++, row);
      }
      num_processed_rows++;
    }
  }
  assert (fraction != 1 || num_processed_rows == repeats * nrows);
  chunkDone(num_processed_rows);
}
// @Ignore("PUBDEV-1648") @Test public void testRandomCarsGrid() { Grid grid = null; GBMModel gbmRebuilt = null; Frame fr = null; Vec old = null; try { fr = parse_test_file("smalldata/junit/cars.csv"); fr.remove("name").remove(); old = fr.remove("economy (mpg)"); fr.add("economy (mpg)", old); // response to last column DKV.put(fr); // Setup random hyperparameter search space HashMap<String, Object[]> hyperParms = new HashMap<>(); hyperParms.put("_distribution", new DistributionFamily[] {DistributionFamily.gaussian}); // Construct random grid search space Random rng = new Random(); Integer ntreesDim = rng.nextInt(4) + 1; Integer maxDepthDim = rng.nextInt(4) + 1; Integer learnRateDim = rng.nextInt(4) + 1; Integer[] ntreesArr = interval(1, 25); ArrayList<Integer> ntreesList = new ArrayList<>(Arrays.asList(ntreesArr)); Collections.shuffle(ntreesList); Integer[] ntreesSpace = new Integer[ntreesDim]; for (int i = 0; i < ntreesDim; i++) { ntreesSpace[i] = ntreesList.get(i); } Integer[] maxDepthArr = interval(1, 10); ArrayList<Integer> maxDepthList = new ArrayList<>(Arrays.asList(maxDepthArr)); Collections.shuffle(maxDepthList); Integer[] maxDepthSpace = new Integer[maxDepthDim]; for (int i = 0; i < maxDepthDim; i++) { maxDepthSpace[i] = maxDepthList.get(i); } Double[] learnRateArr = interval(0.01, 1.0, 0.01); ArrayList<Double> learnRateList = new ArrayList<>(Arrays.asList(learnRateArr)); Collections.shuffle(learnRateList); Double[] learnRateSpace = new Double[learnRateDim]; for (int i = 0; i < learnRateDim; i++) { learnRateSpace[i] = learnRateList.get(i); } hyperParms.put("_ntrees", ntreesSpace); hyperParms.put("_max_depth", maxDepthSpace); hyperParms.put("_learn_rate", learnRateSpace); // Fire off a grid search GBMModel.GBMParameters params = new GBMModel.GBMParameters(); params._train = fr._key; params._response_column = "economy (mpg)"; // Get the Grid for this modeling class and frame Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms); grid = gs.get(); System.out.println("ntrees search space: " + Arrays.toString(ntreesSpace)); System.out.println("max_depth search space: " + Arrays.toString(maxDepthSpace)); System.out.println("learn_rate search space: " + Arrays.toString(learnRateSpace)); // Check that cardinality of grid Model[] ms = grid.getModels(); Integer numModels = ms.length; System.out.println("Grid consists of " + numModels + " models"); assertTrue(numModels == ntreesDim * maxDepthDim * learnRateDim); // Pick a random model from the grid HashMap<String, Object[]> randomHyperParms = new HashMap<>(); randomHyperParms.put("_distribution", new DistributionFamily[] {DistributionFamily.gaussian}); Integer ntreeVal = ntreesSpace[rng.nextInt(ntreesSpace.length)]; randomHyperParms.put("_ntrees", new Integer[] {ntreeVal}); Integer maxDepthVal = maxDepthSpace[rng.nextInt(maxDepthSpace.length)]; randomHyperParms.put("_max_depth", maxDepthSpace); Double learnRateVal = learnRateSpace[rng.nextInt(learnRateSpace.length)]; randomHyperParms.put("_learn_rate", learnRateSpace); // TODO: GBMModel gbmFromGrid = (GBMModel) g2.model(randomHyperParms).get(); // Rebuild it with it's parameters params._distribution = DistributionFamily.gaussian; params._ntrees = ntreeVal; params._max_depth = maxDepthVal; params._learn_rate = learnRateVal; GBM gbm = new GBM(params); gbmRebuilt = gbm.trainModel().get(); assertTrue(gbm.isStopped()); // Make sure the MSE metrics match // double fromGridMSE = gbmFromGrid._output._scored_train[gbmFromGrid._output._ntrees]._mse; double rebuiltMSE = 
gbmRebuilt._output._scored_train[gbmRebuilt._output._ntrees]._mse; // System.out.println("The random grid model's MSE: " + fromGridMSE); System.out.println("The rebuilt model's MSE: " + rebuiltMSE); // assertEquals(fromGridMSE, rebuiltMSE); } finally { if (old != null) old.remove(); if (fr != null) fr.remove(); if (grid != null) grid.remove(); if (gbmRebuilt != null) gbmRebuilt.remove(); } }
@Override protected Response serve() {
  int tasks = 0;
  int finished = 0;
  RFModel model = _modelKey.value();
  double[] weights = _weights.value();
  // Finish refresh after the rf model is done and the confusion matrix for all trees is computed
  boolean done = false;
  int classCol = _classCol.specified() ? _classCol.value() : findResponseIdx(model);

  tasks = model._totalTrees;
  finished = model.size();

  // Handle cancelled/aborted jobs
  if (_job.value() != null) {
    Job jjob = Job.findJob(_job.value());
    if (jjob != null && jjob.isCancelled())
      return Response.error(jjob.exception == null ? "Job was cancelled by user!" : jjob.exception);
  }
  JsonObject response = defaultJsonResponse();
  // A CM return (and possibly its computation) is requested
  if (!_noCM.value() && (finished == tasks || _iterativeCM.value()) && finished > 0) {
    // Compute the highest number of trees which is less than a threshold
    int modelSize = tasks * _refreshThresholdCM.value() / 100;
    modelSize = modelSize == 0 || finished == tasks ? finished : modelSize * (finished / modelSize);

    // Get the job computing the matrix - if no job is computing, start a new one
    Job cmJob = ConfusionTask.make(
        model, modelSize, _dataKey.value()._key, classCol, weights, _oobee.value());
    // At this point the job is running - it saved a CM which may already be
    // finished or still in an invalid state.
    CMFinal confusion = UKV.get(cmJob.dest());
    // if the matrix is valid, report it in the JSON
    if (confusion != null && confusion.valid() && modelSize > 0) {
      // finished += 1;
      JsonObject cm = new JsonObject();
      JsonArray cmHeader = new JsonArray();
      JsonArray matrix = new JsonArray();
      cm.addProperty(JSON_CM_TYPE, _oobee.value() ? "OOB error estimate" : "full scoring");
      cm.addProperty(JSON_CM_CLASS_ERR, confusion.classError());
      cm.addProperty(JSON_CM_ROWS_SKIPPED, confusion.skippedRows());
      cm.addProperty(JSON_CM_ROWS, confusion.rows());
      // create the header
      for (String s : cfDomain(confusion, 1024)) cmHeader.add(new JsonPrimitive(s));
      cm.add(JSON_CM_HEADER, cmHeader);
      // add the matrix
      final int nclasses = confusion.dimension();
      JsonArray classErrors = new JsonArray();
      for (int crow = 0; crow < nclasses; ++crow) {
        JsonArray row = new JsonArray();
        int classHitScore = 0;
        for (int ccol = 0; ccol < nclasses; ++ccol) {
          row.add(new JsonPrimitive(confusion.matrix(crow, ccol)));
          if (crow != ccol) classHitScore += confusion.matrix(crow, ccol);
        }
        // produces Infinity members in the 0.f/0 case
        classErrors.add(new JsonPrimitive(
            (float) classHitScore / (classHitScore + confusion.matrix(crow, crow))));
        matrix.add(row);
      }
      cm.add(JSON_CM_CLASSES_ERRORS, classErrors);
      cm.add(JSON_CM_MATRIX, matrix);
      cm.addProperty(JSON_CM_TREES, modelSize);
      response.add(JSON_CM, cm);
      // Signal the end if and only if all trees were generated and the confusion matrix is valid
      done = finished == tasks;
    }
  } else if (_noCM.value() && finished == tasks) done = true;

  // Trees
  JsonObject trees = new JsonObject();
  trees.addProperty(Constants.TREE_COUNT, model.size());
  if (model.size() > 0) {
    trees.add(Constants.TREE_DEPTH, model.depth().toJson());
    trees.add(Constants.TREE_LEAVES, model.leaves().toJson());
  }
  response.add(Constants.TREES, trees);

  // Build a response
  Response r;
  if (done) {
    r = jobDone(response);
    r.addHeader(
        "<div class='alert'>"
        + /*RFScore.link(MODEL_KEY, model._key, "Use this model for scoring.")*/
        GeneratePredictionsPage.link(model._key, "Predict!")
        + " </div>");
  } else {
    r = Response.poll(response, finished, tasks);
  }
  r.setBuilder(JSON_CM, new ConfusionMatrixBuilder());
  r.setBuilder(TREES, new TreeListBuilder());
  return r;
}
@Test public void testCarsGrid() {
  Grid<GBMModel.GBMParameters> grid = null;
  Frame fr = null;
  Vec old = null;
  try {
    fr = parse_test_file("smalldata/junit/cars.csv");
    fr.remove("name").remove(); // Remove unique id
    old = fr.remove("cylinders");
    fr.add("cylinders", old.toCategoricalVec()); // response to last column
    DKV.put(fr);

    // Setup hyperparameter search space
    final Double[] legalLearnRateOpts = new Double[] {0.01, 0.1, 0.3};
    final Double[] illegalLearnRateOpts = new Double[] {-1.0};
    HashMap<String, Object[]> hyperParms = new HashMap<String, Object[]>() {{
      put("_ntrees", new Integer[] {1, 2});
      put("_distribution", new DistributionFamily[] {DistributionFamily.multinomial});
      put("_max_depth", new Integer[] {1, 2, 5});
      put("_learn_rate", ArrayUtils.join(legalLearnRateOpts, illegalLearnRateOpts));
    }};

    // Names of the hyper parameters used
    String[] hyperParamNames = hyperParms.keySet().toArray(new String[hyperParms.size()]);
    Arrays.sort(hyperParamNames);
    int hyperSpaceSize = ArrayUtils.crossProductSize(hyperParms);

    // Fire off a grid search
    GBMModel.GBMParameters params = new GBMModel.GBMParameters();
    params._train = fr._key;
    params._response_column = "cylinders";
    // Get the Grid for this modeling class and frame
    Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms);
    grid = (Grid<GBMModel.GBMParameters>) gs.get();

    // Make sure the number of produced models matches the size of the specified hyper space
    Assert.assertEquals(
        "Size of grid (models+failures) should match size of hyper space",
        hyperSpaceSize,
        grid.getModelCount() + grid.getFailureCount());

    //
    // Make sure that the names of the used parameters match
    //
    String[] gridHyperNames = grid.getHyperNames();
    Arrays.sort(gridHyperNames);
    Assert.assertArrayEquals(
        "Hyper parameter names should match!", hyperParamNames, gridHyperNames);

    //
    // Make sure that the values of the used parameters match the specified values as well
    //
    Key<Model>[] mKeys = grid.getModelKeys();
    Map<String, Set<Object>> usedHyperParams = GridTestUtils.initMap(hyperParamNames);
    for (Key<Model> mKey : mKeys) {
      GBMModel gbm = (GBMModel) mKey.get();
      System.out.println(
          gbm._output._scored_train[gbm._output._ntrees]._mse
          + " "
          + Arrays.deepToString(
              ArrayUtils.zip(grid.getHyperNames(), grid.getHyperValues(gbm._parms))));
      GridTestUtils.extractParams(usedHyperParams, gbm._parms, hyperParamNames);
    }
    // Remove illegal options
    hyperParms.put("_learn_rate", legalLearnRateOpts);
    GridTestUtils.assertParamsEqual(
        "Grid model parameters have to cover the specified hyper space",
        hyperParms,
        usedHyperParams);

    // Verify model failures
    Map<String, Set<Object>> failedHyperParams = GridTestUtils.initMap(hyperParamNames);
    for (Model.Parameters failedParams : grid.getFailedParameters()) {
      GridTestUtils.extractParams(failedHyperParams, failedParams, hyperParamNames);
    }
    hyperParms.put("_learn_rate", illegalLearnRateOpts);
    GridTestUtils.assertParamsEqual(
        "Failed model parameters have to correspond to the specified hyper space",
        hyperParms,
        failedHyperParams);
  } finally {
    if (old != null) old.remove();
    if (fr != null) fr.remove();
    if (grid != null) grid.remove();
  }
}
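// Worked size check for the hyper space in the test above:
//   2 (_ntrees) x 1 (_distribution) x 3 (_max_depth) x 4 (_learn_rate: 3 legal + 1 illegal)
//   = 24 combinations,
// which is the value ArrayUtils.crossProductSize(hyperParms) should return and
// which the models+failures assertion compares against.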
/**
 * Check if the given job is running.
 *
 * @param job_key job key
 * @return true if the job is still running, false otherwise.
 */
public static boolean isRunning(Key job_key) {
  Job j = UKV.get(job_key);
  assert j != null : "Job should always be in the DKV!";
  return j.isRunning();
}
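// Hypothetical usage sketch (the helper below is not part of the snippets
// above): request a cancel and then block until the job actually leaves the
// running state, reusing isRunning(...) defined above and the job.cancel() /
// job.self() calls seen elsewhere in these snippets.
static void cancelAndWait(Job job, int pollingIntervalMillis) throws InterruptedException {
  job.cancel();
  while (Job.isRunning(job.self())) {
    Thread.sleep(pollingIntervalMillis); // poll until the cancel takes effect
  }
}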