public final Vec[] vecs() { if (_vecs != null) return _vecs; // Load all Vec headers; load them all in parallel by spawning F/J tasks. final Vec[] vecs = new Vec[_keys.length]; Futures fs = new Futures(); for (int i = 0; i < _keys.length; i++) { final int ii = i; final Key k = _keys[i]; H2OCountedCompleter t = new H2OCountedCompleter() { // We need higher priority here as there is a danger of deadlock in // case of many calls from MRTask2 at once (e.g. frame with many // vectors invokes rollup tasks for all vectors in parallel). Should // probably be done in CPS style in the future @Override public byte priority() { return H2O.MIN_HI_PRIORITY; } @Override public void compute2() { vecs[ii] = DKV.get(k).get(); tryComplete(); } }; H2O.submitTask(t); fs.add(t); } fs.blockForPending(); return _vecs = vecs; }
static void invalidate(H2ONode h2o, Key key, Value newval, Futures fs) { assert newval._key != null && key.home(); // Prevent the new Value from being overwritten by Yet Another PUT by // read-locking it. It's safe to read, but not to over-write, until this // invalidate completes on the *prior* value. newval.read_lock(); // block further writes until all invalidates complete fs.add(RPC.call(h2o, new TaskInvalidateKey(key, newval))); }
/** * On-the-fly version for varimp. After generation a new tree, its tree votes are collected on * shuffled OOB rows and variable importance is recomputed. * * <p>The <a * href="http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp">page</a> says: * <cite> "In every tree grown in the forest, put down the oob cases and count the number of votes * cast for the correct class. Now randomly permute the values of variable m in the oob cases and * put these cases down the tree. Subtract the number of votes for the correct class in the * variable-m-permuted oob data from the number of votes for the correct class in the untouched * oob data. The average of this number over all trees in the forest is the raw importance score * for variable m." </cite> */ @Override protected VarImp doVarImpCalc( final DRFModel model, DTree[] ktrees, final int tid, final Frame fTrain, boolean scale) { // Check if we have already serialized 'ktrees'-trees in the model assert model.ntrees() - 1 == tid : "Cannot compute DRF varimp since 'ktrees' are not serialized in the model! tid=" + tid; assert _treeMeasuresOnOOB.npredictors() - 1 == tid : "Tree votes over OOB rows for this tree (var ktrees) were not found!"; // Compute tree votes over shuffled data final CompressedTree[ /*nclass*/] theTree = model.ctree(tid); // get the last tree FIXME we should pass only keys final int nclasses = model.nclasses(); Futures fs = new Futures(); for (int var = 0; var < _ncols; var++) { final int variable = var; H2OCountedCompleter task4var = classification ? 
new H2OCountedCompleter() { @Override public void compute2() { // Compute this tree votes over all data over given variable TreeVotes cd = TreeMeasuresCollector.collectVotes( theTree, nclasses, fTrain, _ncols, sample_rate, variable); assert cd.npredictors() == 1; asVotes(_treeMeasuresOnSOOB[variable]).append(cd); tryComplete(); } } : /* regression */ new H2OCountedCompleter() { @Override public void compute2() { // Compute this tree votes over all data over given variable TreeSSE cd = TreeMeasuresCollector.collectSSE( theTree, nclasses, fTrain, _ncols, sample_rate, variable); assert cd.npredictors() == 1; asSSE(_treeMeasuresOnSOOB[variable]).append(cd); tryComplete(); } }; H2O.submitTask(task4var); // Fork computation fs.add(task4var); } fs.blockForPending(); // Wait for results // Compute varimp for individual features (_ncols) final float[] varimp = new float[_ncols]; // output variable importance final float[] varimpSD = new float[_ncols]; // output variable importance sd for (int var = 0; var < _ncols; var++) { double[ /*2*/] imp = classification ? asVotes(_treeMeasuresOnSOOB[var]).imp(asVotes(_treeMeasuresOnOOB)) : asSSE(_treeMeasuresOnSOOB[var]).imp(asSSE(_treeMeasuresOnOOB)); varimp[var] = (float) imp[0]; varimpSD[var] = (float) imp[1]; } return new VarImp.VarImpMDA(varimp, varimpSD, model.ntrees()); }
/**
 * Sends a {@code TaskPutKey} built from {@code key}, {@code val} and {@code dontCache} to
 * {@code h2o}, and registers the pending call with {@code fs}.
 *
 * @param h2o target node of the RPC
 * @param key key passed to the put task
 * @param val value passed to the put task
 * @param fs collector that receives the outstanding RPC
 * @param dontCache flag forwarded unchanged to {@code TaskPutKey}
 */
static void put(H2ONode h2o, Key key, Value val, Futures fs, boolean dontCache) {
  final TaskPutKey putTask = new TaskPutKey(key, val, dontCache);
  fs.add(RPC.call(h2o, putTask));
}