public final Vec[] vecs() { if (_vecs != null) return _vecs; // Load all Vec headers; load them all in parallel by spawning F/J tasks. final Vec[] vecs = new Vec[_keys.length]; Futures fs = new Futures(); for (int i = 0; i < _keys.length; i++) { final int ii = i; final Key k = _keys[i]; H2OCountedCompleter t = new H2OCountedCompleter() { // We need higher priority here as there is a danger of deadlock in // case of many calls from MRTask2 at once (e.g. frame with many // vectors invokes rollup tasks for all vectors in parallel). Should // probably be done in CPS style in the future @Override public byte priority() { return H2O.MIN_HI_PRIORITY; } @Override public void compute2() { vecs[ii] = DKV.get(k).get(); tryComplete(); } }; H2O.submitTask(t); fs.add(t); } fs.blockForPending(); return _vecs = vecs; }
/**
 * On-the-fly version for varimp. After generating a new tree, its tree votes are collected on
 * shuffled OOB rows and variable importance is recomputed.
 *
 * <p>The <a
 * href="http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp">page</a> says:
 * <cite> "In every tree grown in the forest, put down the oob cases and count the number of votes
 * cast for the correct class. Now randomly permute the values of variable m in the oob cases and
 * put these cases down the tree. Subtract the number of votes for the correct class in the
 * variable-m-permuted oob data from the number of votes for the correct class in the untouched
 * oob data. The average of this number over all trees in the forest is the raw importance score
 * for variable m." </cite>
 *
 * @param model model holding the serialized trees (must already contain tree {@code tid})
 * @param ktrees trees of the current layer (only used for assertions here)
 * @param tid index of the most recently built tree
 * @param fTrain training frame the per-variable measures are collected over
 * @param scale unused in this implementation
 * @return mean-decrease-accuracy variable importances with per-variable standard deviations
 */
@Override
protected VarImp doVarImpCalc(
    final DRFModel model, DTree[] ktrees, final int tid, final Frame fTrain, boolean scale) {
  // Check if we have already serialized 'ktrees'-trees in the model
  assert model.ntrees() - 1 == tid
      : "Cannot compute DRF varimp since 'ktrees' are not serialized in the model! tid=" + tid;
  assert _treeMeasuresOnOOB.npredictors() - 1 == tid
      : "Tree votes over OOB rows for this tree (var ktrees) were not found!";
  // Compute tree votes over shuffled data
  final CompressedTree[ /*nclass*/] theTree = model.ctree(tid); // get the last tree FIXME we should pass only keys
  final int nclasses = model.nclasses();
  Futures fs = new Futures();
  // Fork one measurement task per predictor column; each permutes that column and re-scores.
  for (int var = 0; var < _ncols; var++) {
    final int variable = var;
    // Classification collects vote counts; regression collects sum-of-squared-errors.
    H2OCountedCompleter task4var =
        classification
            ? new H2OCountedCompleter() {
              @Override
              public void compute2() {
                // Compute this tree votes over all data over given variable
                TreeVotes cd =
                    TreeMeasuresCollector.collectVotes(
                        theTree, nclasses, fTrain, _ncols, sample_rate, variable);
                assert cd.npredictors() == 1;
                asVotes(_treeMeasuresOnSOOB[variable]).append(cd);
                tryComplete();
              }
            }
            : /* regression */ new H2OCountedCompleter() {
              @Override
              public void compute2() {
                // Compute this tree votes over all data over given variable
                TreeSSE cd =
                    TreeMeasuresCollector.collectSSE(
                        theTree, nclasses, fTrain, _ncols, sample_rate, variable);
                assert cd.npredictors() == 1;
                asSSE(_treeMeasuresOnSOOB[variable]).append(cd);
                tryComplete();
              }
            };
    H2O.submitTask(task4var); // Fork computation
    fs.add(task4var);
  }
  fs.blockForPending(); // Wait for results
  // Compute varimp for individual features (_ncols)
  final float[] varimp = new float[_ncols]; // output variable importance
  final float[] varimpSD = new float[_ncols]; // output variable importance sd
  for (int var = 0; var < _ncols; var++) {
    // imp[0] = raw importance (permuted vs. untouched delta), imp[1] = its standard deviation.
    double[ /*2*/] imp =
        classification
            ? asVotes(_treeMeasuresOnSOOB[var]).imp(asVotes(_treeMeasuresOnOOB))
            : asSSE(_treeMeasuresOnSOOB[var]).imp(asSSE(_treeMeasuresOnOOB));
    varimp[var] = (float) imp[0];
    varimpSD[var] = (float) imp[1];
  }
  return new VarImp.VarImpMDA(varimp, varimpSD, model.ntrees());
}
// -------------------------------------------------------------------------- // Build an entire layer of all K trees protected DHistogram[][][] buildLayer( final Frame fr, final int nbins, int nbins_cats, final DTree ktrees[], final int leafs[], final DHistogram hcs[][][], boolean subset, boolean build_tree_one_node) { // Build K trees, one per class. // Build up the next-generation tree splits from the current histograms. // Nearly all leaves will split one more level. This loop nest is // O( #active_splits * #bins * #ncols ) // but is NOT over all the data. ScoreBuildOneTree sb1ts[] = new ScoreBuildOneTree[_nclass]; Vec vecs[] = fr.vecs(); for (int k = 0; k < _nclass; k++) { final DTree tree = ktrees[k]; // Tree for class K if (tree == null) continue; // Build a frame with just a single tree (& work & nid) columns, so the // nested MRTask ScoreBuildHistogram in ScoreBuildOneTree does not try // to close other tree's Vecs when run in parallel. Frame fr2 = new Frame(Arrays.copyOf(fr._names, _ncols + 1), Arrays.copyOf(vecs, _ncols + 1)); fr2.add(fr._names[idx_tree(k)], vecs[idx_tree(k)]); fr2.add(fr._names[idx_work(k)], vecs[idx_work(k)]); fr2.add(fr._names[idx_nids(k)], vecs[idx_nids(k)]); if (idx_weight() >= 0) fr2.add(fr._names[idx_weight()], vecs[idx_weight()]); // Start building one of the K trees in parallel H2O.submitTask( sb1ts[k] = new ScoreBuildOneTree( this, k, nbins, nbins_cats, tree, leafs, hcs, fr2, subset, build_tree_one_node, _improvPerVar, _model._parms._distribution)); } // Block for all K trees to complete. boolean did_split = false; for (int k = 0; k < _nclass; k++) { final DTree tree = ktrees[k]; // Tree for class K if (tree == null) continue; sb1ts[k].join(); if (sb1ts[k]._did_split) did_split = true; } // The layer is done. return did_split ? hcs : null; }
/**
 * Splits the dataset into {@code ratios.length + 1} frames.
 *
 * <p>Locks the source dataset, creates and locks one template frame per split, then forks a
 * child completer which in turn forks one {@code FrameSplitTask} per split. Worker exceptions
 * are collected into {@code workersExceptions} rather than propagated, so all splits run to
 * (possibly exceptional) completion.
 */
@Override
public void compute2() {
  // Lock all possible data
  dataset.read_lock(jobKey);
  // Create a template vector for each segment
  final Vec[][] templates = makeTemplates(dataset, ratios);
  final int nsplits = templates.length;
  assert nsplits == ratios.length + 1 : "Unexpected number of split templates!";
  // Launch number of distributed FJ for each split part
  final Vec[] datasetVecs = dataset.vecs();
  splits = new Frame[nsplits];
  for (int s = 0; s < nsplits; s++) {
    // Each output split is created (and write-locked) up front under its destination key.
    Frame split = new Frame(destKeys[s], dataset.names(), templates[s]);
    split.delete_and_lock(jobKey);
    splits[s] = split;
  }
  // One pending child: the forked completer below.
  setPendingCount(1);
  H2O.submitTask(
      new H2OCountedCompleter(FrameSplitter.this) {
        @Override
        public void compute2() {
          // One pending child per split task.
          setPendingCount(nsplits);
          for (int s = 0; s < nsplits; s++) {
            new FrameSplitTask(
                    new H2OCountedCompleter(this) { // Completer for this task
                      @Override
                      public void compute2() {}

                      @Override
                      public boolean onExceptionalCompletion(
                          Throwable ex, CountedCompleter caller) {
                        synchronized (FrameSplitter
                            .this) { // synchronized on this since can be accessed from
                          // different workers
                          workersExceptions =
                              workersExceptions != null
                                  ? Arrays.copyOf(workersExceptions, workersExceptions.length + 1)
                                  : new Throwable[1];
                          workersExceptions[workersExceptions.length - 1] = ex;
                        }
                        tryComplete(); // we handle the exception so wait perform normal
                        // completion
                        return false;
                      }
                    },
                    datasetVecs,
                    ratios,
                    s)
                .asyncExec(splits[s]);
          }
          tryComplete(); // complete the computation of nsplits-tasks
        }
      });
  tryComplete(); // complete the computation of thrown tasks
}
@Test public void testChunks() { Frame frame = parse_test_file("smalldata/covtype/covtype.20k.data"); AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters(); parms._train = frame._key; parms._radius_scale = 3.0; long start = System.currentTimeMillis(); AggregatorModel agg = new Aggregator(parms).trainModel().get(); // 0.418 System.out.println( "AggregatorModel finished in: " + (System.currentTimeMillis() - start) / 1000. + " seconds"); agg.checkConsistency(); Frame output = agg._output._output_frame.get(); Log.info("Number of exemplars: " + agg._exemplars.length); // Assert.assertTrue(agg._exemplars.length==1993); output.remove(); agg.remove(); for (int i : new int[] {1, 2, 5, 10, 50, 100}) { Key key = Key.make(); RebalanceDataSet rb = new RebalanceDataSet(frame, key, i); H2O.submitTask(rb); rb.join(); Frame rebalanced = DKV.get(key).get(); parms = new AggregatorModel.AggregatorParameters(); parms._train = frame._key; parms._radius_scale = 3.0; start = System.currentTimeMillis(); AggregatorModel agg2 = new Aggregator(parms).trainModel().get(); // 0.373 0.504 0.357 0.454 0.368 0.355 System.out.println( "AggregatorModel finished in: " + (System.currentTimeMillis() - start) / 1000. + " seconds"); agg2.checkConsistency(); Log.info("Number of exemplars for " + i + " chunks: " + agg2._exemplars.length); rebalanced.delete(); Assert.assertTrue( Math.abs(agg._exemplars.length - agg2._exemplars.length) == 0); // < agg._exemplars.length*0); output = agg2._output._output_frame.get(); output.remove(); agg2.remove(); } frame.delete(); }
/**
 * Publishes the grid shell into the DKV, registers a callback that removes the job once every
 * GLM sub-job has finished, and forks the parallel GLM driver.
 *
 * @return this job, already started
 */
@Override
public Job fork() {
  DKV.put(destination_key, new GLMGrid(self(), _jobs));
  assert _maxParallelism >= 1;
  // Fires when the whole grid is done; removing the job marks it as finished.
  final H2OCountedCompleter onGridDone =
      new H2OCallback<ParallelGLMs>() {
        @Override
        public void callback(ParallelGLMs ignored) {
          remove();
        }
      };
  start(onGridDone);
  H2O.submitTask(new ParallelGLMs(this, _jobs, H2O.CLOUD.size(), onGridDone));
  return this;
}
/**
 * Scores a KMeans model over an entire ValueArray in the background, materializing the per-row
 * cluster assignment as a new single-column ValueArray under {@code dest}.
 *
 * @param dest key under which the cluster-assignment result is created
 * @param model trained KMeans model supplying the clusters and normalization flag
 * @param ary input data to score
 * @return the progress-tracking job; it removes itself when scoring completes
 */
public static Job run(final Key dest, final KMeansModel model, final ValueArray ary) {
  final ChunkProgressJob job = new ChunkProgressJob(ary.chunks(), dest);
  // Claim the destination key up front so nothing else can write it while we score.
  new ValueArray(dest, 0).delete_and_lock(job.self());
  final H2OCountedCompleter scoring =
      new H2OCountedCompleter() {
        @Override
        public void compute2() {
          // Distributed pass: assign every row of 'ary' to its nearest cluster.
          KMeansApply apply = new KMeansApply();
          apply._job = job;
          apply._arykey = ary._key;
          apply._cols = model.columnMapping(ary.colNames());
          apply._clusters = model._clusters;
          apply._normalized = model._normalized;
          apply.invoke(ary._key);
          // Describe the single output column holding the cluster index.
          Column col = new Column();
          col._name = Constants.RESPONSE;
          col._size = ROW_SIZE;
          col._scale = 1;
          col._min = 0;
          col._max = model._clusters.length;
          col._mean = Double.NaN;
          col._sigma = Double.NaN;
          col._domain = null;
          col._n = ary.numRows();
          ValueArray result = new ValueArray(dest, ary.numRows(), col._size, new Column[] {col});
          result.unlock(job.self());
          job.remove();
          tryComplete();
        }

        @Override
        public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
          job.onException(ex); // Surface worker failures through the job.
          return super.onExceptionalCompletion(ex, caller);
        }
      };
  job.start(scoring);
  H2O.submitTask(scoring);
  return job;
}
@Override protected Frame rebalance(final Frame original_fr, boolean local, final String name) { if (original_fr == null) return null; if (_parms._force_load_balance) { int original_chunks = original_fr.anyVec().nChunks(); _job.update(0, "Load balancing " + name.substring(name.length() - 5) + " data..."); int chunks = desiredChunks(original_fr, local); if (!_parms._reproducible) { if (original_chunks >= chunks) { if (!_parms._quiet_mode) Log.info( "Dataset already contains " + original_chunks + " chunks. No need to rebalance."); return original_fr; } } else { // reproducible, set chunks to 1 assert chunks == 1; if (!_parms._quiet_mode) Log.warn("Reproducibility enforced - using only 1 thread - can be slow."); if (original_chunks == 1) return original_fr; } if (!_parms._quiet_mode) Log.info( "Rebalancing " + name.substring(name.length() - 5) + " dataset into " + chunks + " chunks."); Key newKey = Key.make(name + ".chks" + chunks); RebalanceDataSet rb = new RebalanceDataSet(original_fr, newKey, chunks); H2O.submitTask(rb).join(); Frame rebalanced_fr = DKV.get(newKey).get(); Scope.track(rebalanced_fr); return rebalanced_fr; } return original_fr; }
/**
 * Runs n-fold cross-validation for the given model at one lambda value.
 *
 * <p>Builds one GLM2 per fold over the fold-specific training slice, runs them all via
 * ParallelGLMs, then — in the completion callback — fetches the per-fold models from the DKV and
 * launches a GLMXValidationTask to score them, completing {@code cmp} when done.
 *
 * <p>NOTE(review): the parameter is spelled {@code lambdaIxd} but the surrounding code reads the
 * field {@code _lambdaIdx} for the key suffix and the lambda value, while {@code lambdaIxd} is
 * only passed to {@code model.norm_beta(...)} — confirm the two indices are always equal.
 *
 * @param model parent model whose settings (alpha, beta_eps, warm-start betas) seed each fold
 * @param lambdaIxd index of the lambda value being cross-validated
 * @param cmp completer to notify once cross-validation scoring finishes
 */
private void xvalidate(final GLMModel model, int lambdaIxd, final H2OCountedCompleter cmp) {
  final Key[] keys = new Key[n_folds];
  GLM2[] glms = new GLM2[n_folds];
  // One GLM job per fold; each gets its own destination key recorded in 'keys'.
  for (int i = 0; i < n_folds; ++i)
    glms[i] =
        new GLM2(
            this.description + "xval " + i,
            self(),
            keys[i] = Key.make(destination_key + "_" + _lambdaIdx + "_xval" + i),
            _dinfo.getFold(i, n_folds),
            _glm,
            new double[] {lambda[_lambdaIdx]},
            model.alpha,
            0,
            model.beta_eps,
            self(),
            model.norm_beta(lambdaIxd),
            higher_accuracy,
            prior,
            0);
  H2O.submitTask(
      new ParallelGLMs(
          GLM2.this,
          glms,
          H2O.CLOUD.size(),
          new H2OCallback(GLM2.this) {
            @Override
            public void callback(H2OCountedCompleter t) {
              GLMModel[] models = new GLMModel[keys.length];
              // we got the xval models, now compute their validations...
              for (int i = 0; i < models.length; ++i) models[i] = DKV.get(keys[i]).get();
              new GLMXValidationTask(model, _lambdaIdx, models, cmp)
                  .asyncExec(_dinfo._adaptedFrame);
            }
          }));
}
/**
 * Implements rbind: row-binds the argument frames/scalars into one new frame.
 *
 * <p>Each argument becomes either a Frame (rows inlined as-is) or a scalar (expanded to a single
 * replicated row). All frames must agree on column count, names, and types; categorical domains
 * may differ and are unioned. The result's chunk layout (ESPC) is the concatenation of the
 * inputs' layouts, and the actual row copying is done by a ParallelRbinds task.
 */
@Override
Val apply(Env env, Env.StackHelp stk, AST asts[]) {
  // Execute all args. Find a canonical frame; all Frames must look like this one.
  // Each argument turns into either a Frame (whose rows are entirely
  // inlined) or a scalar (which is replicated across as a single row).
  Frame fr = null; // Canonical Frame; all frames have the same column count, types and names
  int nchks = 0; // Total chunks
  Val vals[] = new Val[asts.length]; // Computed AST results
  for (int i = 1; i < asts.length; i++) {
    vals[i] = stk.track(asts[i].exec(env));
    if (vals[i].isFrame()) {
      fr = vals[i].getFrame();
      nchks += fr.anyVec().nChunks(); // Total chunks
    } else nchks++; // One chunk per scalar
  }
  // No Frame, just a pile-o-scalars?
  Vec zz = null; // The zero-length vec for the zero-frame frame
  if (fr == null) { // Zero-length, 1-column, default name
    fr = new Frame(new String[] {Frame.defaultColName(0)}, new Vec[] {zz = Vec.makeZero(0)});
    if (asts.length == 1) return new ValFrame(fr);
  }
  // Verify all Frames are the same columns, names, and types. Domains can vary, and will be the
  // union
  final Frame frs[] = new Frame[asts.length]; // Input frame
  final byte[] types = fr.types(); // Column types
  final int ncols = fr.numCols(); // NOTE(review): unused below; kept as-is.
  final long[] espc = new long[nchks + 1]; // Compute a new layout!
  int coffset = 0;
  for (int i = 1; i < asts.length; i++) {
    Val val = vals[i]; // Save values computed for pass 2
    Frame fr0 =
        val.isFrame()
            ? val.getFrame()
            // Scalar: auto-expand into a 1-row frame
            : stk.track(new Frame(fr._names, Vec.makeCons(val.getNum(), 1L, fr.numCols())));
    // Check that all frames are compatible
    if (fr.numCols() != fr0.numCols())
      throw new IllegalArgumentException(
          "rbind frames must have all the same columns, found "
              + fr.numCols()
              + " and "
              + fr0.numCols()
              + " columns.");
    if (!Arrays.deepEquals(fr._names, fr0._names))
      throw new IllegalArgumentException(
          "rbind frames must have all the same column names, found "
              + Arrays.toString(fr._names)
              + " and "
              + Arrays.toString(fr0._names));
    if (!Arrays.equals(types, fr0.types()))
      throw new IllegalArgumentException(
          "rbind frames must have all the same column types, found "
              + Arrays.toString(types)
              + " and "
              + Arrays.toString(fr0.types()));
    frs[i] = fr0; // Save frame
    // Roll up the ESPC row counts
    long roffset = espc[coffset];
    long[] espc2 = fr0.anyVec().espc();
    for (int j = 1; j < espc2.length; j++) // Roll up the row counts
    espc[coffset + j] = (roffset + espc2[j]);
    coffset += espc2.length - 1; // Chunk offset
  }
  if (zz != null) zz.remove(); // Placeholder vec no longer needed.
  // build up the new domains for each vec
  // dmap: per-column mapping from category label to its index in the unioned domain.
  // cmaps: per-column, per-input-frame remap from old category index to unioned index.
  HashMap<String, Integer>[] dmap = new HashMap[types.length];
  String[][] domains = new String[types.length][];
  int[][][] cmaps = new int[types.length][][];
  for (int k = 0; k < types.length; ++k) {
    dmap[k] = new HashMap<>();
    int c = 0; // Next free index in the unioned domain for column k.
    byte t = types[k];
    if (t == Vec.T_CAT) {
      int[][] maps = new int[frs.length][];
      for (int i = 1; i < frs.length; i++) {
        maps[i] = new int[frs[i].vec(k).domain().length];
        for (int j = 0; j < maps[i].length; j++) {
          String s = frs[i].vec(k).domain()[j];
          // First sighting of a label assigns it the next union index; repeats reuse it.
          if (!dmap[k].containsKey(s)) dmap[k].put(s, maps[i][j] = c++);
          else maps[i][j] = dmap[k].get(s);
        }
      }
      cmaps[k] = maps;
    } else {
      cmaps[k] = new int[frs.length][]; // Non-categorical: no remapping needed.
    }
    domains[k] = c == 0 ? null : new String[c];
    for (Map.Entry<String, Integer> e : dmap[k].entrySet())
      domains[k][e.getValue()] = e.getKey();
  }
  // Now make Keys for the new Vecs
  Key<Vec>[] keys = fr.anyVec().group().addVecs(fr.numCols());
  Vec[] vecs = new Vec[fr.numCols()];
  int rowLayout = Vec.ESPC.rowLayout(keys[0], espc);
  for (int i = 0; i < vecs.length; i++)
    vecs[i] = new Vec(keys[i], rowLayout, domains[i], types[i]);
  // Do the row-binds column-by-column.
  // Switch to F/J thread for continuations
  ParallelRbinds t;
  H2O.submitTask(t = new ParallelRbinds(frs, espc, vecs, cmaps)).join();
  return new ValFrame(new Frame(fr.names(), t._vecs));
}