private static void addFolder2(FileSystem fs, Path p, ArrayList<String> keys, ArrayList<String> failed) {
  try {
    if (fs == null) return;
    Futures futures = new Futures();
    for (FileStatus file : fs.listStatus(p)) {
      Path pfs = file.getPath();
      if (file.isDir()) {
        addFolder2(fs, pfs, keys, failed);
      } else {
        if (pfs.getName().endsWith(Extensions.JSON)) {
          throw H2O.unimpl();
        } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
          throw H2O.unimpl();
        } else {
          Key k = HdfsFileVec.make(file, futures);
          keys.add(k.toString());
          Log.info("PersistHdfs: DKV.put(" + k + ")");
        }
      }
    }
  } catch (Exception e) {
    Log.err(e);
    failed.add(p.toString());
  }
}
private static void ignoreAndWait(final Exception e, boolean printException) {
  H2O.ignore(e, "Hit HDFS reset problem, retrying...", printException);
  try {
    Thread.sleep(500);
  } catch (InterruptedException ie) {
    // Ignore the interruption; the caller retries the HDFS operation.
  }
}
/**
 * On-the-fly version for varimp. After generating a new tree, its tree votes are collected on
 * shuffled OOB rows and variable importance is recomputed.
 *
 * <p>The <a
 * href="http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp">page</a> says:
 * <cite>"In every tree grown in the forest, put down the oob cases and count the number of votes
 * cast for the correct class. Now randomly permute the values of variable m in the oob cases and
 * put these cases down the tree. Subtract the number of votes for the correct class in the
 * variable-m-permuted oob data from the number of votes for the correct class in the untouched
 * oob data. The average of this number over all trees in the forest is the raw importance score
 * for variable m."</cite>
 */
@Override
protected VarImp doVarImpCalc(
    final DRFModel model, DTree[] ktrees, final int tid, final Frame fTrain, boolean scale) {
  // Check if we have already serialized 'ktrees'-trees in the model
  assert model.ntrees() - 1 == tid
      : "Cannot compute DRF varimp since 'ktrees' are not serialized in the model! tid=" + tid;
  assert _treeMeasuresOnOOB.npredictors() - 1 == tid
      : "Tree votes over OOB rows for this tree (var ktrees) were not found!";
  // Compute tree votes over shuffled data
  final CompressedTree[/*nclass*/] theTree = model.ctree(tid); // get the last tree FIXME we should pass only keys
  final int nclasses = model.nclasses();
  Futures fs = new Futures();
  for (int var = 0; var < _ncols; var++) {
    final int variable = var;
    H2OCountedCompleter task4var =
        classification
            ? new H2OCountedCompleter() {
              @Override
              public void compute2() {
                // Compute this tree's votes over all data with the given variable shuffled
                TreeVotes cd =
                    TreeMeasuresCollector.collectVotes(
                        theTree, nclasses, fTrain, _ncols, sample_rate, variable);
                assert cd.npredictors() == 1;
                asVotes(_treeMeasuresOnSOOB[variable]).append(cd);
                tryComplete();
              }
            }
            : /* regression */ new H2OCountedCompleter() {
              @Override
              public void compute2() {
                // Compute this tree's SSE over all data with the given variable shuffled
                TreeSSE cd =
                    TreeMeasuresCollector.collectSSE(
                        theTree, nclasses, fTrain, _ncols, sample_rate, variable);
                assert cd.npredictors() == 1;
                asSSE(_treeMeasuresOnSOOB[variable]).append(cd);
                tryComplete();
              }
            };
    H2O.submitTask(task4var); // Fork computation
    fs.add(task4var);
  }
  fs.blockForPending(); // Wait for results
  // Compute varimp for individual features (_ncols)
  final float[] varimp = new float[_ncols];   // output variable importance
  final float[] varimpSD = new float[_ncols]; // output variable importance sd
  for (int var = 0; var < _ncols; var++) {
    double[/*2*/] imp =
        classification
            ? asVotes(_treeMeasuresOnSOOB[var]).imp(asVotes(_treeMeasuresOnOOB))
            : asSSE(_treeMeasuresOnSOOB[var]).imp(asSSE(_treeMeasuresOnOOB));
    varimp[var] = (float) imp[0];
    varimpSD[var] = (float) imp[1];
  }
  return new VarImp.VarImpMDA(varimp, varimpSD, model.ntrees());
}
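// The quoted procedure is a small amount of arithmetic per variable. A minimal standalone sketch
// (hypothetical names, no H2O types) of the raw importance score and its spread across trees:
// subtract the correct votes on variable-m-permuted OOB data from the correct votes on untouched
// OOB data, then average over trees.
static double[] rawImportanceSketch(int[] votesPlain, int[] votesShuffled) {
  int ntrees = votesPlain.length;
  double sum = 0, sumSq = 0;
  for (int t = 0; t < ntrees; t++) {
    double diff = votesPlain[t] - votesShuffled[t]; // per-tree drop in correct OOB votes
    sum += diff;
    sumSq += diff * diff;
  }
  double mean = sum / ntrees;                          // raw importance score for variable m
  double sd = Math.sqrt(sumSq / ntrees - mean * mean); // spread of the per-tree scores
  return new double[] {mean, sd};
}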
// --------------------------------------------------------------------------
// Build an entire layer of all K trees
protected DHistogram[][][] buildLayer(
    final Frame fr, final int nbins, int nbins_cats, final DTree ktrees[], final int leafs[],
    final DHistogram hcs[][][], boolean subset, boolean build_tree_one_node) {
  // Build K trees, one per class.

  // Build up the next-generation tree splits from the current histograms.
  // Nearly all leaves will split one more level. This loop nest is
  //   O( #active_splits * #bins * #ncols )
  // but is NOT over all the data.
  ScoreBuildOneTree sb1ts[] = new ScoreBuildOneTree[_nclass];
  Vec vecs[] = fr.vecs();
  for (int k = 0; k < _nclass; k++) {
    final DTree tree = ktrees[k]; // Tree for class K
    if (tree == null) continue;
    // Build a frame with just a single tree (& work & nid) columns, so the
    // nested MRTask ScoreBuildHistogram in ScoreBuildOneTree does not try
    // to close other trees' Vecs when run in parallel.
    Frame fr2 = new Frame(Arrays.copyOf(fr._names, _ncols + 1), Arrays.copyOf(vecs, _ncols + 1));
    fr2.add(fr._names[idx_tree(k)], vecs[idx_tree(k)]);
    fr2.add(fr._names[idx_work(k)], vecs[idx_work(k)]);
    fr2.add(fr._names[idx_nids(k)], vecs[idx_nids(k)]);
    if (idx_weight() >= 0) fr2.add(fr._names[idx_weight()], vecs[idx_weight()]);
    // Start building one of the K trees in parallel
    H2O.submitTask(
        sb1ts[k] =
            new ScoreBuildOneTree(
                this, k, nbins, nbins_cats, tree, leafs, hcs, fr2, subset, build_tree_one_node,
                _improvPerVar, _model._parms._distribution));
  }
  // Block for all K trees to complete.
  boolean did_split = false;
  for (int k = 0; k < _nclass; k++) {
    final DTree tree = ktrees[k]; // Tree for class K
    if (tree == null) continue;
    sb1ts[k].join();
    if (sb1ts[k]._did_split) did_split = true;
  }
  // The layer is done.
  return did_split ? hcs : null;
}
// KMeans++ re-clustering
private static double[][] recluster(
    double[][] points, Random rand, int N, Initialization init, String[][] isCats) {
  double[][] res = new double[N][];
  res[0] = points[0];
  int count = 1;
  ClusterDist cd = new ClusterDist();
  switch (init) {
    case Random:
      break;
    case PlusPlus: { // k-means++
      while (count < res.length) {
        double sum = 0;
        for (double[] point : points) sum += minSqr(res, point, isCats, cd, count);
        for (double[] point : points) {
          if (minSqr(res, point, isCats, cd, count) >= rand.nextDouble() * sum) {
            res[count++] = point;
            break;
          }
        }
      }
      break;
    }
    case Furthest: { // Take the cluster center furthest from any already chosen ones
      while (count < res.length) {
        double max = 0;
        int index = 0;
        for (int i = 0; i < points.length; i++) {
          double sqr = minSqr(res, points[i], isCats, cd, count);
          if (sqr > max) {
            max = sqr;
            index = i;
          }
        }
        res[count++] = points[index];
      }
      break;
    }
    default:
      throw H2O.fail();
  }
  return res;
}
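// For comparison, a minimal standalone sketch of the textbook D^2-weighted draw that k-means++
// initialization is based on (hypothetical names, plain Euclidean distance, no categorical
// handling, assumes java.util.Random). The PlusPlus branch above is a cheaper variant of the
// same idea: it picks a point whose squared distance clears a random threshold of the total.
static double[] nextCenterSketch(double[][] points, double[][] centers, int count, Random rand) {
  double[] d2 = new double[points.length];
  double sum = 0;
  for (int i = 0; i < points.length; i++) {
    double min = Double.MAX_VALUE;
    for (int c = 0; c < count; c++) min = Math.min(min, sqrDistSketch(points[i], centers[c]));
    d2[i] = min; // squared distance to the nearest already-chosen center
    sum += min;
  }
  double r = rand.nextDouble() * sum; // walk the cumulative D^2 mass
  for (int i = 0; i < points.length; i++) {
    r -= d2[i];
    if (r <= 0) return points[i];     // selected with probability proportional to d2[i]
  }
  return points[points.length - 1];   // numeric fallback for rounding at the end
}

static double sqrDistSketch(double[] a, double[] b) {
  double s = 0;
  for (int i = 0; i < a.length; i++) { double d = a[i] - b[i]; s += d * d; }
  return s;
}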
public static Job run(final Key dest, final KMeansModel model, final ValueArray ary) {
  final ChunkProgressJob job = new ChunkProgressJob(ary.chunks(), dest);
  new ValueArray(dest, 0).delete_and_lock(job.self());
  final H2OCountedCompleter fjtask =
      new H2OCountedCompleter() {
        @Override
        public void compute2() {
          KMeansApply kms = new KMeansApply();
          kms._job = job;
          kms._arykey = ary._key;
          kms._cols = model.columnMapping(ary.colNames());
          kms._clusters = model._clusters;
          kms._normalized = model._normalized;
          kms.invoke(ary._key);
          // Describe the single output column holding the per-row cluster assignment
          Column c = new Column();
          c._name = Constants.RESPONSE;
          c._size = ROW_SIZE;
          c._scale = 1;
          c._min = 0;
          c._max = model._clusters.length;
          c._mean = Double.NaN;
          c._sigma = Double.NaN;
          c._domain = null;
          c._n = ary.numRows();
          ValueArray res = new ValueArray(dest, ary.numRows(), c._size, new Column[] {c});
          res.unlock(job.self());
          job.remove();
          tryComplete();
        }

        @Override
        public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
          job.onException(ex);
          return super.onExceptionalCompletion(ex, caller);
        }
      };
  job.start(fjtask);
  H2O.submitTask(fjtask);
  return job;
}
private void cancel(final String msg, JobState resultingState) {
  if (resultingState == JobState.CANCELLED) {
    Log.info("Job " + self() + "(" + description + ") was cancelled.");
  } else {
    Log.err("Job " + self() + "(" + description + ") failed.");
    Log.err(msg);
  }
  exception = msg;
  state = resultingState;
  // replace finished job by a job handle
  replaceByJobHandle();
  DKV.write_barrier();
  final Job job = this;
  H2O.submitTask(
      new H2OCountedCompleter() {
        @Override
        public void compute2() {
          job.onCancelled();
        }
      });
}
/**
 * Forks computation of this job.
 *
 * <p>The call does not block.
 *
 * @return always returns this job.
 */
public Job fork() {
  init();
  H2OCountedCompleter task =
      new H2OCountedCompleter() {
        @Override
        public void compute2() {
          try {
            try {
              // Exec always waits till the end of computation
              Job.this.exec();
              Job.this.remove();
            } catch (Throwable t) {
              if (!(t instanceof ExpectedExceptionForDebug)) Log.err(t);
              Job.this.cancel(t);
            }
          } finally {
            tryComplete();
          }
        }
      };
  start(task);
  H2O.submitTask(task);
  return this;
}
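// The same fork-then-block pattern recurs in buildLayer and doVarImpCalc above. A minimal sketch,
// assuming the surrounding file's imports (Futures, H2O, H2OCountedCompleter); the work done
// inside compute2() is a hypothetical placeholder.
static void forkAndWaitSketch(final Runnable[] slices) {
  Futures fs = new Futures();
  for (final Runnable slice : slices) {
    H2OCountedCompleter task =
        new H2OCountedCompleter() {
          @Override
          public void compute2() {
            slice.run();   // one slice of work (placeholder)
            tryComplete(); // mark this task as done
          }
        };
    H2O.submitTask(task); // fork: returns immediately
    fs.add(task);         // remember it so we can block on it
  }
  fs.blockForPending();   // join: wait for every forked task to finish
}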
/** Single row scoring, on a compatible ValueArray (when pushed through the mapping) */
@Override
protected double score0(ValueArray data, int row) {
  throw H2O.unimpl();
}
/** Bulk scoring API, on a compatible ValueArray (when pushed through the mapping) */
@Override
protected double score0(ValueArray data, AutoBuffer ab, int row_in_chunk) {
  throw H2O.unimpl();
}
@Override
protected double[] score0(double[] data, double[] preds) {
  throw H2O.unimpl();
}
protected Response serve_debug() {
  throw H2O.unimpl();
}
public ModelBuilderSchema schema() {
  throw H2O.unimpl(); // return new CoxPHV2();
}
/**
 * Cross-validate this Job (to be overridden for each instance, which also calls
 * genericCrossValidation).
 *
 * @param splits Frames containing train/test splits
 * @param cv_preds Store the predictions for each cross-validation run
 * @param offsets Array to store the offsets of starting row indices for each cross-validation
 *     run
 * @param i Which fold of cross-validation to perform
 */
public void crossValidate(Frame[] splits, Frame[] cv_preds, long[] offsets, int i) {
  throw H2O.unimpl();
}
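// As a rough illustration of the offsets parameter, a plain-Java sketch (hypothetical names,
// no H2O types) of k-fold bookkeeping: round-robin fold sizes and the starting row offset at
// which each fold's predictions would be stored.
static long[] foldOffsetsSketch(long nrows, int nfolds) {
  long[] offsets = new long[nfolds];
  long start = 0;
  for (int fold = 0; fold < nfolds; fold++) {
    offsets[fold] = start; // first row index of this fold's predictions
    start += holdoutSizeSketch(nrows, nfolds, fold);
  }
  return offsets;
}

static long holdoutSizeSketch(long nrows, int nfolds, int fold) {
  // With round-robin assignment, rows fold, fold+nfolds, fold+2*nfolds, ... are held out.
  return nrows / nfolds + (fold < nrows % nfolds ? 1 : 0);
}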