// --------------------------------------------------------------------------
// Build an entire layer of all K trees
protected DHistogram[][][] buildLayer(
    final Frame fr,
    final int nbins,
    int nbins_cats,
    final DTree[] ktrees,
    final int[] leafs,
    final DHistogram[][][] hcs,
    boolean subset,
    boolean build_tree_one_node) {
  // Build K trees, one per class.

  // Build up the next-generation tree splits from the current histograms.
  // Nearly all leaves will split one more level. This loop nest is
  //   O( #active_splits * #bins * #ncols )
  // but is NOT over all the data.
  ScoreBuildOneTree[] sb1ts = new ScoreBuildOneTree[_nclass];
  Vec[] vecs = fr.vecs();
  for (int k = 0; k < _nclass; k++) {
    final DTree tree = ktrees[k]; // Tree for class K
    if (tree == null) continue;
    // Build a frame with just a single tree (& work & nid) columns, so the
    // nested MRTask ScoreBuildHistogram in ScoreBuildOneTree does not try
    // to close other trees' Vecs when run in parallel.
    Frame fr2 = new Frame(Arrays.copyOf(fr._names, _ncols + 1), Arrays.copyOf(vecs, _ncols + 1));
    fr2.add(fr._names[idx_tree(k)], vecs[idx_tree(k)]);
    fr2.add(fr._names[idx_work(k)], vecs[idx_work(k)]);
    fr2.add(fr._names[idx_nids(k)], vecs[idx_nids(k)]);
    if (idx_weight() >= 0)
      fr2.add(fr._names[idx_weight()], vecs[idx_weight()]);
    // Start building one of the K trees in parallel
    H2O.submitTask(sb1ts[k] = new ScoreBuildOneTree(
        this, k, nbins, nbins_cats, tree, leafs, hcs, fr2,
        subset, build_tree_one_node, _improvPerVar, _model._parms._distribution));
  }
  // Block for all K trees to complete.
  boolean did_split = false;
  for (int k = 0; k < _nclass; k++) {
    final DTree tree = ktrees[k]; // Tree for class K
    if (tree == null) continue;
    sb1ts[k].join();
    if (sb1ts[k]._did_split) did_split = true;
  }
  // The layer is done.
  return did_split ? hcs : null;
}
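// A minimal driver sketch (an assumption for illustration, not H2O's actual
// training loop): it shows how buildLayer() would typically be iterated one
// tree level at a time until no leaf splits any more, at which point
// buildLayer() returns null. The initialHistograms() helper and the maxDepth
// parameter below are hypothetical.
private void buildAllLayers(Frame fr, int nbins, int nbins_cats,
                            DTree[] ktrees, int[] leafs, int maxDepth,
                            boolean subset, boolean build_tree_one_node) {
  DHistogram[][][] hcs = initialHistograms(ktrees); // hypothetical helper: root-level histograms
  for (int depth = 0; depth < maxDepth; depth++) {
    hcs = buildLayer(fr, nbins, nbins_cats, ktrees, leafs, hcs, subset, build_tree_one_node);
    if (hcs == null) break; // no leaf split at this level: the trees are done
  }
}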
// KMeans++ re-clustering
private static double[][] recluster(
    double[][] points, Random rand, int N, Initialization init, String[][] isCats) {
  double[][] res = new double[N][];
  res[0] = points[0];
  int count = 1;
  ClusterDist cd = new ClusterDist();
  switch (init) {
    case Random:
      break;
    case PlusPlus: { // k-means++
      while (count < res.length) {
        double sum = 0;
        for (double[] point : points) sum += minSqr(res, point, isCats, cd, count);
        for (double[] point : points) {
          if (minSqr(res, point, isCats, cd, count) >= rand.nextDouble() * sum) {
            res[count++] = point;
            break;
          }
        }
      }
      break;
    }
    case Furthest: { // Pick the cluster center farthest from all centers already chosen
      while (count < res.length) {
        double max = 0;
        int index = 0;
        for (int i = 0; i < points.length; i++) {
          double sqr = minSqr(res, points[i], isCats, cd, count);
          if (sqr > max) {
            max = sqr;
            index = i;
          }
        }
        res[count++] = points[index];
      }
      break;
    }
    default:
      throw H2O.fail();
  }
  return res;
}
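// For reference, a self-contained sketch of textbook k-means++ seeding. This
// is a simplified standalone version (plain Euclidean distance, no categorical
// handling, names like KMeansPlusPlusSketch are hypothetical), not the H2O
// implementation above: the textbook form samples each new center with
// probability exactly proportional to its squared distance from the nearest
// chosen center, via a single cumulative-sum walk.
final class KMeansPlusPlusSketch {
  static double sqrDist(double[] a, double[] b) {
    double s = 0;
    for (int i = 0; i < a.length; i++) { double d = a[i] - b[i]; s += d * d; }
    return s;
  }

  // Squared distance from p to the nearest of the first `count` centers.
  static double minSqr(double[][] centers, double[] p, int count) {
    double best = Double.MAX_VALUE;
    for (int c = 0; c < count; c++) best = Math.min(best, sqrDist(centers[c], p));
    return best;
  }

  static double[][] seed(double[][] points, int k, java.util.Random rand) {
    double[][] centers = new double[k][];
    centers[0] = points[rand.nextInt(points.length)]; // first center: uniform at random
    for (int count = 1; count < k; count++) {
      double sum = 0;
      for (double[] p : points) sum += minSqr(centers, p, count);
      // Sample a point with probability proportional to its squared distance
      // from the nearest already-chosen center.
      double r = rand.nextDouble() * sum;
      double acc = 0;
      int chosen = points.length - 1; // fallback for floating-point round-off
      for (int i = 0; i < points.length; i++) {
        acc += minSqr(centers, points[i], count);
        if (acc >= r) { chosen = i; break; }
      }
      centers[count] = points[chosen];
    }
    return centers;
  }
}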
@Override
protected Frame rebalance(final Frame original_fr, boolean local, final String name) {
  if (original_fr == null) return null;
  if (_parms._force_load_balance) {
    int original_chunks = original_fr.anyVec().nChunks();
    _job.update(0, "Load balancing " + name.substring(name.length() - 5) + " data...");
    int chunks = desiredChunks(original_fr, local);
    if (!_parms._reproducible) {
      if (original_chunks >= chunks) {
        if (!_parms._quiet_mode)
          Log.info("Dataset already contains " + original_chunks + " chunks. No need to rebalance.");
        return original_fr;
      }
    } else { // reproducible, set chunks to 1
      assert chunks == 1;
      if (!_parms._quiet_mode)
        Log.warn("Reproducibility enforced - using only 1 thread - can be slow.");
      if (original_chunks == 1) return original_fr;
    }
    if (!_parms._quiet_mode)
      Log.info("Rebalancing " + name.substring(name.length() - 5) + " dataset into " + chunks + " chunks.");
    Key newKey = Key.make(name + ".chks" + chunks);
    RebalanceDataSet rb = new RebalanceDataSet(original_fr, newKey, chunks);
    H2O.submitTask(rb).join();
    Frame rebalanced_fr = DKV.get(newKey).get();
    Scope.track(rebalanced_fr);
    return rebalanced_fr;
  }
  return original_fr;
}
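// A standalone sketch of the rebalancing decision above (a hypothetical
// helper for illustration, not part of H2O): given the current and desired
// chunk counts, return the chunk count the frame should end up with, or -1
// to mean "keep the frame as-is".
static int targetChunks(int originalChunks, int desiredChunks, boolean reproducible) {
  if (reproducible) {
    // Reproducible mode forces a single chunk (single-threaded, deterministic order).
    return originalChunks == 1 ? -1 : 1;
  }
  // Otherwise only rebalance upward: splitting into more chunks adds
  // parallelism, while merging already-parallel data buys nothing here.
  return originalChunks >= desiredChunks ? -1 : desiredChunks;
}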
public static void userMain(String[] args) {
  H2O.main(args);
  TestUtil.stall_till_cloudsize(NODES);
  List<Class> tests = new ArrayList<Class>();
  // Classes to test:
  // tests = JUnitRunner.all();

  // Neural Net - deprecated
  // tests.add(NeuralNetSpiralsTest.class); // compare NeuralNet vs reference
  // tests.add(NeuralNetIrisTest.class); // compare NeuralNet vs reference

  // Chunk tests
  // tests.add(C0LChunkTest.class);
  // tests.add(C0DChunkTest.class);
  // tests.add(C1ChunkTest.class);
  // tests.add(C1NChunkTest.class);
  // tests.add(C1SChunkTest.class);
  // tests.add(C2ChunkTest.class);
  // tests.add(C2SChunkTest.class);
  // tests.add(C4ChunkTest.class);
  // tests.add(C4FChunkTest.class);
  // tests.add(C4SChunkTest.class);
  // tests.add(C8ChunkTest.class);
  // tests.add(C8DChunkTest.class);
  // tests.add(C16ChunkTest.class);
  // tests.add(CBSChunkTest.class);
  // tests.add(CX0ChunkTest.class);
  // tests.add(CXIChunkTest.class);
  // tests.add(CXDChunkTest.class);
  // tests.add(VecTest.class);

  // Deep Learning tests
  // tests.add(DeepLearningVsNeuralNet.class); // only passes for NODES=1, not clear why
  // tests.add(DeepLearningAutoEncoderTest.class); // test Deep Learning convergence
  // tests.add(DeepLearningAutoEncoderCategoricalTest.class); // test Deep Learning convergence
  // tests.add(DeepLearningSpiralsTest.class); // test Deep Learning convergence
  // tests.add(DeepLearningIrisTest.Short.class); // compare Deep Learning vs reference
  // tests.add(DeepLearningIrisTest.Long.class); // compare Deep Learning vs reference
  tests.add(DeepLearningProstateTest.Short.class); // test Deep Learning
  // tests.add(DeepLearningMissingTest.class); // test Deep Learning
  // tests.add(DeepLearningProstateTest.Long.class); // test Deep Learning
  // tests.add(NeuronsTest.class); // test Deep Learning
  // tests.add(MRUtilsTest.class); // test MR sampling/rebalancing
  // tests.add(DropoutTest.class); // test NN Dropout

  // tests.add(ParserTest2.class);
  // tests.add(ParserTest2.ParseAllSmalldata.class);
  // tests.add(KMeans2Test.class);
  // tests.add(KMeans2RandomTest.class);
  // tests.add(GLMRandomTest.Short.class);
  // tests.add(SpeeDRFTest.class);
  // tests.add(SpeeDRFTest2.class);
  // tests.add(GLMTest2.class);
  // tests.add(DRFTest.class);
  // tests.add(DRFTest2.class);
  // tests.add(GBMTest.class);
  // tests.add(KMeans2Test.class);
  // tests.add(PCATest.class);
  // tests.add(NetworkTestTest.class);

  // Uncomment this to sleep here and use the browser.
  // try { Thread.sleep(10000000); } catch (Exception ignore) {}

  JUnitCore junit = new JUnitCore();
  junit.addListener(new LogListener());
  Result result = junit.run(tests.toArray(new Class[0]));
  if (result.getFailures().size() == 0) {
    Log.info("SUCCESS!");
    System.exit(0);
  } else {
    Log.info("FAIL!");
    System.exit(1);
  }
}
/**
 * Simple GLM wrapper to enable launching GLM from the command line.
 *
 * <p>Example input:
 *   java -jar target/h2o.jar -name=test -runMethod water.util.GLMRunner
 *        -file=smalldata/logreg/prostate.csv -y=CAPSULE -family=binomial
 *
 * @param args command-line arguments, parsed into {@link GLMArgs}
 * @throws InterruptedException
 */
public static void main(String[] args) throws InterruptedException {
  try {
    GLMArgs ARGS = new GLMArgs();
    new Arguments(args).extract(ARGS);
    System.out.println("==================<GLMRunner START>===================");
    ValueArray ary = Utils.loadAndParseKey(ARGS.file);
    int ycol;
    try {
      ycol = Integer.parseInt(ARGS.y);
    } catch (NumberFormatException e) {
      // Not a numeric index: resolve the response column by name.
      ycol = ary.getColumnIds(new String[] {ARGS.y})[0];
    }
    int ncols = ary.numCols();
    if (ycol < 0 || ycol >= ary.numCols()) {
      System.err.println("Invalid y column: " + ycol);
      H2O.exit(-1);
    }
    int[] xcols;
    if (ARGS.xs.equalsIgnoreCase("all")) {
      // Use every column except the response as a predictor.
      xcols = new int[ncols - 1];
      for (int i = 0; i < ycol; ++i) xcols[i] = i;
      for (int i = ycol; i < ncols - 1; ++i) xcols[i] = i + 1;
    } else {
      System.out.println("xs = " + ARGS.xs);
      String[] names = ARGS.xs.split(",");
      xcols = new int[names.length];
      try {
        for (int i = 0; i < names.length; ++i) xcols[i] = Integer.parseInt(names[i]);
      } catch (NumberFormatException e) {
        // Not numeric indexes: resolve the predictor columns by name.
        xcols = ary.getColumnIds(ARGS.xs.split(","));
      }
    }
    for (int x : xcols)
      if (x < 0) {
        System.err.println("Invalid predictor specification " + ARGS.xs);
        H2O.exit(-1);
      }
    GLMJob j = DGLM.startGLMJob(
        DGLM.getData(ary, xcols, ycol, null, true),
        new ADMMSolver(ARGS.lambda, ARGS._alpha),
        new GLMParams(Family.valueOf(ARGS.family)),
        null,
        ARGS.xval,
        true);
    System.out.print("[GLM] computing model...");
    int progress = 0;
    while (!j.isDone()) {
      int p = (int) (100 * j.progress());
      int dots = p - progress;
      progress = p;
      for (int i = 0; i < dots; ++i) System.out.print('.');
      Thread.sleep(250);
    }
    Log.debug(Sys.GENLM, "DONE.");
    GLMModel m = j.get();
    String[] colnames = ary.colNames();
    System.out.println("Intercept = " + m._beta[ncols - 1]);
    for (int i = 0; i < xcols.length; ++i)
      // Label each coefficient with the name of its source column; beta[i]
      // corresponds to xcols[i], not to column i of the original frame.
      System.out.println(colnames[xcols[i]] + " = " + m._beta[i]);
  } catch (Throwable t) {
    Log.err(t);
  } finally {
    // We're done; shut down the cloud.
    Log.debug(Sys.GENLM, "==================<GLMRunner DONE>===================");
    UDPRebooted.suicide(UDPRebooted.T.shutdown, H2O.SELF);
  }
}
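// A tiny standalone illustration (hypothetical helper, not part of GLMRunner)
// of the "all columns except the response" predictor-index construction used
// above: for ncols = 5 and ycol = 2 it yields {0, 1, 3, 4}.
static int[] predictorCols(int ncols, int ycol) {
  int[] xcols = new int[ncols - 1];
  for (int i = 0; i < ycol; ++i) xcols[i] = i;             // columns before the response
  for (int i = ycol; i < ncols - 1; ++i) xcols[i] = i + 1; // skip over the response column
  return xcols;
}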
@Override
protected double[] score0(double[] data, double[] preds) {
  throw H2O.unimpl();
}
public ModelBuilderSchema schema() {
  // H2O.unimpl() returns the exception; it must be thrown, not just called,
  // otherwise the call is a silent no-op and the method returns normally.
  throw H2O.unimpl(); // TODO: return new CoxPHV2();
}