Пример #1
0
  // --------------------------------------------------------------------------
  // Build one full layer across all K per-class trees.
  protected DHistogram[][][] buildLayer(
      final Frame fr,
      final int nbins,
      int nbins_cats,
      final DTree ktrees[],
      final int leafs[],
      final DHistogram hcs[][][],
      boolean subset,
      boolean build_tree_one_node) {
    // Grow the next generation of splits from the current histograms. The loop
    // nest is O( #active_splits * #bins * #ncols ) -- it does NOT sweep all rows.
    final Vec[] allVecs = fr.vecs();
    final ScoreBuildOneTree[] builders = new ScoreBuildOneTree[_nclass];

    // Phase 1: launch one ScoreBuildOneTree per class, all in parallel.
    for (int k = 0; k < _nclass; k++) {
      final DTree kTree = ktrees[k]; // Tree for class K; may be absent
      if (kTree == null) continue;
      // Assemble a per-tree frame holding only this tree's own (tree, work,
      // nid) columns, so the nested ScoreBuildHistogram MRTask never closes
      // another tree's Vecs while the K builds run concurrently.
      final Frame perTreeFr =
          new Frame(Arrays.copyOf(fr._names, _ncols + 1), Arrays.copyOf(allVecs, _ncols + 1));
      perTreeFr.add(fr._names[idx_tree(k)], allVecs[idx_tree(k)]);
      perTreeFr.add(fr._names[idx_work(k)], allVecs[idx_work(k)]);
      perTreeFr.add(fr._names[idx_nids(k)], allVecs[idx_nids(k)]);
      if (idx_weight() >= 0) perTreeFr.add(fr._names[idx_weight()], allVecs[idx_weight()]);
      builders[k] =
          new ScoreBuildOneTree(
              this,
              k,
              nbins,
              nbins_cats,
              kTree,
              leafs,
              hcs,
              perTreeFr,
              subset,
              build_tree_one_node,
              _improvPerVar,
              _model._parms._distribution);
      H2O.submitTask(builders[k]);
    }

    // Phase 2: block until every in-flight tree build completes, recording
    // whether any leaf anywhere split one more level.
    boolean anySplit = false;
    for (int k = 0; k < _nclass; k++) {
      if (ktrees[k] == null) continue;
      builders[k].join();
      anySplit |= builders[k]._did_split;
    }
    // The layer is done; null signals that no tree grew any further.
    return anySplit ? hcs : null;
  }
Пример #2
0
  // KMeans++ re-clustering
  /**
   * Chooses {@code N} cluster centers out of {@code points}, seeded with the first point.
   *
   * <p>Strategies: {@code Random} keeps only the seed; {@code PlusPlus} is k-means++ sampling
   * (next center drawn with probability proportional to squared distance from the nearest chosen
   * center); {@code Furthest} deterministically takes the point furthest from all chosen centers.
   *
   * @param points candidate points (not modified; chosen rows are aliased into the result)
   * @param rand randomness source for the PlusPlus draw
   * @param N number of centers to produce
   * @param isCats per-column categorical domains, forwarded to {@code minSqr}
   * @return array of N chosen centers (rows shared with {@code points})
   */
  private static double[][] recluster(
      double[][] points, Random rand, int N, Initialization init, String[][] isCats) {
    double[][] res = new double[N][];
    res[0] = points[0];
    int count = 1;
    ClusterDist cd = new ClusterDist();
    switch (init) {
      case Random:
        break;
      case PlusPlus:
        { // k-means++
          while (count < res.length) {
            // PERF: cache each point's min squared distance for this round, so
            // minSqr is evaluated once per point instead of twice (the original
            // recomputed it in the selection pass below). Distances only depend
            // on res[0..count), which is fixed within a round, so the cached
            // values and the random draw sequence are identical to before.
            double[] sqrs = new double[points.length];
            double sum = 0;
            for (int i = 0; i < points.length; i++) {
              sqrs[i] = minSqr(res, points[i], isCats, cd, count);
              sum += sqrs[i];
            }
            for (int i = 0; i < points.length; i++) {
              if (sqrs[i] >= rand.nextDouble() * sum) {
                res[count++] = points[i];
                break;
              }
            }
            // NOTE: a round may select nothing (every draw failed); the while
            // loop simply retries with fresh random draws.
          }
          break;
        }
      case Furthest:
        { // Takes cluster center furthest from any already chosen ones
          while (count < res.length) {
            double max = 0;
            int index = 0;
            for (int i = 0; i < points.length; i++) {
              double sqr = minSqr(res, points[i], isCats, cd, count);
              if (sqr > max) {
                max = sqr;
                index = i;
              }
            }
            res[count++] = points[index];
          }
          break;
        }
      default:
        throw H2O.fail();
    }
    return res;
  }
Пример #3
0
 /**
  * Optionally rebalances {@code original_fr} into the desired number of chunks for better
  * parallelism (or exactly 1 chunk when reproducibility is enforced).
  *
  * @param original_fr frame to rebalance; may be null
  * @param local whether the data is node-local (forwarded to {@code desiredChunks})
  * @param name key name of the frame; its last 5 characters label log messages
  * @return the original frame when no rebalancing is needed/requested, otherwise the rebalanced
  *     copy (tracked in the current Scope)
  */
 @Override
 protected Frame rebalance(final Frame original_fr, boolean local, final String name) {
   if (original_fr == null) return null;
   if (_parms._force_load_balance) {
     int original_chunks = original_fr.anyVec().nChunks();
     // BUG FIX: the original called name.substring(name.length() - 5) directly
     // (twice), which throws StringIndexOutOfBoundsException for names shorter
     // than 5 characters. Compute the display label once, with a guard.
     final String label = name.length() > 5 ? name.substring(name.length() - 5) : name;
     _job.update(0, "Load balancing " + label + " data...");
     int chunks = desiredChunks(original_fr, local);
     if (!_parms._reproducible) {
       if (original_chunks >= chunks) {
         if (!_parms._quiet_mode)
           Log.info(
               "Dataset already contains " + original_chunks + " chunks. No need to rebalance.");
         return original_fr;
       }
     } else { // reproducible mode forces a single chunk (single thread)
       assert chunks == 1;
       if (!_parms._quiet_mode)
         Log.warn("Reproducibility enforced - using only 1 thread - can be slow.");
       if (original_chunks == 1) return original_fr;
     }
     if (!_parms._quiet_mode)
       Log.info("Rebalancing " + label + " dataset into " + chunks + " chunks.");
     Key newKey = Key.make(name + ".chks" + chunks);
     RebalanceDataSet rb = new RebalanceDataSet(original_fr, newKey, chunks);
     H2O.submitTask(rb).join();
     Frame rebalanced_fr = DKV.get(newKey).get();
     Scope.track(rebalanced_fr); // ensure cleanup of the rebalanced copy
     return rebalanced_fr;
   }
   return original_fr;
 }
Пример #4
0
    /**
     * Entry point: boots an H2O cloud, waits for {@code NODES} members, runs the selected JUnit
     * test classes, and exits with 0 on success or 1 on any failure.
     *
     * @param args forwarded verbatim to {@link H2O#main}
     */
    public static void userMain(String[] args) {
      H2O.main(args);
      TestUtil.stall_till_cloudsize(NODES);

      // Raw List<Class> replaced with List<Class<?>> (no raw types; no behavior change).
      List<Class<?>> tests = new ArrayList<>();

      // Classes to test:
      // tests = JUnitRunner.all();

      // Neural Net - deprecated
      //      tests.add(NeuralNetSpiralsTest.class); //compare NeuralNet vs reference
      //      tests.add(NeuralNetIrisTest.class); //compare NeuralNet vs reference

      // Chunk tests
      //      tests.add(C0LChunkTest.class);
      //      tests.add(C0DChunkTest.class);
      //      tests.add(C1ChunkTest.class);
      //      tests.add(C1NChunkTest.class);
      //      tests.add(C1SChunkTest.class);
      //      tests.add(C2ChunkTest.class);
      //      tests.add(C2SChunkTest.class);
      //      tests.add(C4ChunkTest.class);
      //      tests.add(C4FChunkTest.class);
      //      tests.add(C4SChunkTest.class);
      //      tests.add(C8ChunkTest.class);
      //      tests.add(C8DChunkTest.class);
      //      tests.add(C16ChunkTest.class);
      //      tests.add(CBSChunkTest.class);
      //      tests.add(CX0ChunkTest.class);
      //      tests.add(CXIChunkTest.class);
      //      tests.add(CXDChunkTest.class);
      //      tests.add(VecTest.class);

      // Deep Learning tests
      //      tests.add(DeepLearningVsNeuralNet.class); //only passes for NODES=1, not clear why
      //      tests.add(DeepLearningAutoEncoderTest.class); //test Deep Learning convergence
      //      tests.add(DeepLearningAutoEncoderCategoricalTest.class); //test Deep Learning
      // convergence
      //      tests.add(DeepLearningSpiralsTest.class); //test Deep Learning convergence
      //      tests.add(DeepLearningIrisTest.Short.class); //compare Deep Learning vs reference
      //      tests.add(DeepLearningIrisTest.Long.class); //compare Deep Learning vs reference
      tests.add(DeepLearningProstateTest.Short.class); // test Deep Learning
      //      tests.add(DeepLearningMissingTest.class); //test Deep Learning
      //      tests.add(DeepLearningProstateTest.Long.class); //test Deep Learning
      //      tests.add(NeuronsTest.class); //test Deep Learning
      //      tests.add(MRUtilsTest.class); //test MR sampling/rebalancing
      //      tests.add(DropoutTest.class); //test NN Dropput

      //      tests.add(ParserTest2.class);
      //      tests.add(ParserTest2.ParseAllSmalldata.class);
      //      tests.add(KMeans2Test.class);
      //      tests.add(KMeans2RandomTest.class);
      //      tests.add(GLMRandomTest.Short.class);
      //      tests.add(SpeeDRFTest.class);
      //      tests.add(SpeeDRFTest2.class);
      ////      tests.add(GLMTest2.class);
      //      tests.add(DRFTest.class);
      //      tests.add(DRFTest2.class);
      //      tests.add(GBMTest.class);
      //      tests.add(KMeans2Test.class);
      //      tests.add(PCATest.class);
      //      tests.add(NetworkTestTest.class);

      // Uncomment this to sleep here and use the browser.
      // try { Thread.sleep(10000000); } catch (Exception _) {}

      JUnitCore junit = new JUnitCore();
      junit.addListener(new LogListener());
      Result result = junit.run(tests.toArray(new Class<?>[0]));
      if (result.getFailures().isEmpty()) {
        Log.info("SUCCESS!");
        System.exit(0);
      } else {
        Log.info("FAIL!");
        System.exit(1);
      }
    }
Пример #5
0
 /**
  * Simple GLM wrapper to enable launching GLM from command line.
  *
  * <p>Example input: java -jar target/h2o.jar -name=test -runMethod water.util.GLMRunner
  * -file=smalldata/logreg/prostate.csv -y=CAPSULE -family=binomial
  *
  * @param args command-line arguments, parsed into a {@code GLMArgs} holder
  * @throws InterruptedException declared for compatibility; interruption is logged and the
  *     thread's interrupt status restored
  */
 public static void main(String[] args) throws InterruptedException {
   try {
     GLMArgs ARGS = new GLMArgs();
     new Arguments(args).extract(ARGS);
     System.out.println("==================<GLMRunner START>===================");
     ValueArray ary = Utils.loadAndParseKey(ARGS.file);
     // Resolve the response column: numeric index first, else column name.
     int ycol;
     try {
       ycol = Integer.parseInt(ARGS.y);
     } catch (NumberFormatException e) {
       ycol = ary.getColumnIds(new String[] {ARGS.y})[0];
     }
     int ncols = ary.numCols();
     if (ycol < 0 || ycol >= ncols) {
       System.err.println("invalid y column: " + ycol);
       H2O.exit(-1);
     }
     // Resolve predictors: "all" = every column except y; otherwise a
     // comma-separated list of indices or names.
     int[] xcols;
     if (ARGS.xs.equalsIgnoreCase("all")) {
       xcols = new int[ncols - 1];
       for (int i = 0; i < ycol; ++i) xcols[i] = i;
       for (int i = ycol; i < ncols - 1; ++i) xcols[i] = i + 1;
     } else {
       System.out.println("xs = " + ARGS.xs);
       String[] names = ARGS.xs.split(",");
       xcols = new int[names.length];
       try {
         // parseInt avoids the needless boxing of Integer.valueOf.
         for (int i = 0; i < names.length; ++i) xcols[i] = Integer.parseInt(names[i]);
       } catch (NumberFormatException e) {
         xcols = ary.getColumnIds(ARGS.xs.split(","));
       }
     }
     for (int x : xcols)
       if (x < 0) {
         System.err.println("Invalid predictor specification " + ARGS.xs);
         H2O.exit(-1);
       }
     GLMJob j =
         DGLM.startGLMJob(
             DGLM.getData(ary, xcols, ycol, null, true),
             new ADMMSolver(ARGS.lambda, ARGS._alpha),
             new GLMParams(Family.valueOf(ARGS.family)),
             null,
             ARGS.xval,
             true);
     System.out.print("[GLM] computing model...");
     // Poll the job, printing one dot per percent of progress.
     int progress = 0;
     while (!j.isDone()) {
       int p = (int) (100 * j.progress());
       int dots = p - progress;
       progress = p;
       for (int i = 0; i < dots; ++i) System.out.print('.');
       Thread.sleep(250);
     }
     Log.debug(Sys.GENLM, "DONE.");
     GLMModel m = j.get();
     String[] colnames = ary.colNames();
     // BUG FIX: the intercept is the LAST beta, at index xcols.length. The old
     // m._beta[ncols - 1] was only correct when xs == "all"; for a predictor
     // subset it read the wrong element (or past the end of beta).
     System.out.println("Intercept" + " = " + m._beta[xcols.length]);
     for (int i = 0; i < xcols.length; ++i) {
       // BUG FIX: map each coefficient back to its source column via xcols.
       // Plain colnames[i] mislabeled coefficients whenever the predictors
       // were not exactly columns 0..n-1.
       System.out.println(colnames[xcols[i]] + " = " + m._beta[i]);
     }
   } catch (InterruptedException ie) {
     Thread.currentThread().interrupt(); // restore interrupt status, don't swallow it
     Log.err(ie);
   } catch (Throwable t) {
     Log.err(t);
   } finally { // we're done. shutdown the cloud
     Log.debug(Sys.GENLM, "==================<GLMRunner DONE>===================");
     UDPRebooted.suicide(UDPRebooted.T.shutdown, H2O.SELF);
   }
 }
Пример #6
0
 @Override
 protected double[] score0(double[] data, double[] preds) {
   // Scoring is not implemented for this model; fail fast on any attempt.
   throw H2O.unimpl();
 }
 /** Schema is not implemented yet; fails fast like {@code score0} above. */
 public ModelBuilderSchema schema() {
   // BUG FIX: the original invoked H2O.unimpl() as a bare statement, which
   // constructs the exception, discards it, and then returns null -- silently
   // handing callers a null schema. Throw it instead (matching the
   // `throw H2O.unimpl()` idiom used by score0 in this file).
   throw H2O.unimpl();
   //  return new CoxPHV2();
 }