Example #1
  public final Vec[] vecs() {
    if (_vecs != null) return _vecs;
    // Load all Vec headers in parallel by spawning one F/J task per key.
    final Vec[] vecs = new Vec[_keys.length];
    Futures fs = new Futures();
    for (int i = 0; i < _keys.length; i++) {
      final int ii = i;
      final Key k = _keys[i];
      H2OCountedCompleter t =
          new H2OCountedCompleter() {
            // We need a higher priority here because there is a danger of
            // deadlock when many MRTask2 calls arrive at once (e.g. a frame
            // with many vectors invokes rollup tasks for all vectors in
            // parallel).  Should probably be done in CPS style in the future.
            @Override
            public byte priority() {
              return H2O.MIN_HI_PRIORITY;
            }

            @Override
            public void compute2() {
              vecs[ii] = DKV.get(k).get();
              tryComplete();
            }
          };
      H2O.submitTask(t);
      fs.add(t);
    }
    fs.blockForPending();
    return _vecs = vecs;
  }
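
For reference, the fork/register/block pattern above (one H2OCountedCompleter per key, collected in a Futures, then blockForPending()) maps directly onto the plain-JDK java.util.concurrent.CountedCompleter that H2OCountedCompleter wraps. A minimal standalone sketch of the same shape, with the loader body as a stand-in for DKV.get(k).get():

import java.util.concurrent.CountedCompleter;
import java.util.concurrent.ForkJoinPool;
import java.util.function.IntFunction;

public class ParallelLoad {
  // Fill out[i] = load.apply(i) for all i in parallel; block until all are done.
  static <T> void loadAll(T[] out, IntFunction<T> load) {
    CountedCompleter<Void> root = new CountedCompleter<Void>() {
      @Override public void compute() {
        setPendingCount(out.length);        // one ticket per child task
        for (int i = 0; i < out.length; i++) {
          final int ii = i;
          new CountedCompleter<Void>(this) {
            @Override public void compute() {
              out[ii] = load.apply(ii);     // stand-in for DKV.get(k).get()
              tryComplete();                // counts the root down by one
            }
          }.fork();
        }
        tryComplete();                      // the root's own ticket
      }
    };
    ForkJoinPool.commonPool().invoke(root); // blocks, like fs.blockForPending()
  }

  public static void main(String[] args) {
    String[] vecs = new String[4];
    loadAll(vecs, i -> "vec-" + i);
    System.out.println(String.join(",", vecs)); // vec-0,vec-1,vec-2,vec-3
  }
}
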
Example #2
 /**
  * On-the-fly version for varimp. After a new tree is generated, its votes are collected on
  * shuffled OOB rows and variable importance is recomputed.
  *
  * <p>The <a
  * href="http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp">page</a> says:
  * <cite> "In every tree grown in the forest, put down the oob cases and count the number of votes
  * cast for the correct class. Now randomly permute the values of variable m in the oob cases and
  * put these cases down the tree. Subtract the number of votes for the correct class in the
  * variable-m-permuted oob data from the number of votes for the correct class in the untouched
  * oob data. The average of this number over all trees in the forest is the raw importance score
  * for variable m." </cite>
  */
 @Override
 protected VarImp doVarImpCalc(
     final DRFModel model, DTree[] ktrees, final int tid, final Frame fTrain, boolean scale) {
    // Check that the 'ktrees' trees have already been serialized into the model
   assert model.ntrees() - 1 == tid
       : "Cannot compute DRF varimp since 'ktrees' are not serialized in the model! tid=" + tid;
   assert _treeMeasuresOnOOB.npredictors() - 1 == tid
       : "Tree votes over OOB rows for this tree (var ktrees) were not found!";
   // Compute tree votes over shuffled data
    final CompressedTree[ /*nclass*/] theTree =
        model.ctree(tid); // get the last tree; FIXME: we should pass only keys
   final int nclasses = model.nclasses();
   Futures fs = new Futures();
   for (int var = 0; var < _ncols; var++) {
     final int variable = var;
     H2OCountedCompleter task4var =
         classification
             ? new H2OCountedCompleter() {
               @Override
               public void compute2() {
                  // Compute this tree's votes over all data for the given variable
                 TreeVotes cd =
                     TreeMeasuresCollector.collectVotes(
                         theTree, nclasses, fTrain, _ncols, sample_rate, variable);
                 assert cd.npredictors() == 1;
                 asVotes(_treeMeasuresOnSOOB[variable]).append(cd);
                 tryComplete();
               }
             }
             : /* regression */ new H2OCountedCompleter() {
               @Override
               public void compute2() {
                  // Compute this tree's SSE over all data for the given variable
                 TreeSSE cd =
                     TreeMeasuresCollector.collectSSE(
                         theTree, nclasses, fTrain, _ncols, sample_rate, variable);
                 assert cd.npredictors() == 1;
                 asSSE(_treeMeasuresOnSOOB[variable]).append(cd);
                 tryComplete();
               }
             };
     H2O.submitTask(task4var); // Fork computation
     fs.add(task4var);
   }
   fs.blockForPending(); // Wait for results
   // Compute varimp for individual features (_ncols)
   final float[] varimp = new float[_ncols]; // output variable importance
   final float[] varimpSD = new float[_ncols]; // output variable importance sd
   for (int var = 0; var < _ncols; var++) {
     double[ /*2*/] imp =
         classification
             ? asVotes(_treeMeasuresOnSOOB[var]).imp(asVotes(_treeMeasuresOnOOB))
             : asSSE(_treeMeasuresOnSOOB[var]).imp(asSSE(_treeMeasuresOnOOB));
     varimp[var] = (float) imp[0];
     varimpSD[var] = (float) imp[1];
   }
   return new VarImp.VarImpMDA(varimp, varimpSD, model.ntrees());
 }
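
The Breiman recipe quoted in the javadoc needs no H2O machinery to state. A standalone sketch (all names here are illustrative; each ToIntFunction stands in for "put these rows down one tree and count votes for the correct class"):

import java.util.Random;
import java.util.function.ToIntFunction;

public class PermutationImportance {
  // Raw importance of variable m: for each tree, count correct-class votes on the
  // untouched OOB rows, permute column m, count again, and average the vote loss.
  static double rawImportance(double[][] oob, int m,
                              ToIntFunction<double[][]>[] treeVotes, Random rng) {
    double sum = 0;
    for (ToIntFunction<double[][]> tree : treeVotes) {
      int before = tree.applyAsInt(oob);        // untouched OOB data
      double[][] permuted = deepCopy(oob);
      shuffleColumn(permuted, m, rng);          // break only variable m
      int after = tree.applyAsInt(permuted);    // variable-m-permuted OOB data
      sum += before - after;                    // votes lost to the permutation
    }
    return sum / treeVotes.length;              // average over all trees
  }

  private static double[][] deepCopy(double[][] a) {
    double[][] c = new double[a.length][];
    for (int i = 0; i < a.length; i++) c[i] = a[i].clone();
    return c;
  }

  private static void shuffleColumn(double[][] rows, int m, Random rng) {
    for (int i = rows.length - 1; i > 0; i--) {  // Fisher-Yates on column m only
      int j = rng.nextInt(i + 1);
      double tmp = rows[i][m]; rows[i][m] = rows[j][m]; rows[j][m] = tmp;
    }
  }
}
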
Example #3
  // --------------------------------------------------------------------------
  // Build an entire layer of all K trees
  protected DHistogram[][][] buildLayer(
      final Frame fr,
      final int nbins,
      int nbins_cats,
      final DTree ktrees[],
      final int leafs[],
      final DHistogram hcs[][][],
      boolean subset,
      boolean build_tree_one_node) {
    // Build K trees, one per class.

    // Build up the next-generation tree splits from the current histograms.
    // Nearly all leaves will split one more level.  This loop nest is
    //           O( #active_splits * #bins * #ncols )
    // but is NOT over all the data.
    ScoreBuildOneTree sb1ts[] = new ScoreBuildOneTree[_nclass];
    Vec vecs[] = fr.vecs();
    for (int k = 0; k < _nclass; k++) {
      final DTree tree = ktrees[k]; // Tree for class K
      if (tree == null) continue;
      // Build a frame with just a single tree's (tree & work & nid) columns, so the
      // nested MRTask ScoreBuildHistogram in ScoreBuildOneTree does not try
      // to close other trees' Vecs when run in parallel.
      Frame fr2 = new Frame(Arrays.copyOf(fr._names, _ncols + 1), Arrays.copyOf(vecs, _ncols + 1));
      fr2.add(fr._names[idx_tree(k)], vecs[idx_tree(k)]);
      fr2.add(fr._names[idx_work(k)], vecs[idx_work(k)]);
      fr2.add(fr._names[idx_nids(k)], vecs[idx_nids(k)]);
      if (idx_weight() >= 0) fr2.add(fr._names[idx_weight()], vecs[idx_weight()]);
      // Start building one of the K trees in parallel
      H2O.submitTask(
          sb1ts[k] =
              new ScoreBuildOneTree(
                  this,
                  k,
                  nbins,
                  nbins_cats,
                  tree,
                  leafs,
                  hcs,
                  fr2,
                  subset,
                  build_tree_one_node,
                  _improvPerVar,
                  _model._parms._distribution));
    }
    // Block for all K trees to complete.
    boolean did_split = false;
    for (int k = 0; k < _nclass; k++) {
      final DTree tree = ktrees[k]; // Tree for class K
      if (tree == null) continue;
      sb1ts[k].join();
      if (sb1ts[k]._did_split) did_split = true;
    }
    // The layer is done.
    return did_split ? hcs : null;
  }
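
Stripped of the tree logic, the concurrency shape of buildLayer is: fork one task per class, join each, and OR-reduce a per-task flag. A plain-JDK sketch of just that shape (the worker body is a hypothetical stand-in for ScoreBuildOneTree):

import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinTask;

public class ForkJoinReduce {
  public static void main(String[] args) {
    int nclass = 3;
    boolean[] didSplit = new boolean[nclass];
    ForkJoinTask<?>[] tasks = new ForkJoinTask<?>[nclass];
    for (int k = 0; k < nclass; k++) {
      final int kk = k;
      tasks[k] = ForkJoinTask.adapt(() -> { didSplit[kk] = (kk % 2 == 0); });
      ForkJoinPool.commonPool().submit(tasks[k]);  // like H2O.submitTask(sb1ts[k])
    }
    boolean anySplit = false;
    for (int k = 0; k < nclass; k++) {
      tasks[k].join();                             // like sb1ts[k].join()
      anySplit |= didSplit[k];
    }
    System.out.println("did_split = " + anySplit); // drives the hcs-vs-null return above
  }
}
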
Example #4
  @Override
  public void compute2() {
    // Lock all possible data
    dataset.read_lock(jobKey);
    // Create a template vector for each segment
    final Vec[][] templates = makeTemplates(dataset, ratios);
    final int nsplits = templates.length;
    assert nsplits == ratios.length + 1 : "Unexpected number of split templates!";
    // Launch a distributed F/J task for each split part
    final Vec[] datasetVecs = dataset.vecs();
    splits = new Frame[nsplits];
    for (int s = 0; s < nsplits; s++) {
      Frame split = new Frame(destKeys[s], dataset.names(), templates[s]);
      split.delete_and_lock(jobKey);
      splits[s] = split;
    }
    setPendingCount(1);
    H2O.submitTask(
        new H2OCountedCompleter(FrameSplitter.this) {
          @Override
          public void compute2() {
            setPendingCount(nsplits);
            for (int s = 0; s < nsplits; s++) {
              new FrameSplitTask(
                      new H2OCountedCompleter(this) { // Completer for this task
                        @Override
                        public void compute2() {}

                        @Override
                        public boolean onExceptionalCompletion(
                            Throwable ex, CountedCompleter caller) {
                          // Synchronized on FrameSplitter.this since this state
                          // can be touched by different workers.
                          synchronized (FrameSplitter.this) {
                            workersExceptions =
                                workersExceptions != null
                                    ? Arrays.copyOf(workersExceptions, workersExceptions.length + 1)
                                    : new Throwable[1];
                            workersExceptions[workersExceptions.length - 1] = ex;
                          }
                          tryComplete(); // we handled the exception, so perform normal completion
                          return false;
                        }
                      },
                      datasetVecs,
                      ratios,
                      s)
                  .asyncExec(splits[s]);
            }
            tryComplete(); // complete the computation of the nsplits tasks
          }
        });
    tryComplete(); // complete the computation of the forked tasks
  }
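
The onExceptionalCompletion override above hand-rolls "collect failures without aborting the siblings": record the Throwable under a lock, call tryComplete() so the split still counts down normally, and return false so the exception is not propagated. A minimal JDK-only sketch of the same idiom (a sketch, not the FrameSplitter API):

import java.util.List;
import java.util.concurrent.CountedCompleter;

public abstract class FailureCollectingCompleter extends CountedCompleter<Void> {
  private final List<Throwable> failures; // shared across workers; guarded by its own monitor

  protected FailureCollectingCompleter(CountedCompleter<?> parent, List<Throwable> failures) {
    super(parent);
    this.failures = failures;
  }

  @Override
  public boolean onExceptionalCompletion(Throwable ex, CountedCompleter<?> caller) {
    synchronized (failures) { // workers complete on different threads
      failures.add(ex);
    }
    tryComplete();            // the exception is handled: complete normally
    return false;             // do not relay the exception to the completer
  }
}
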
Example #5
  @Test
  public void testChunks() {
    Frame frame = parse_test_file("smalldata/covtype/covtype.20k.data");

    AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
    parms._train = frame._key;
    parms._radius_scale = 3.0;
    long start = System.currentTimeMillis();
    AggregatorModel agg = new Aggregator(parms).trainModel().get(); // observed: ~0.418 s
    System.out.println(
        "AggregatorModel finished in: "
            + (System.currentTimeMillis() - start) / 1000.
            + " seconds");
    agg.checkConsistency();
    Frame output = agg._output._output_frame.get();
    Log.info("Number of exemplars: " + agg._exemplars.length);
    //    Assert.assertTrue(agg._exemplars.length==1993);
    output.remove();
    agg.remove();

    for (int i : new int[] {1, 2, 5, 10, 50, 100}) {
      Key key = Key.make();
      RebalanceDataSet rb = new RebalanceDataSet(frame, key, i);
      H2O.submitTask(rb);
      rb.join();
      Frame rebalanced = DKV.get(key).get();

      parms = new AggregatorModel.AggregatorParameters();
      parms._train = frame._key;
      parms._radius_scale = 3.0;
      start = System.currentTimeMillis();
      AggregatorModel agg2 =
          new Aggregator(parms).trainModel().get(); // observed runtimes (s): 0.373 0.504 0.357 0.454 0.368 0.355
      System.out.println(
          "AggregatorModel finished in: "
              + (System.currentTimeMillis() - start) / 1000.
              + " seconds");
      agg2.checkConsistency();
      Log.info("Number of exemplars for " + i + " chunks: " + agg2._exemplars.length);
      rebalanced.delete();
      Assert.assertEquals(agg._exemplars.length, agg2._exemplars.length);
      output = agg2._output._output_frame.get();
      output.remove();
      agg2.remove();
    }
    frame.delete();
  }
Example #6
    @Override
    public Job fork() {
      DKV.put(destination_key, new GLMGrid(self(), _jobs));
      assert _maxParallelism >= 1;
      final H2OCountedCompleter fjt =
          new H2OCallback<ParallelGLMs>() {
            @Override
            public void callback(ParallelGLMs pgs) {
              remove();
            }
          };
      start(fjt);
      H2O.submitTask(new ParallelGLMs(this, _jobs, H2O.CLOUD.size(), fjt));
      return this;
    }
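
H2OCallback here is just a completer whose callback fires once the ParallelGLMs task tree finishes. The same shape in plain-JDK CountedCompleter terms (a sketch; the child task is a stand-in for ParallelGLMs):

import java.util.concurrent.CountedCompleter;
import java.util.concurrent.ForkJoinPool;

public class CallbackDemo {
  public static void main(String[] args) {
    CountedCompleter<Void> whenDone = new CountedCompleter<Void>() {
      @Override public void compute() { }    // never forked itself
      @Override public void onCompletion(CountedCompleter<?> caller) {
        System.out.println("grid finished"); // like callback(pgs) -> remove()
      }
    };
    // Pending count stays 0: the single child's tryComplete() propagates up,
    // runs onCompletion, and marks whenDone complete.
    CountedCompleter<Void> work = new CountedCompleter<Void>(whenDone) {
      @Override public void compute() {
        System.out.println("running grid...");
        tryComplete();
      }
    };
    ForkJoinPool.commonPool().submit(work); // like H2O.submitTask(new ParallelGLMs(...))
    whenDone.quietlyJoin();                 // block only so the demo can exit cleanly
  }
}
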
Example #7
    public static Job run(final Key dest, final KMeansModel model, final ValueArray ary) {
      final ChunkProgressJob job = new ChunkProgressJob(ary.chunks(), dest);
      new ValueArray(dest, 0).delete_and_lock(job.self());
      final H2OCountedCompleter fjtask =
          new H2OCountedCompleter() {
            @Override
            public void compute2() {
              KMeansApply kms = new KMeansApply();
              kms._job = job;
              kms._arykey = ary._key;
              kms._cols = model.columnMapping(ary.colNames());
              kms._clusters = model._clusters;
              kms._normalized = model._normalized;
              kms.invoke(ary._key);

              Column c = new Column();
              c._name = Constants.RESPONSE;
              c._size = ROW_SIZE;
              c._scale = 1;
              c._min = 0;
              c._max = model._clusters.length;
              c._mean = Double.NaN;
              c._sigma = Double.NaN;
              c._domain = null;
              c._n = ary.numRows();
              ValueArray res = new ValueArray(dest, ary.numRows(), c._size, new Column[] {c});
              res.unlock(job.self());
              job.remove();
              tryComplete();
            }

            @Override
            public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
              job.onException(ex);
              return super.onExceptionalCompletion(ex, caller);
            }
          };
      job.start(fjtask);
      H2O.submitTask(fjtask);
      return job;
    }
Example #8
 @Override
 protected Frame rebalance(final Frame original_fr, boolean local, final String name) {
   if (original_fr == null) return null;
   if (_parms._force_load_balance) {
     int original_chunks = original_fr.anyVec().nChunks();
     _job.update(0, "Load balancing " + name.substring(name.length() - 5) + " data...");
     int chunks = desiredChunks(original_fr, local);
     if (!_parms._reproducible) {
       if (original_chunks >= chunks) {
         if (!_parms._quiet_mode)
           Log.info(
               "Dataset already contains " + original_chunks + " chunks. No need to rebalance.");
         return original_fr;
       }
     } else { // reproducible, set chunks to 1
       assert chunks == 1;
       if (!_parms._quiet_mode)
         Log.warn("Reproducibility enforced - using only 1 thread - can be slow.");
       if (original_chunks == 1) return original_fr;
     }
     if (!_parms._quiet_mode)
       Log.info(
           "Rebalancing "
               + name.substring(name.length() - 5)
               + " dataset into "
               + chunks
               + " chunks.");
     Key newKey = Key.make(name + ".chks" + chunks);
     RebalanceDataSet rb = new RebalanceDataSet(original_fr, newKey, chunks);
     H2O.submitTask(rb).join();
     Frame rebalanced_fr = DKV.get(newKey).get();
     Scope.track(rebalanced_fr);
     return rebalanced_fr;
   }
   return original_fr;
 }
Example #9
 private void xvalidate(final GLMModel model, int lambdaIdx, final H2OCountedCompleter cmp) {
   final Key[] keys = new Key[n_folds];
   GLM2[] glms = new GLM2[n_folds];
   for (int i = 0; i < n_folds; ++i)
     glms[i] =
         new GLM2(
              this.description + " xval " + i,
             self(),
             keys[i] = Key.make(destination_key + "_" + _lambdaIdx + "_xval" + i),
             _dinfo.getFold(i, n_folds),
             _glm,
             new double[] {lambda[_lambdaIdx]},
             model.alpha,
             0,
             model.beta_eps,
             self(),
              model.norm_beta(lambdaIdx),
             higher_accuracy,
             prior,
             0);
   H2O.submitTask(
       new ParallelGLMs(
           GLM2.this,
           glms,
           H2O.CLOUD.size(),
           new H2OCallback(GLM2.this) {
             @Override
             public void callback(H2OCountedCompleter t) {
               GLMModel[] models = new GLMModel[keys.length];
               // we got the xval models, now compute their validations...
               for (int i = 0; i < models.length; ++i) models[i] = DKV.get(keys[i]).get();
               new GLMXValidationTask(model, _lambdaIdx, models, cmp)
                   .asyncExec(_dinfo._adaptedFrame);
             }
           }));
 }
Example #10
  @Override
  Val apply(Env env, Env.StackHelp stk, AST asts[]) {

    // Execute all args.  Find a canonical frame; all Frames must look like this one.
    // Each argument turns into either a Frame (whose rows are entirely
    // inlined) or a scalar (which is replicated across as a single row).
    Frame fr = null; // Canonical Frame; all frames have the same column count, types and names
    int nchks = 0; // Total chunks
    Val vals[] = new Val[asts.length]; // Computed AST results
    for (int i = 1; i < asts.length; i++) {
      vals[i] = stk.track(asts[i].exec(env));
      if (vals[i].isFrame()) {
        fr = vals[i].getFrame();
        nchks += fr.anyVec().nChunks(); // Total chunks
      } else nchks++; // One chunk per scalar
    }
    // No Frame, just a pile-o-scalars?
    Vec zz = null; // The zero-length vec for the zero-frame frame
    if (fr == null) { // Zero-length, 1-column, default name
      fr = new Frame(new String[] {Frame.defaultColName(0)}, new Vec[] {zz = Vec.makeZero(0)});
      if (asts.length == 1) return new ValFrame(fr);
    }

    // Verify all Frames have the same column count, names, and types.  Domains can vary, and will
    // be unioned.
    final Frame frs[] = new Frame[asts.length]; // Input frame
    final byte[] types = fr.types(); // Column types
    final int ncols = fr.numCols();
    final long[] espc = new long[nchks + 1]; // Compute a new layout!
    int coffset = 0;

    for (int i = 1; i < asts.length; i++) {
      Val val = vals[i]; // Save values computed for pass 2
      Frame fr0 =
          val.isFrame()
              ? val.getFrame()
              // Scalar: auto-expand into a 1-row frame
              : stk.track(new Frame(fr._names, Vec.makeCons(val.getNum(), 1L, fr.numCols())));

      // Check that all frames are compatible
      if (fr.numCols() != fr0.numCols())
        throw new IllegalArgumentException(
            "rbind frames must have all the same columns, found "
                + fr.numCols()
                + " and "
                + fr0.numCols()
                + " columns.");
      if (!Arrays.deepEquals(fr._names, fr0._names))
        throw new IllegalArgumentException(
            "rbind frames must have all the same column names, found "
                + Arrays.toString(fr._names)
                + " and "
                + Arrays.toString(fr0._names));
      if (!Arrays.equals(types, fr0.types()))
        throw new IllegalArgumentException(
            "rbind frames must have all the same column types, found "
                + Arrays.toString(types)
                + " and "
                + Arrays.toString(fr0.types()));

      frs[i] = fr0; // Save frame

      // Roll up the ESPC row counts
      long roffset = espc[coffset];
      long[] espc2 = fr0.anyVec().espc();
      for (int j = 1; j < espc2.length; j++) // Roll up the row counts
        espc[coffset + j] = roffset + espc2[j];
      coffset += espc2.length - 1; // Chunk offset
    }
    if (zz != null) zz.remove();

    // build up the new domains for each vec
    HashMap<String, Integer>[] dmap = new HashMap[types.length];
    String[][] domains = new String[types.length][];
    int[][][] cmaps = new int[types.length][][];
    for (int k = 0; k < types.length; ++k) {
      dmap[k] = new HashMap<>();
      int c = 0;
      byte t = types[k];
      if (t == Vec.T_CAT) {
        int[][] maps = new int[frs.length][];
        for (int i = 1; i < frs.length; i++) {
          maps[i] = new int[frs[i].vec(k).domain().length];
          for (int j = 0; j < maps[i].length; j++) {
            String s = frs[i].vec(k).domain()[j];
            if (!dmap[k].containsKey(s)) dmap[k].put(s, maps[i][j] = c++);
            else maps[i][j] = dmap[k].get(s);
          }
        }
        cmaps[k] = maps;
      } else {
        cmaps[k] = new int[frs.length][];
      }
      domains[k] = c == 0 ? null : new String[c];
      for (Map.Entry<String, Integer> e : dmap[k].entrySet()) domains[k][e.getValue()] = e.getKey();
    }

    // Now make Keys for the new Vecs
    Key<Vec>[] keys = fr.anyVec().group().addVecs(fr.numCols());
    Vec[] vecs = new Vec[fr.numCols()];
    int rowLayout = Vec.ESPC.rowLayout(keys[0], espc);
    for (int i = 0; i < vecs.length; i++)
      vecs[i] = new Vec(keys[i], rowLayout, domains[i], types[i]);

    // Do the row-binds column-by-column.
    // Switch to F/J thread for continuations
    ParallelRbinds t;
    H2O.submitTask(t = new ParallelRbinds(frs, espc, vecs, cmaps)).join();
    return new ValFrame(new Frame(fr.names(), t._vecs));
  }
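
The ESPC rollup in the middle loop concatenates per-frame chunk layouts: each espc is a cumulative row count per chunk starting at 0, so rbinding a frame with espc [0,3,5] onto one with espc [0,2] yields [0,3,5,7]. A standalone sketch of just that bookkeeping (no H2O types):

import java.util.Arrays;

public class EspcConcat {
  // Concatenate per-frame ESPC arrays into one layout, exactly as the
  // roffset/coffset rollup above does.
  static long[] concat(long[][] espcs) {
    int nchks = 0;
    for (long[] e : espcs) nchks += e.length - 1;       // chunks per frame
    long[] out = new long[nchks + 1];
    int coffset = 0;
    for (long[] e : espcs) {
      long roffset = out[coffset];                      // rows seen so far
      for (int j = 1; j < e.length; j++) out[coffset + j] = roffset + e[j];
      coffset += e.length - 1;
    }
    return out;
  }

  public static void main(String[] args) {
    long[][] espcs = { {0, 3, 5}, {0, 2} };             // a 5-row and a 2-row frame
    System.out.println(Arrays.toString(concat(espcs))); // [0, 3, 5, 7]
  }
}
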