Ejemplo n.º 1
  /**
   * Returns the Vec array for this object's keys, loading it on first use.
   *
   * <p>If {@code _vecs} is already set it is returned as-is. Otherwise a new array sized to
   * {@code _keys.length} is created, one task per key is built to fill its slot, and the array is
   * stored into {@code _vecs} and returned.
   *
   * @return the cached or freshly built Vec array
   */
  public final Vec[] vecs() {
    if (_vecs != null) return _vecs;
    // Load all Vec headers; load them all in parallel by spawning F/J tasks.
    final Vec[] vecs = new Vec[_keys.length];
    Futures fs = new Futures();
    for (int i = 0; i < _keys.length; i++) {
      // Final copies so the anonymous task below can capture the slot index and the key.
      final int ii = i;
      final Key k = _keys[i];
      H2OCountedCompleter t =
          new H2OCountedCompleter() {
            // We need higher priority here as there is a danger of deadlock in
            // case of many calls from MRTask2 at once (e.g. frame with many
            // vectors invokes rollup tasks for all vectors in parallel).  Should
            // probably be done in CPS style in the future
            public byte priority() {
              return H2O.MIN_HI_PRIORITY;

            public void compute2() {
              // Fetch the value stored under k and place it in this task's own slot.
              vecs[ii] = DKV.get(k).get();
    // NOTE(review): nothing visible in this excerpt submits `t` or waits on `fs` before the
    // array is published below - confirm against the full source.
    return _vecs = vecs;
Ejemplo n.º 2
 /**
  * Clean-up code which is executed after each {@link Job#exec()} call in any case
  * (normal/exceptional).
  *
  * <p>Passes {@code _gVecTrash} and, when it is non-empty, {@code _lVecTrash} to
  * {@code cleanupTrash}, sharing one {@link Futures} between both calls.
  */
 protected void cleanup() {
   // Clean-up global list of temporary vectors
   Futures fs = new Futures();
   cleanupTrash(_gVecTrash, fs);
   // The second trash list is only handed over when it actually holds something.
   if (!_lVecTrash.isEmpty()) cleanupTrash(_lVecTrash, fs);
   // NOTE(review): no wait on `fs` is visible in this excerpt - confirm the full method blocks
   // on it before returning.
Ejemplo n.º 3
  /**
   * Handles a remove-all request: logs the start, walks the jobs returned by {@code Job.jobs()},
   * runs a bulk key-removal {@code MRTask}, logs completion and returns the schema it was given.
   *
   * @param version not read in the visible code
   * @param u request/response schema; returned unchanged in the visible code
   * @return {@code u}
   */
  @SuppressWarnings("unused") // called through reflection by RequestServer
  public RemoveAllV3 remove(int version, RemoveAllV3 u) {
    Log.info("Removing all objects");
    Futures fs = new Futures();
    for (Job j : Job.jobs()) {
    // NOTE(review): the body of the job loop above is not visible in this excerpt.
    // Bulk brainless key removal.  Completely wipes all Keys without regard.
    new MRTask() {
      // The removal task reports H2O.GUI_PRIORITY as its priority.
      public byte priority() {
        return H2O.GUI_PRIORITY;

      // NOTE(review): the body of setupLocal() and the call that launches this task are not
      // visible in this excerpt.
      public void setupLocal() {
    Log.info("Finished removing objects");
    return u;
Ejemplo n.º 4
  * On-the-fly version for varimp. After generating a new tree, its tree votes are collected on
  * shuffled OOB rows and variable importance is recomputed.
  * <p>The <a
  * href="http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp">page</a> says:
  * <cite> "In every tree grown in the forest, put down the oob cases and count the number of votes
  * cast for the correct class. Now randomly permute the values of variable m in the oob cases and
  * put these cases down the tree. Subtract the number of votes for the correct class in the
  * variable-m-permuted oob data from the number of votes for the correct class in the untouched
  * oob data. The average of this number over all trees in the forest is the raw importance score
  * for variable m." </cite>
 // Computes variable importance after tree `tid` has been added to `model`.
 //   model  - model whose tree `tid` is used; must satisfy model.ntrees() - 1 == tid
 //   ktrees - only mentioned in the assertion messages in the visible code
 //   tid    - index of the tree to evaluate (asserted to be the last one)
 //   fTrain - frame handed to the per-variable tasks
 //   scale  - not read in the visible code
 // Returns a VarImp.VarImpMDA built from the per-variable importance, its sd and model.ntrees().
 protected VarImp doVarImpCalc(
     final DRFModel model, DTree[] ktrees, final int tid, final Frame fTrain, boolean scale) {
   // Check if we have already serialized 'ktrees'-trees in the model
   assert model.ntrees() - 1 == tid
       : "Cannot compute DRF varimp since 'ktrees' are not serialized in the model! tid=" + tid;
   assert _treeMeasuresOnOOB.npredictors() - 1 == tid
       : "Tree votes over OOB rows for this tree (var ktrees) were not found!";
   // Compute tree votes over shuffled data
   final CompressedTree[ /*nclass*/] theTree =
       model.ctree(tid); // get the last tree FIXME we should pass only keys
   final int nclasses = model.nclasses();
   Futures fs = new Futures();
   // One task per column; each task is submitted inside the loop and awaited after it.
   for (int var = 0; var < _ncols; var++) {
     // Final copy so the anonymous tasks below can capture the column index.
     final int variable = var;
     // NOTE(review): the condition of this ternary and the expressions that build `cd` are not
     // visible in this excerpt; the second branch is labelled "regression" below.
     H2OCountedCompleter task4var =
             ? new H2OCountedCompleter() {
               public void compute2() {
                 // Compute this tree votes over all data over given variable
                 TreeVotes cd =
                         theTree, nclasses, fTrain, _ncols, sample_rate, variable);
                 assert cd.npredictors() == 1;
             : /* regression */ new H2OCountedCompleter() {
               public void compute2() {
                 // Compute this tree votes over all data over given variable
                 TreeSSE cd =
                         theTree, nclasses, fTrain, _ncols, sample_rate, variable);
                 assert cd.npredictors() == 1;
     H2O.submitTask(task4var); // Fork computation
   fs.blockForPending(); // Wait for results
   // Compute varimp for individual features (_ncols)
   final float[] varimp = new float[_ncols]; // output variable importance
   final float[] varimpSD = new float[_ncols]; // output variable importance sd
   for (int var = 0; var < _ncols; var++) {
     // imp[0] is stored as the importance and imp[1] as its sd; both are narrowed to float.
     // NOTE(review): the condition choosing between the votes and SSE variants is not visible.
     double[ /*2*/] imp =
             ? asVotes(_treeMeasuresOnSOOB[var]).imp(asVotes(_treeMeasuresOnOOB))
             : asSSE(_treeMeasuresOnSOOB[var]).imp(asSSE(_treeMeasuresOnOOB));
     varimp[var] = (float) imp[0];
     varimpSD[var] = (float) imp[1];
   return new VarImp.VarImpMDA(varimp, varimpSD, model.ntrees());
Ejemplo n.º 5
 // Lock-then-delete: when _key is non-null, logs the attempt, invokes a PriorWriteLock built from
 // 'job_key' on _key, and then removes _key from the DKV.
 // Will fail if locked by anybody other than 'job_key'
 //   job_key - key of the job on whose behalf the lock is taken
 //   dummy   - not read in the visible code; presumably only distinguishes this overload (confirm)
 public void delete(Key job_key, float dummy) {
   if (_key != null) {
     Log.debug(Log.Tag.Sys.LOCKS, "lock-then-delete " + _key + " by job " + job_key);
     new PriorWriteLock(job_key).invoke(_key);
   Futures fs = new Futures();
   if (_key != null) DKV.remove(_key, fs); // Delete self also
   // NOTE(review): no wait on `fs` is visible in this excerpt - confirm against the full source.
Ejemplo n.º 6
  * Global redistribution of a Frame (balancing of chunks), done by the calling process
  * (all-to-one + one-to-all)
  * @param fr Input frame
  * @param seed RNG seed
  * @param shuffle whether to shuffle the data globally
  * @return Shuffled frame
 // Rebuilds every column of `fr` into `splits` new chunks when the first Vec has fewer than
 // `splits` chunks or `shuffle` is set, provided the frame has more than `splits` rows; otherwise
 // returns `fr` itself.
 //   splits  - number of output chunks created per column
 //   seed    - passed to Utils.shuffleArray when `shuffle` is set
 //   local   - not read in the visible code
 //   shuffle - when set, output chunk indices are remapped through a shuffled index array
 public static Frame shuffleAndBalance(
     final Frame fr, int splits, long seed, final boolean local, final boolean shuffle) {
   if ((fr.vecs()[0].nChunks() < splits || shuffle) && fr.numRows() > splits) {
     // Work on a copy of the Vec array so entries can be replaced column by column below.
     Vec[] vecs = fr.vecs().clone();
     Log.info("Load balancing dataset, splitting it into up to " + splits + " chunks.");
     long[] idx = null;
     if (shuffle) {
       // idx starts as 0..splits-1 and is then shuffled with the given seed.
       idx = new long[splits];
       for (int r = 0; r < idx.length; ++r) idx[r] = r;
       Utils.shuffleArray(idx, seed);
     Key keys[] = new Vec.VectorGroup().addVecs(vecs.length);
     // Rows per output chunk, rounded up so that `splits` chunks cover all rows.
     final long rows_per_new_chunk = (long) (Math.ceil((double) fr.numRows() / splits));
     // loop over cols (same indexing for each column)
     Futures fs = new Futures();
     for (int col = 0; col < vecs.length; col++) {
       AppendableVec vec = new AppendableVec(keys[col]);
       // create outgoing chunks for this col
       NewChunk[] outCkg = new NewChunk[splits];
       for (int i = 0; i < splits; ++i) outCkg[i] = new NewChunk(vec, i);
       // loop over all incoming chunks
       for (int ckg = 0; ckg < vecs[col].nChunks(); ckg++) {
         final Chunk inCkg = vecs[col].chunkForChunkIdx(ckg);
         // loop over local rows of incoming chunks (fast path)
         for (int row = 0; row < inCkg._len; ++row) {
           // Global row number divided by the rows-per-chunk quota picks the destination.
           int outCkgIdx =
               (int) ((inCkg._start + row) / rows_per_new_chunk); // destination chunk idx
           if (shuffle)
             outCkgIdx = (int) (idx[outCkgIdx]); // shuffle: choose a different output chunk
           assert (outCkgIdx >= 0 && outCkgIdx < splits);
           // NOTE(review): the statement that appends the row to outCkg[outCkgIdx] is not
           // visible in this excerpt.
       // Close every output chunk and the appendable Vec, collecting pending work in fs.
       for (int i = 0; i < outCkg.length; ++i) outCkg[i].close(i, fs);
       Vec t = vec.close(fs);
       // Carry the source column's domain over to the rebuilt Vec.
       t._domain = vecs[col]._domain;
       vecs[col] = t;
     Log.info("Load balancing done.");
     return new Frame(fr.names(), vecs);
   return fr;
Ejemplo n.º 7
 // Convert a chunk# into a chunk - does lazy-chunk creation. As chunks are
 // asked-for the first time, we make the Key and an empty backing DVec.
 // Touching the DVec will force the file load.
 //   cidx - chunk index; asserted to lie in [0, nChunks())
 // Returns the Value already stored under the chunk key if there is one, otherwise the newly
 // created Value, or whatever DputIfMatch hands back when it returns non-null.
 public Value chunkIdx(int cidx) {
   final long nchk = nChunks();
   assert 0 <= cidx && cidx < nchk;
   Key dkey = chunkKey(cidx);
   Value val1 = DKV.get(dkey); // Check for an existing one... will fetch data as needed
   if (val1 != null) return val1; // Found an existing one?
   // Lazily create a DVec for this chunk
   // Every chunk but the last is CHUNK_SZ long; the last one takes the remainder of _len.
   int len = (int) (cidx < nchk - 1 ? CHUNK_SZ : (_len - chunk2StartElem(cidx)));
   // DVec is just the raw file data with a null-compression scheme
   Value val2 = new Value(dkey, len, null, TypeMap.C1NCHUNK, _be);
   val2.setdsk(); // It is already on disk.
   // If not-home, then block till the Key is everywhere.  Most calls here are
   // from the parser loading a text file, and the parser splits the work such
   // that most puts here are on home - so this is a simple speed optimization:
   // do not make a Futures nor block on it on home.
   Futures fs = dkey.home() ? null : new Futures();
   // Atomically insert: fails on a race, but then return the old version
   Value val3 = DKV.DputIfMatch(dkey, val2, null, fs);
   // NOTE(review): fs is non-null exactly when dkey.home() was false above, so the two tests
   // below look redundant provided dkey.home() returns the same answer on both calls.
   if (!dkey.home() && fs != null) fs.blockForPending();
   return val3 == null ? val2 : val3;
Ejemplo n.º 8
 // Builds a ByteVec with one chunk per input string, wraps it in a single-column Frame named
 // "makeByteVec", stores both in the DKV and returns the frame key.
 //   k    - key under which the Frame is stored; also the return value
 //   data - one string per chunk
 public static Key makeByteVec(Key k, String... data) {
   byte[][] chunks = new byte[data.length][];
   // espc[i] is the running start offset of chunk i; espc[data.length] is the total.
   long[] espc = new long[data.length + 1];
   for (int i = 0; i < chunks.length; ++i) {
     // NOTE(review): getBytes() with no argument uses the platform default charset.
     chunks[i] = data[i].getBytes();
     // NOTE(review): the offset advances by data[i].length() (a char count) while the chunk
     // stores chunks[i].length bytes; the two differ for non-ASCII text - confirm intended.
     espc[i + 1] = espc[i] + data[i].length();
   Futures fs = new Futures();
   Key key = Vec.newKey();
   ByteVec bv = new ByteVec(key, Vec.ESPC.rowLayout(key, espc));
   for (int i = 0; i < chunks.length; ++i) {
     Key chunkKey = bv.chunkKey(i);
     // NOTE(review): the call that receives this Value is not visible in this excerpt.
         new Value(chunkKey, chunks[i].length, chunks[i], TypeMap.C1NCHUNK, Value.ICE),
   DKV.put(bv._key, bv, fs);
   Frame fr = new Frame(k, new String[] {"makeByteVec"}, new Vec[] {bv});
   DKV.put(k, fr, fs);
   // NOTE(review): no wait on `fs` is visible in this excerpt before the key is returned.
   return k;
Ejemplo n.º 9
  /**
   * Splits {@code fr} row-by-row into {@code ratios.length} frames, choosing the destination of
   * each row with a per-chunk random number generator.
   *
   * <p>Note that {@code ratios} is modified in place: entries from index 1 on are overwritten
   * with the running sum, and the final sum is asserted to equal 1.0.
   *
   * @param fr frame to split
   * @param keys keys for the output frames; asserted to be as many as {@code ratios}
   * @param ratios split fractions; converted in place to cumulative thresholds
   * @param seed multiplied by the chunk index to seed each chunk's generator
   * @return one frame per ratio, each stored in the DKV under its key
   */
  public static Frame[] shuffleSplitFrame(
      Frame fr, Key[] keys, final double ratios[], final long seed) {
    // Sanity check the ratios
    assert keys.length == ratios.length;
    double sum = ratios[0];
    for (int i = 1; i < ratios.length; i++) {
      sum += ratios[i];
      ratios[i] = sum;
    assert water.util.MathUtils.equalsWithinOneSmallUlp(sum, 1.0);

    // Do the split, into ratios.length groupings of NewChunks
    final int ncols = fr.numCols();
    MRTask mr =
        new MRTask() {
          public void map(Chunk cs[], NewChunk ncs[]) {
            // NOTE(review): if cidx() is 0 for the first chunk, seed * 0 is 0 for every seed,
            // so that chunk's sequence would not depend on `seed` - confirm.
            Random rng = new Random(seed * cs[0].cidx());
            int nrows = cs[0]._len;
            for (int i = 0; i < nrows; i++) {
              double r = rng.nextDouble();
              int x = 0; // Pick the NewChunk split
              // First split whose cumulative threshold exceeds r; the last split is the default.
              for (; x < ratios.length - 1; x++) if (r < ratios[x]) break;
              // Turn the split number into the offset of that split's first output column.
              x *= ncols;
              // Helper string holder
              // NOTE(review): vstr is not referenced in the visible code.
              ValueString vstr = new ValueString();
              // Copy row to correct set of NewChunks
              for (int j = 0; j < ncols; j++) {
                byte colType = cs[j].vec().get_type();
                // NOTE(review): as shown, the T_STR and T_UUID cases have no break and no
                // default label precedes the range check, so control would fall through; this
                // excerpt drops lines elsewhere, so confirm against the full source.
                switch (colType) {
                  case Vec.T_BAD:
                    break; /* NOP */
                  case Vec.T_STR:
                    ncs[x + j].addStr(cs[j], i);
                  case Vec.T_UUID:
                    ncs[x + j].addUUID(cs[j], i);
                  case Vec.T_NUM: /* fallthrough */
                  case Vec.T_ENUM:
                  case Vec.T_TIME:
                    ncs[x + j].addNum(cs[j].atd(i));
                    // Types above T_TIME up to T_TIMELAST are also copied as numbers.
                    if (colType > Vec.T_TIME && colType <= Vec.T_TIMELAST)
                      ncs[x + j].addNum(cs[j].atd(i));
                    else throw new IllegalArgumentException("Unsupported vector type: " + colType);
        }.doAll(ncols * ratios.length, fr);

    // Build output frames
    Frame frames[] = new Frame[ratios.length];
    Vec[] vecs = fr.vecs();
    // NOTE(review): `names` is assigned here but the visible code calls fr.names() again below.
    String[] names = fr.names();
    Futures fs = new Futures();
    for (int i = 0; i < ratios.length; i++) {
      Vec[] nvecs = new Vec[ncols];
      for (int c = 0; c < ncols; c++) {
        // Output column c of split i lives at index i * ncols + c; it inherits the source domain.
        mr.appendables()[i * ncols + c].setDomain(vecs[c].domain());
        nvecs[c] = mr.appendables()[i * ncols + c].close(fs);
      frames[i] = new Frame(keys[i], fr.names(), nvecs);
      DKV.put(frames[i], fs);
    // NOTE(review): no wait on `fs` is visible in this excerpt before the frames are returned.
    return frames;
Ejemplo n.º 10
 /**
  * User call which empties the local trash of vectors.
  *
  * <p>Returns immediately when {@code _lVecTrash} is empty; otherwise hands it to
  * {@code cleanupTrash} together with a fresh {@link Futures}.
  */
 protected final void emptyLTrash() {
   if (_lVecTrash.isEmpty()) return;
   Futures fs = new Futures();
   cleanupTrash(_lVecTrash, fs);
   // NOTE(review): no wait on `fs` is visible in this excerpt - confirm against the full source.
Ejemplo n.º 11
 /**
  * Blocking removal: delegates to {@code remove(key, fs)} with a fresh {@link Futures} and then
  * waits for everything gathered in it.
  *
  * @param key key to remove
  */
 public static void remove(Key key) {
   Futures fs = new Futures();
   remove(key, fs); // Recursively delete, gather pending deletes
   fs.blockForPending(); // Block until all is deleted
Ejemplo n.º 12
 // This put is a top-level user-update, and not a reflected or retried
 // update.  i.e., The User has initiated a change against the K/V store.
 // This is a WEAK update: it is only strongly ordered with other updates to
 // the SAME key on the SAME node.
 // Blocking form: delegates to put(key, val, fs) with a fresh Futures and waits on it.
 //   key - key to store under
 //   val - value to store
 public static void put(Key key, Value val) {
   Futures fs = new Futures();
   put(key, val, fs);
   fs.blockForPending(); // Block for remote-put to complete