Example #1
  private static void addFolder2(
      FileSystem fs, Path p, ArrayList<String> keys, ArrayList<String> failed) {
    try {
      if (fs == null) return;

      Futures futures = new Futures();
      for (FileStatus file : fs.listStatus(p)) {
        Path pfs = file.getPath();
        if (file.isDir()) {
          addFolder2(fs, pfs, keys, failed);
        } else {
          if (pfs.getName().endsWith(Extensions.JSON)) {
            throw H2O.unimpl();
          } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
            throw H2O.unimpl();
          } else {
            Key k = HdfsFileVec.make(file, futures);
            keys.add(k.toString());
            Log.info("PersistHdfs: DKV.put(" + k + ")");
          }
        }
      }
    } catch (Exception e) {
      Log.err(e);
      failed.add(p.toString());
    }
  }
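A minimal sketch of a call site, assuming a hypothetical HDFS URI and a default configuration; successfully imported keys and unreadable paths come back through the two accumulator lists:

  FileSystem fs = FileSystem.get(new Configuration());
  ArrayList<String> keys = new ArrayList<>();
  ArrayList<String> failed = new ArrayList<>();
  addFolder2(fs, new Path("hdfs://namenode:8020/data"), keys, failed);
  for (String f : failed) Log.warn("Could not import: " + f);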
Example #2
 @Override
 public void map(Chunk chks[], NewChunk nchks[]) {
   long rstart = chks[0]._start;
   int rlen = chks[0]._len; // Total row count
   int rx = 0; // Which row to in/ex-clude
   int rlo = 0; // Lo/Hi for this block of rows
   int rhi = rlen;
   while (true) { // Still got rows to include?
     if (_rows != null) { // Got a row selector?
       if (rx >= _rows.length) break; // All done with row selections
       long r = _rows[rx++] - 1; // Next row selector
       if (r < 0) { // Row exclusion
         if (rx > 0 && _rows[rx - 1] < _rows[rx]) throw H2O.unimpl();
         long er = Math.abs(r) - 2;
         if (er < rstart) continue;
         // scoop up all of the rows before the first exclusion
         if (rx == 1 && ((int) (er + 1 - rstart)) > 0 && _ex) {
           rlo = (int) rstart;
           rhi = (int) (er - rstart);
           _ex = false;
           rx--;
         } else {
           rlo = (int) (er + 1 - rstart);
           // TODO: handle jumbled row indices ( e.g. -c(1,5,3) )
           while (rx < _rows.length && (_rows[rx] + 1 == _rows[rx - 1] && rlo < rlen)) {
             if (rx < _rows.length - 1 && _rows[rx] < _rows[rx + 1]) throw H2O.unimpl();
             rx++;
             rlo++; // Exclude consecutive rows
           }
           rhi = rx >= _rows.length ? rlen : (int) Math.abs(_rows[rx] - 1) - 2;
           if (rx < _rows.length - 1 && _rows[rx] < _rows[rx + 1]) throw H2O.unimpl();
         }
       } else { // Positive row list?
         if (r < rstart) continue;
         rlo = (int) (r - rstart);
         rhi = rlo + 1; // Stop at the next row
         while (rx < _rows.length && (_rows[rx] - 1 - rstart) == rhi && rhi < rlen) {
           rx++;
           rhi++; // Grab sequential rows
         }
       }
     }
     // Process this next set of rows
     // For all cols in the new set
     for (int i = 0; i < _cols.length; i++) {
       Chunk oc = chks[_cols[i]];
       NewChunk nc = nchks[i];
       if (oc._vec.isInt()) { // Slice on integer columns
         for (int j = rlo; j < rhi; j++)
           if (oc.isNA0(j)) nc.addNA();
           else nc.addNum(oc.at80(j), 0);
       } else { // Slice on double columns
         for (int j = rlo; j < rhi; j++) nc.addNum(oc.at0(j));
       }
     }
     rlo = rhi;
     if (_rows == null) break;
   }
 }
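The selector array uses R-style, 1-based, sign-encoded indices, which is why the loop subtracts 1 from every entry and a further 2 when decoding an exclusion. A small worked decoding under that assumption:

  long[] rows = {2, 3, 4};        // selects rows 2..4 (1-based)
  long r = rows[0] - 1;           // == 1, the 0-based index of row 2
  long ex = -3;                   // excludes row 3
  long er = Math.abs(ex - 1) - 2; // == 2, the 0-based index of row 3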
Example #3
 // Set & At on NewChunks are weird: only used after inflating some other
 // chunk.  At this point the NewChunk is full size, no more appends allowed,
 // and the xs exponent array should be only full of zeros.  Accesses must be
 // in-range and refer to the inflated values of the original Chunk.
 @Override
 boolean set_impl(int i, long l) {
   if (_ds != null) throw H2O.unimpl();
   if (_len2 != _len) throw H2O.unimpl();
   _ls[i] = l;
   _xs[i] = 0;
   return true;
 }
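The read side honors the same contract. A minimal sketch of a matching accessor, assuming the same fields and the all-zero exponent invariant stated in the comment above (the real NewChunk accessor also handles nonzero exponents):

  long at8_read(int i) { // hypothetical name; mirrors set_impl above
    if (_len2 != _len) throw H2O.unimpl(); // must be fully inflated
    if (_ds != null) return (long) _ds[i]; // double-backed chunk
    return _ls[i];                         // zero exponent: the long is the value
  }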
Example #4
  public static void stall_till_cloudsize(String[] args, int x) {
   if (!_stall_called_before) {
     H2O.main(args);
     H2O.registerRestApis(System.getProperty("user.dir"));
     _stall_called_before = true;
   }
   H2O.waitForCloudSize(x, 30000);
   _initial_keycnt = H2O.store_size();
 }
Example #5
  @Override
  Val apply(Env env, Env.StackHelp stk, AST asts[]) {

    // Compute the variable args.  Find the common row count
    Val vals[] = new Val[asts.length];
    Vec vec = null;
    for (int i = 1; i < asts.length; i++) {
      vals[i] = stk.track(asts[i].exec(env));
      if (vals[i].isFrame()) {
        Vec anyvec = vals[i].getFrame().anyVec();
        if (anyvec == null) continue; // Ignore the empty frame
        if (vec == null) vec = anyvec;
        else if (vec.length() != anyvec.length())
          throw new IllegalArgumentException(
              "cbind frames must all have the same number of rows; found "
                  + vec.length()
                  + " and "
                  + anyvec.length()
                  + " rows.");
      }
    }
    boolean clean = false;
    if (vec == null) { // No frame arguments: default to a length-1 vec
      vec = Vec.makeZero(1);
      clean = true;
    }

    // Populate the new Frame
    Frame fr = new Frame();
    for (int i = 1; i < asts.length; i++) {
      switch (vals[i].type()) {
        case Val.FRM:
          fr.add(fr.makeCompatible(vals[i].getFrame()));
          break;
        case Val.FUN:
          throw H2O.unimpl();
        case Val.STR:
          throw H2O.unimpl();
        case Val.NUM:
          // Auto-expand scalars to fill every row
          double d = vals[i].getNum();
          fr.add(Double.toString(d), vec.makeCon(d));
          break;
        default:
          throw H2O.unimpl();
      }
    }
    if (clean) vec.remove();

    return new ValFrame(fr);
  }
Example #6
 private void setTransform(
     TransformType t, double[] normMul, double[] normSub, int vecStart, int n) {
   for (int i = 0; i < n; ++i) {
     Vec v = _adaptedFrame.vec(vecStart + i);
     switch (t) {
       case STANDARDIZE:
         normMul[i] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
         normSub[i] = v.mean();
         break;
       case NORMALIZE:
         normMul[i] = (v.max() - v.min() > 0) ? 1.0 / (v.max() - v.min()) : 1.0;
         normSub[i] = v.mean();
         break;
       case DEMEAN:
         normMul[i] = 1;
         normSub[i] = v.mean();
         break;
       case DESCALE:
         normMul[i] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
         normSub[i] = 0;
         break;
       default:
         throw H2O.unimpl();
     }
     assert !Double.isNaN(normMul[i]);
     assert !Double.isNaN(normSub[i]);
   }
 }
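All four cases produce a (normSub, normMul) pair that downstream code applies in one uniform step; a minimal sketch of that application as a hypothetical helper, assuming the usual shift-then-scale convention: a raw value x becomes (x - normSub) * normMul, so DEMEAN only shifts, STANDARDIZE shifts then divides by sigma, NORMALIZE divides by the range, and DESCALE only divides by sigma.

  static double applyTransform(double x, double normSub, double normMul) {
    return (x - normSub) * normMul; // shift, then scale
  }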
Example #7
 private void setTransform(
     TransformType t, double[] normMul, double[] normSub, int vecStart, int n) {
    int idx = 0; // idx != i when interaction vectors are in play; otherwise idx == i
   for (int i = 0; i < n; ++i) {
     Vec v = _adaptedFrame.vec(vecStart + i);
     boolean isIWV = isInteractionVec(vecStart + i);
     switch (t) {
       case STANDARDIZE:
         normMul[idx] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
         if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
         normSub[idx] = v.mean();
         break;
       case NORMALIZE:
         normMul[idx] = (v.max() - v.min() > 0) ? 1.0 / (v.max() - v.min()) : 1.0;
         if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
         normSub[idx] = v.mean();
         break;
       case DEMEAN:
         normMul[idx] = 1;
         if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
         normSub[idx] = v.mean();
         break;
       case DESCALE:
         normMul[idx] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
         if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
         normSub[idx] = 0;
         break;
       default:
         throw H2O.unimpl();
     }
     assert !Double.isNaN(normMul[idx]);
     assert !Double.isNaN(normSub[idx]);
     idx = isIWV ? (idx + nextNumericIdx(i)) : (idx + 1);
   }
 }
Example #8
 // ------------------------------------------------------------------------
 // Zipped file; no parallel decompression; decompress into local chunks,
 // parse local chunks; distribute chunks later.
 ParseWriter streamParseZip(final InputStream is, final StreamParseWriter dout, InputStream bvs)
     throws IOException {
   // All output into a fresh pile of NewChunks, one per column
   if (!_setup._parse_type._parallelParseSupported) throw H2O.unimpl();
   StreamData din = new StreamData(is);
   int cidx = 0;
   StreamParseWriter nextChunk = dout;
   int zidx = bvs.read(null, 0, 0); // Back-channel read of chunk index
   assert zidx == 1;
   while (is.available() > 0) {
     int xidx = bvs.read(null, 0, 0); // Back-channel read of chunk index
     if (xidx > zidx) { // Advanced chunk index of underlying ByteVec stream?
       zidx = xidx; // Record advancing of chunk
       nextChunk.close(); // Match output chunks to input zipfile chunks
       if (dout != nextChunk) {
         dout.reduce(nextChunk);
         if (_jobKey != null && ((Job) DKV.getGet(_jobKey)).isCancelledOrCrashed()) break;
       }
       nextChunk = nextChunk.nextChunk();
     }
     parseChunk(cidx++, din, nextChunk);
   }
   parseChunk(cidx, din, nextChunk); // Parse the remaining partial 32K buffer
   nextChunk.close();
   if (dout != nextChunk) dout.reduce(nextChunk);
   return dout;
 }
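The bvs.read(null, 0, 0) calls above use the stream as a back channel: a (null, 0, 0) read returns the index of the ByteVec chunk currently feeding the stream rather than reading bytes. A toy sketch of that idiom (hypothetical class, not the real ByteVec stream):

  static class ChunkIndexedStream extends java.io.InputStream {
    private int _cidx = 1;        // index of the chunk currently being served
    void nextChunk() { _cidx++; } // producer advanced to the next backing chunk
    @Override public int read() { return -1; } // real data path elided in this sketch
    @Override public int read(byte[] b, int off, int len) {
      if (b == null) return _cidx; // back-channel query: report the chunk index
      return -1;
    }
  }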
Example #9
    @Override
    protected void doGet(HttpServletRequest request, HttpServletResponse response)
        throws IOException, ServletException {
      String uri = getDecodedUri(request);
      try {
        Pattern p = Pattern.compile(".*/NodePersistentStorage.bin/([^/]+)/([^/]+)");
        Matcher m = p.matcher(uri);
        boolean b = m.matches();
        if (!b) {
          setResponseStatus(response, HttpServletResponse.SC_BAD_REQUEST);
          response.getWriter().write("Improperly formatted URI");
          return;
        }

        String categoryName = m.group(1);
        String keyName = m.group(2);
        NodePersistentStorage nps = H2O.getNPS();
        AtomicLong length = new AtomicLong();
        InputStream is = nps.get(categoryName, keyName, length);
        if (length.get() > (long) Integer.MAX_VALUE) {
          throw new Exception("NPS value size exceeds Integer.MAX_VALUE");
        }
        response.setContentType("application/octet-stream");
        response.setContentLength((int) length.get());
        response.addHeader("Content-Disposition", "attachment; filename=" + keyName + ".flow");
        setResponseStatus(response, HttpServletResponse.SC_OK);
        OutputStream os = response.getOutputStream();
        water.util.FileUtils.copyStream(is, os, 2048);
      } catch (Exception e) {
        sendErrorResponse(response, e, uri);
      } finally {
        logRequest("GET", request, response);
      }
    }
Example #10
 @Override
 public double atd_impl(int i) {
   if (_len2 != _len) throw H2O.unimpl();
   if (_ds == null) return at8_impl(i);
   assert _xs == null;
   return _ds[i];
 }
Example #11
 // Received an ACK
 @Override
 public void onAck() {
    // Remove the local cache, but NOT if the value is already on disk
    // (its memory can be reclaimed, and we assume plenty of disk space)
    if (_xval != null) {
      if (_dontCache && !_xval.isPersisted()) H2O.putIfMatch(_xkey, null, _xval);
      _xval.completeRemotePut();
    }
 }
Example #12
 private static void ignoreAndWait(final Exception e, boolean printException) {
   H2O.ignore(e, "Hit HDFS reset problem, retrying...", printException);
   try {
     Thread.sleep(500);
    } catch (InterruptedException ie) {
      Thread.currentThread().interrupt(); // Restore the interrupt flag and fall through
    }
 }
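A minimal sketch of the intended call pattern, with a hypothetical doRead helper standing in for whatever HDFS operation is being retried:

  static byte[] readWithRetry(FileSystem fs, Path p) {
    while (true) {
      try {
        return doRead(fs, p);    // hypothetical: the actual HDFS read
      } catch (IOException e) {
        ignoreAndWait(e, false); // log quietly, sleep 500 ms, then retry
      }
    }
  }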
Example #13
  // If running on self, just submit to queues & do locally
  private RPC<V> handleLocal() {
    assert _dt.getCompleter() == null;
    _dt.setCompleter(
        new H2O.H2OCallback<DTask>() {
          @Override
          public void callback(DTask dt) {
            synchronized (RPC.this) {
              _done = true;
              RPC.this.notifyAll();
            }
            doAllCompletions();
          }

          @Override
          public boolean onExceptionalCompletion(Throwable ex, CountedCompleter dt) {
            synchronized (RPC.this) { // Might be called several times
              if (_done) return true; // Filter down to 1st exceptional completion
              _dt.setException(ex);
              // _done must be set last, before the notify call, because the
              // waiting thread can wake up at any moment independently of notify
              _done = true;
              RPC.this.notifyAll();
            }
            doAllCompletions();
            return true;
          }
        });
    H2O.submitTask(_dt);
    return this;
  }
Example #14
  public final Vec[] vecs() {
    if (_vecs != null) return _vecs;
    // Load all Vec headers; load them all in parallel by spawning F/J tasks.
    final Vec[] vecs = new Vec[_keys.length];
    Futures fs = new Futures();
    for (int i = 0; i < _keys.length; i++) {
      final int ii = i;
      final Key k = _keys[i];
      H2OCountedCompleter t =
          new H2OCountedCompleter() {
            // We need higher priority here as there is a danger of deadlock in
            // case of many calls from MRTask2 at once (e.g. frame with many
            // vectors invokes rollup tasks for all vectors in parallel).  Should
            // probably be done in CPS style in the future
            @Override
            public byte priority() {
              return H2O.MIN_HI_PRIORITY;
            }

            @Override
            public void compute2() {
              vecs[ii] = DKV.get(k).get();
              tryComplete();
            }
          };
      H2O.submitTask(t);
      fs.add(t);
    }
    fs.blockForPending();
    return _vecs = vecs;
  }
Example #15
 // Read up to 'len' bytes of Value. Value should already be persisted to
 // disk.  A racing delete can trigger a failure where we get a null return,
 // but no crash (although one could argue that a racing load&delete is a bug
 // no matter what).
 @Override
 public byte[] load(Value v) {
   long skip = 0;
   Key k = v._key;
   // Convert an arraylet chunk into a long-offset from the base file.
   if (k._kb[0] == Key.ARRAYLET_CHUNK) {
     skip = ValueArray.getChunkOffset(k); // The offset
     k = ValueArray.getArrayKey(k); // From the base file key
   }
   if (k._kb[0] == Key.DVEC) {
     skip = water.fvec.NFSFileVec.chunkOffset(k); // The offset
   }
   try {
     FileInputStream s = null;
     try {
       s = new FileInputStream(getFileForKey(k));
       FileChannel fc = s.getChannel();
       fc.position(skip);
       AutoBuffer ab = new AutoBuffer(fc, true, Value.NFS);
       byte[] b = ab.getA1(v._max);
       ab.close();
       assert v.isPersisted();
       return b;
     } finally {
       if (s != null) s.close();
     }
   } catch (IOException e) { // Broken disk / short-file???
     H2O.ignore(e);
     return null;
   }
 }
Example #16
    @Override
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
        throws IOException, ServletException {
      String uri = getDecodedUri(request);
      try {
        Pattern p = Pattern.compile(".*NodePersistentStorage.bin/([^/]+)/([^/]+)");
        Matcher m = p.matcher(uri);
        boolean b = m.matches();
        if (!b) {
          setResponseStatus(response, HttpServletResponse.SC_BAD_REQUEST);
          response.getWriter().write("Improperly formatted URI");
          return;
        }

        String categoryName = m.group(1);
        String keyName = m.group(2);

        InputStream is = extractPartInputStream(request, response);
        if (is == null) {
          return;
        }

        H2O.getNPS().put(categoryName, keyName, is);
        long length = H2O.getNPS().get_length(categoryName, keyName);
        String responsePayload =
            "{ \"category\" : \"" + categoryName + "\", "
                + "\"name\" : \"" + keyName + "\", "
                + "\"total_bytes\" : " + length + " }\n";
        response.setContentType("application/json");
        response.getWriter().write(responsePayload);
      } catch (Exception e) {
        sendErrorResponse(response, e, uri);
      } finally {
        logRequest("POST", request, response);
      }
    }
Example #17
 public void handle(
     String target,
     Request baseRequest,
     HttpServletRequest request,
     HttpServletResponse response)
     throws IOException, ServletException {
   H2O.getJetty().handle1(target, baseRequest, request, response);
 }
Example #18
 protected void testExecFail(String expr, int errorPos) {
   DKV.write_barrier();
   int keys = H2O.store_size();
   try {
     int i = UNIQUE.getAndIncrement();
      System.err.println("result" + Integer.toString(i) + ": " + expr);
      Key key = Exec.exec(expr, "result" + Integer.toString(i));
     UKV.remove(key);
     assertTrue("An exception should have been thrown.", false);
   } catch (ParserException e) {
     assertTrue(false);
   } catch (EvaluationException e) {
     if (errorPos != -1) assertEquals(errorPos, e._pos);
   }
   DKV.write_barrier();
   assertEquals("Keys were not properly deleted for expression " + expr, keys, H2O.store_size());
 }
Example #19
  protected ModelMetricsListSchemaV3 schema(int version) {
   switch (version) {
     case 3:
       return new ModelMetricsListSchemaV3();
     default:
       throw H2O.fail("Bad version for ModelMetrics schema: " + version);
   }
 }
Example #20
 /**
   * On-the-fly version for varimp. After generating a new tree, its tree votes are collected on
   * shuffled OOB rows and variable importance is recomputed.
  *
  * <p>The <a
  * href="http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp">page</a> says:
  * <cite> "In every tree grown in the forest, put down the oob cases and count the number of votes
  * cast for the correct class. Now randomly permute the values of variable m in the oob cases and
  * put these cases down the tree. Subtract the number of votes for the correct class in the
  * variable-m-permuted oob data from the number of votes for the correct class in the untouched
  * oob data. The average of this number over all trees in the forest is the raw importance score
  * for variable m." </cite>
  */
 @Override
 protected VarImp doVarImpCalc(
     final DRFModel model, DTree[] ktrees, final int tid, final Frame fTrain, boolean scale) {
   // Check if we have already serialized 'ktrees'-trees in the model
   assert model.ntrees() - 1 == tid
       : "Cannot compute DRF varimp since 'ktrees' are not serialized in the model! tid=" + tid;
   assert _treeMeasuresOnOOB.npredictors() - 1 == tid
       : "Tree votes over OOB rows for this tree (var ktrees) were not found!";
   // Compute tree votes over shuffled data
    final CompressedTree[ /*nclass*/] theTree =
        model.ctree(tid); // Get the last tree; FIXME: we should pass only keys
   final int nclasses = model.nclasses();
   Futures fs = new Futures();
   for (int var = 0; var < _ncols; var++) {
     final int variable = var;
     H2OCountedCompleter task4var =
         classification
             ? new H2OCountedCompleter() {
               @Override
               public void compute2() {
                 // Compute this tree votes over all data over given variable
                 TreeVotes cd =
                     TreeMeasuresCollector.collectVotes(
                         theTree, nclasses, fTrain, _ncols, sample_rate, variable);
                 assert cd.npredictors() == 1;
                 asVotes(_treeMeasuresOnSOOB[variable]).append(cd);
                 tryComplete();
               }
             }
             : /* regression */ new H2OCountedCompleter() {
               @Override
               public void compute2() {
                 // Compute this tree votes over all data over given variable
                 TreeSSE cd =
                     TreeMeasuresCollector.collectSSE(
                         theTree, nclasses, fTrain, _ncols, sample_rate, variable);
                 assert cd.npredictors() == 1;
                 asSSE(_treeMeasuresOnSOOB[variable]).append(cd);
                 tryComplete();
               }
             };
     H2O.submitTask(task4var); // Fork computation
     fs.add(task4var);
   }
   fs.blockForPending(); // Wait for results
   // Compute varimp for individual features (_ncols)
   final float[] varimp = new float[_ncols]; // output variable importance
   final float[] varimpSD = new float[_ncols]; // output variable importance sd
   for (int var = 0; var < _ncols; var++) {
     double[ /*2*/] imp =
         classification
             ? asVotes(_treeMeasuresOnSOOB[var]).imp(asVotes(_treeMeasuresOnOOB))
             : asSSE(_treeMeasuresOnSOOB[var]).imp(asSSE(_treeMeasuresOnOOB));
     varimp[var] = (float) imp[0];
     varimpSD[var] = (float) imp[1];
   }
   return new VarImp.VarImpMDA(varimp, varimpSD, model.ntrees());
 }
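The quoted recipe reduces to a small amount of arithmetic. A schematic, hypothetical helper (the real code aggregates TreeVotes/TreeSSE objects rather than raw counts): the raw importance of variable m is the per-tree drop in correct OOB votes after permuting column m, averaged over all trees.

  static float rawImportance(int[] correctOOB, int[] correctPermuted) {
    assert correctOOB.length == correctPermuted.length; // one entry per tree
    double sum = 0;
    for (int t = 0; t < correctOOB.length; t++)
      sum += correctOOB[t] - correctPermuted[t]; // vote drop caused by permutation
    return (float) (sum / correctOOB.length);    // average over all trees
  }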
Example #21
 // Compute a compressed integer buffer
 private byte[] bufX(long bias, int scale, int off, int log) {
   if (_len2 != _len) cancel_sparse();
   byte[] bs = new byte[(_len2 << log) + off];
   for (int i = 0; i < _len; i++) {
     if (isNA(i)) {
       switch (log) {
         case 0:
           bs[i + off] = (byte) (C1Chunk._NA);
           break;
         case 1:
           UDP.set2(bs, (i << 1) + off, (short) C2Chunk._NA);
           break;
         case 2:
           UDP.set4(bs, (i << 2) + off, (int) C4Chunk._NA);
           break;
         case 3:
           UDP.set8(bs, (i << 3) + off, C8Chunk._NA);
           break;
         default:
           H2O.fail();
       }
     } else {
       int x = (_xs[i] == Integer.MIN_VALUE + 1 ? 0 : _xs[i]) - scale;
       long le = x >= 0 ? _ls[i] * DParseTask.pow10i(x) : _ls[i] / DParseTask.pow10i(-x);
       le -= bias;
       switch (log) {
         case 0:
           bs[i + off] = (byte) le;
           break;
         case 1:
           UDP.set2(bs, (i << 1) + off, (short) le);
           break;
         case 2:
           UDP.set4(bs, (i << 2) + off, (int) le);
           break;
         case 3:
           UDP.set8(bs, (i << 3) + off, le);
           break;
         default:
           H2O.fail();
       }
     }
   }
   return bs;
 }
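For reference, the packing inverts as value = (stored + bias) * 10^scale, after checking the per-width NA sentinel. A minimal sketch of the 1-byte case, assuming C1Chunk._NA is the unsigned-byte sentinel written above:

  static double decode1(byte b, long bias, int scale) {
    int u = b & 0xFF;                                 // stored bytes are unsigned
    if (u == (C1Chunk._NA & 0xFF)) return Double.NaN; // NA sentinel
    return (u + bias) * Math.pow(10, scale);          // undo the bias, reapply the scale
  }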
Example #22
  // --------------------------------------------------------------------------
  // Build an entire layer of all K trees
  protected DHistogram[][][] buildLayer(
      final Frame fr,
      final int nbins,
      int nbins_cats,
      final DTree ktrees[],
      final int leafs[],
      final DHistogram hcs[][][],
      boolean subset,
      boolean build_tree_one_node) {
    // Build K trees, one per class.

    // Build up the next-generation tree splits from the current histograms.
    // Nearly all leaves will split one more level.  This loop nest is
    //           O( #active_splits * #bins * #ncols )
    // but is NOT over all the data.
    ScoreBuildOneTree sb1ts[] = new ScoreBuildOneTree[_nclass];
    Vec vecs[] = fr.vecs();
    for (int k = 0; k < _nclass; k++) {
      final DTree tree = ktrees[k]; // Tree for class K
      if (tree == null) continue;
      // Build a frame with just a single tree (& work & nid) columns, so the
      // nested MRTask ScoreBuildHistogram in ScoreBuildOneTree does not try
      // to close other tree's Vecs when run in parallel.
      Frame fr2 = new Frame(Arrays.copyOf(fr._names, _ncols + 1), Arrays.copyOf(vecs, _ncols + 1));
      fr2.add(fr._names[idx_tree(k)], vecs[idx_tree(k)]);
      fr2.add(fr._names[idx_work(k)], vecs[idx_work(k)]);
      fr2.add(fr._names[idx_nids(k)], vecs[idx_nids(k)]);
      if (idx_weight() >= 0) fr2.add(fr._names[idx_weight()], vecs[idx_weight()]);
      // Start building one of the K trees in parallel
      H2O.submitTask(
          sb1ts[k] =
              new ScoreBuildOneTree(
                  this,
                  k,
                  nbins,
                  nbins_cats,
                  tree,
                  leafs,
                  hcs,
                  fr2,
                  subset,
                  build_tree_one_node,
                  _improvPerVar,
                  _model._parms._distribution));
    }
    // Block for all K trees to complete.
    boolean did_split = false;
    for (int k = 0; k < _nclass; k++) {
      final DTree tree = ktrees[k]; // Tree for class K
      if (tree == null) continue;
      sb1ts[k].join();
      if (sb1ts[k]._did_split) did_split = true;
    }
    // The layer is done.
    return did_split ? hcs : null;
  }
Example #23
  @Override
  public void compute2() {
    // Lock all possible data
    dataset.read_lock(jobKey);
    // Create a template vector for each segment
    final Vec[][] templates = makeTemplates(dataset, ratios);
    final int nsplits = templates.length;
    assert nsplits == ratios.length + 1 : "Unexpected number of split templates!";
    // Launch number of distributed FJ for each split part
    final Vec[] datasetVecs = dataset.vecs();
    splits = new Frame[nsplits];
    for (int s = 0; s < nsplits; s++) {
      Frame split = new Frame(destKeys[s], dataset.names(), templates[s]);
      split.delete_and_lock(jobKey);
      splits[s] = split;
    }
    setPendingCount(1);
    H2O.submitTask(
        new H2OCountedCompleter(FrameSplitter.this) {
          @Override
          public void compute2() {
            setPendingCount(nsplits);
            for (int s = 0; s < nsplits; s++) {
              new FrameSplitTask(
                      new H2OCountedCompleter(this) { // Completer for this task
                        @Override
                        public void compute2() {}

                        @Override
                        public boolean onExceptionalCompletion(
                            Throwable ex, CountedCompleter caller) {
                          // Synchronized on the outer FrameSplitter since it
                          // can be accessed from different workers
                          synchronized (FrameSplitter.this) {
                            workersExceptions =
                                workersExceptions != null
                                    ? Arrays.copyOf(workersExceptions, workersExceptions.length + 1)
                                    : new Throwable[1];
                            workersExceptions[workersExceptions.length - 1] = ex;
                          }
                          tryComplete(); // We handled the exception, so proceed with normal completion
                          return false;
                        }
                      },
                      datasetVecs,
                      ratios,
                      s)
                  .asyncExec(splits[s]);
            }
            tryComplete(); // complete the computation of nsplits-tasks
          }
        });
    tryComplete(); // complete the computation of thrown tasks
  }
Example #24
 @Override
 public void dinvoke(H2ONode sender) {
   assert _key.home()
       || _val == null; // Only PUT to home for keys, or remote invalidation from home
   Paxos.lockCloud();
   // Initialize Value for having a single known replica (the sender)
   if (_val != null) _val.initReplicaHome(sender, _key);
   // Spin, until we update something.
   Value old = H2O.raw_get(_key); // Raw-get: do not lazy-manifest if overwriting
   while (H2O.putIfMatch(_key, _val, old) != old)
     old = H2O.raw_get(_key); // Repeat until we update something.
   // Invalidate remote caches.  Block, so that all invalidates are done
   // before we return to the remote caller.
   if (_key.home() && old != null) old.lockAndInvalidate(sender, new Futures()).blockForPending();
   // No return result
   _key = null;
   _val = null;
   tryComplete();
 }
Example #25
 ParseWriter streamParse(final InputStream is, final ParseWriter dout) throws IOException {
   if (!_setup._parse_type._parallelParseSupported) throw H2O.unimpl();
   StreamData din = new StreamData(is);
   int cidx = 0;
    // FIXME: leaving _jobKey == null until sampling is done; this means entire
    // FIXME: zip files are parsed for parseSetup
   while (is.available() > 0
       && (_jobKey == null || !((Job) DKV.getGet(_jobKey)).isCancelledOrCrashed()))
     parseChunk(cidx++, din, dout);
   parseChunk(cidx, din, dout); // Parse the remaining partial 32K buffer
   return dout;
 }
Example #26
 @Override
 boolean setNA_impl(int i) {
   if (isNA(i)) return true;
   if (_len2 != _len) throw H2O.unimpl();
   if (_ls != null) {
     _ls[i] = 0;
     _xs[i] = Integer.MIN_VALUE;
   }
   if (_ds != null) {
     _ds[i] = Double.NaN;
   }
   return true;
 }
Example #27
 public boolean isSigmaScaled() {
   switch (this) {
     case NONE:
     case DEMEAN:
     case NORMALIZE:
       return false;
     case STANDARDIZE:
     case DESCALE:
       return true;
     default:
       throw H2O.unimpl();
   }
 }
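A brief usage note tying this back to the setTransform examples above (assuming the TransformType constants shown there): the predicate reports whether a mode's normMul is derived from sigma.

  assert TransformType.STANDARDIZE.isSigmaScaled(); // divides by sigma
  assert !TransformType.NORMALIZE.isSigmaScaled();  // divides by the range instead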
Example #28
  // Make new Keys.  Optimistically attempt interning, but no guarantee.
  public static Key make(byte[] kb, byte rf) {
    if (rf == -1) throw new IllegalArgumentException();
    Key key = new Key(kb);
    Key key2 = H2O.getk(key); // Get the interned version, if any
    if (key2 != null) return key2; // There is one! Return it instead

    // Set the cache with desired replication factor, and a fake cloud index
    H2O cloud = H2O.CLOUD; // Read once
    key._cache = build_cache(cloud._idx - 1, 0, 0, rf);
    key.cloud_info(cloud); // Now compute & cache the real data
    return key;
  }
Example #29
  // *Desired* distribution function on keys & replication factor. Replica #0
  // is the master, replica #1, 2, 3, etc represent additional desired
  // replication nodes. Note that this function is just the distribution
  // function - it does not DO any replication, nor does it dictate any policy
  // on how fast replication occurs. Returns -1 if the desired replica
  // is nonsense, e.g. asking for replica #3 in a 2-Node system.
  int D(int repl) {
    int hsz = H2O.CLOUD.size();

    // See if this is a specifically homed Key
    if (!user_allowed() && repl < _kb[1]) { // Asking for a replica# from the homed list?
      assert _kb[0] != Key.DVEC;
      H2ONode h2o = H2ONode.intern(_kb, 2 + repl * (4 + 2 /*serialized bytesize of H2OKey*/));
      // Reverse the home to the index
      int idx = h2o.index();
      if (idx >= 0) return idx;
      // Else homed to a node which is no longer in the cloud!
      // Fall back to the normal home mode
    }

    // Distribution of Fluid Vectors is a special case.
    // Fluid Vectors are grouped into vector groups, each of which must have
    // the same distribution of chunks so that MRTask2 run over group of
    // vectors will keep data-locality.  The fluid vecs from the same group
    // share the same key pattern + each has 4 bytes identifying particular
    // vector in the group.  Since we need the same chunks end up on the same
    // node in the group, we need to skip the 4 bytes containing vec# from the
    // hash.  Apart from that, we keep the previous mode of operation, so that
    // ByteVec would have first 64MB distributed around cloud randomly and then
    // go round-robin in 64MB chunks.
    if (_kb[0] == DVEC) {
      // Homed Chunk?
      if (_kb[1] != -1) throw H2O.unimpl();
      // For round-robin on Chunks in the following pattern:
      // 1 Chunk-per-node, until all nodes have 1 chunk (max parallelism).
      // Then 2 chunks-per-node, once around, then 4, then 8, then 16.
      // Getting several chunks-in-a-row on a single Node means that stencil
      // calculations that step off the end of one chunk into the next won't
      // force a chunk local - replicating the data.  If all chunks round robin
      // exactly, then any stencil calc will double the cached volume of data
      // (every node will have its own chunk, plus a cached next-chunk).
      // Above 16-chunks-in-a-row we hit diminishing returns.
      int cidx = UnsafeUtils.get4(_kb, 1 + 1 + 4); // Chunk index
      int x = cidx / hsz; // Multiples of cluster size
      // 0 -> 1st trip around the cluster;            nidx= (cidx- 0*hsz)>>0
      // 1,2 -> 2nd & 3rd trip; allocate in pairs:    nidx= (cidx- 1*hsz)>>1
      // 3,4,5,6 -> next 4 rounds; allocate in quads: nidx= (cidx- 3*hsz)>>2
      // 7-14 -> next 8 rounds in octets:             nidx= (cidx- 7*hsz)>>3
      // 15+ -> remaining rounds in groups of 16:     nidx= (cidx-15*hsz)>>4
      int z = x == 0 ? 0 : (x <= 2 ? 1 : (x <= 6 ? 2 : (x <= 14 ? 3 : 4)));
      int nidx = (cidx - ((1 << z) - 1) * hsz) >> z;
      return ((nidx + repl) & 0x7FFFFFFF) % hsz;
    }

    // Easy Cheesy Stupid:
    return ((_hash + repl) & 0x7FFFFFFF) % hsz;
  }
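A worked instance of the DVEC schedule, pulled into a hypothetical standalone helper (hsz is the cloud size): with hsz = 4 and cidx = 9, x = 9/4 = 2, so z = 1 (the second and third trips allocate in pairs), nidx = (9 - 1*4) >> 1 = 2, and replica 0 lands on node (2 + 0) % 4 = 2.

  static int nodeFor(int cidx, int repl, int hsz) {
    int x = cidx / hsz; // completed trips around the cluster
    int z = x == 0 ? 0 : (x <= 2 ? 1 : (x <= 6 ? 2 : (x <= 14 ? 3 : 4)));
    int nidx = (cidx - ((1 << z) - 1) * hsz) >> z;
    return ((nidx + repl) & 0x7FFFFFFF) % hsz;
  }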
Example #30
 @Override
 boolean set_impl(int i, double d) {
   if (_ls != null) { // Flip to using doubles
     if (_len2 != _len) throw H2O.unimpl();
     double ds[] = MemoryManager.malloc8d(_len);
     for (int j = 0; j < _len; j++)
       ds[j] = (isNA(j) || isEnum(j)) ? Double.NaN : _ls[j] * Math.pow(10, _xs[j]);
     _ds = ds;
     _ls = null;
     _xs = null;
   }
   _ds[i] = d;
   return true;
 }
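A concrete instance of the widening in the loop above: under the decimal encoding, 3.14 is held as mantissa 314 with exponent -2, and the flip materializes it as 314 * 10^-2 before plain double stores take over.

  long ls = 314; int xs = -2;             // decimal encoding of 3.14
  double widened = ls * Math.pow(10, xs); // 3.14, as computed in the loop above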