private static void addFolder2( FileSystem fs, Path p, ArrayList<String> keys, ArrayList<String> failed) { try { if (fs == null) return; Futures futures = new Futures(); for (FileStatus file : fs.listStatus(p)) { Path pfs = file.getPath(); if (file.isDir()) { addFolder2(fs, pfs, keys, failed); } else { long size = file.getLen(); Key res; if (pfs.getName().endsWith(Extensions.JSON)) { throw H2O.unimpl(); } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file? throw H2O.unimpl(); } else { Key k = null; keys.add((k = HdfsFileVec.make(file, futures)).toString()); Log.info("PersistHdfs: DKV.put(" + k + ")"); } } } } catch (Exception e) { Log.err(e); failed.add(p.toString()); } }
// Row-slice MRTask body: copy the rows selected by _rows (1-based indices;
// negative values mean exclusion) for the columns in _cols from the source
// chunks into the destination NewChunks.  When _rows is null the whole
// chunk [0,rlen) is copied in one pass.
@Override
public void map(Chunk chks[], NewChunk nchks[]) {
  long rstart = chks[0]._start; // global row index of this chunk's first row
  int rlen = chks[0]._len; // Total row count
  int rx = 0; // Which row to in/ex-clude
  int rlo = 0; // Lo/Hi for this block of rows
  int rhi = rlen;
  while (true) { // Still got rows to include?
    if (_rows != null) { // Got a row selector?
      if (rx >= _rows.length) break; // All done with row selections
      long r = _rows[rx++] - 1; // Next row selector (convert to 0-based)
      if (r < 0) { // Row exclusion
        // Exclusion lists must be descending here; ascending is unimplemented.
        if (rx > 0 && _rows[rx - 1] < _rows[rx]) throw H2O.unimpl();
        long er = Math.abs(r) - 2; // 0-based row just before the excluded one
        if (er < rstart) continue;
        // scoop up all of the rows before the first exclusion
        if (rx == 1 && ((int) (er + 1 - rstart)) > 0 && _ex) {
          rlo = (int) rstart;
          rhi = (int) (er - rstart);
          _ex = false;
          rx--; // re-visit this selector on the next iteration
        } else {
          rlo = (int) (er + 1 - rstart);
          // TODO: handle jumbled row indices ( e.g. -c(1,5,3) )
          while (rx < _rows.length && (_rows[rx] + 1 == _rows[rx - 1] && rlo < rlen)) {
            if (rx < _rows.length - 1 && _rows[rx] < _rows[rx + 1]) throw H2O.unimpl();
            rx++;
            rlo++; // Exclude consecutive rows
          }
          rhi = rx >= _rows.length ? rlen : (int) Math.abs(_rows[rx] - 1) - 2;
          if (rx < _rows.length - 1 && _rows[rx] < _rows[rx + 1]) throw H2O.unimpl();
        }
      } else { // Positive row list?
        if (r < rstart) continue; // selector falls before this chunk
        rlo = (int) (r - rstart);
        rhi = rlo + 1; // Stop at the next row
        while (rx < _rows.length && (_rows[rx] - 1 - rstart) == rhi && rhi < rlen) {
          rx++;
          rhi++; // Grab sequential rows
        }
      }
    }
    // Process this next set of rows [rlo,rhi)
    // For all cols in the new set
    for (int i = 0; i < _cols.length; i++) {
      Chunk oc = chks[_cols[i]];
      NewChunk nc = nchks[i];
      if (oc._vec.isInt()) { // Slice on integer columns
        for (int j = rlo; j < rhi; j++)
          if (oc.isNA0(j)) nc.addNA();
          else nc.addNum(oc.at80(j), 0);
      } else { // Slice on double columns
        for (int j = rlo; j < rhi; j++) nc.addNum(oc.at0(j));
      }
    }
    rlo = rhi;
    if (_rows == null) break; // whole-chunk copy is a single pass
  }
}
// Set & At on NewChunks are weird: only used after inflating some other // chunk. At this point the NewChunk is full size, no more appends allowed, // and the xs exponent array should be only full of zeros. Accesses must be // in-range and refer to the inflated values of the original Chunk. @Override boolean set_impl(int i, long l) { if (_ds != null) throw H2O.unimpl(); if (_len2 != _len) throw H2O.unimpl(); _ls[i] = l; _xs[i] = 0; return true; }
// Test-harness helper: boot an in-process H2O node exactly once per JVM,
// then block until the cloud reaches at least 'x' members.  Also snapshots
// the initial store size so tests can later check for leaked keys.
public static void stall_till_cloudsize(String[] args, int x) {
  if (!_stall_called_before) {
    // First call only: start the node and register the REST API layer.
    H2O.main(args);
    H2O.registerRestApis(System.getProperty("user.dir"));
    _stall_called_before = true;
  }
  H2O.waitForCloudSize(x, 30000); // wait up to 30s for the cloud to form
  _initial_keycnt = H2O.store_size(); // baseline for key-leak checks
}
// cbind: column-bind every argument (frames, scalars) into one new Frame.
// All frames must agree on row count; scalars are expanded to a constant
// column of that length.  Functions and strings are unimplemented.
@Override
Val apply(Env env, Env.StackHelp stk, AST asts[]) {
  // Compute the variable args.  Find the common row count
  Val vals[] = new Val[asts.length];
  Vec vec = null; // first non-empty vec seen; defines the target row count
  for (int i = 1; i < asts.length; i++) {
    vals[i] = stk.track(asts[i].exec(env));
    if (vals[i].isFrame()) {
      Vec anyvec = vals[i].getFrame().anyVec();
      if (anyvec == null) continue; // Ignore the empty frame
      if (vec == null) vec = anyvec;
      else if (vec.length() != anyvec.length())
        throw new IllegalArgumentException(
            "cbind frames must have all the same rows, found "
                + vec.length()
                + " and "
                + anyvec.length()
                + " rows.");
    }
  }
  boolean clean = false; // true when we created a temporary template vec
  if (vec == null) {
    vec = Vec.makeZero(1);
    clean = true;
  } // Default to length 1
  // Populate the new Frame
  Frame fr = new Frame();
  for (int i = 1; i < asts.length; i++) {
    switch (vals[i].type()) {
      case Val.FRM:
        fr.add(fr.makeCompatible(vals[i].getFrame()));
        break;
      case Val.FUN:
        throw H2O.unimpl();
      case Val.STR:
        throw H2O.unimpl();
      case Val.NUM:
        // Auto-expand scalars to fill every row
        double d = vals[i].getNum();
        fr.add(Double.toString(d), vec.makeCon(d));
        break;
      default:
        throw H2O.unimpl();
    }
  }
  if (clean) vec.remove(); // drop the temporary length template
  return new ValFrame(fr);
}
// Fill in the per-column normalization terms implied by transform 't' for
// the n vecs starting at vecStart: value' = (value - normSub[i]) * normMul[i].
private void setTransform(
    TransformType t, double[] normMul, double[] normSub, int vecStart, int n) {
  for (int col = 0; col < n; ++col) {
    Vec v = _adaptedFrame.vec(vecStart + col);
    double mul;
    double sub;
    switch (t) {
      case STANDARDIZE:
        // Unit variance, zero mean; constant columns get multiplier 1.
        mul = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
        sub = v.mean();
        break;
      case NORMALIZE:
        // Scale range to 1, centered on the mean.
        mul = (v.max() - v.min() > 0) ? 1.0 / (v.max() - v.min()) : 1.0;
        sub = v.mean();
        break;
      case DEMEAN:
        mul = 1;
        sub = v.mean();
        break;
      case DESCALE:
        // Unit variance only; leave the mean alone.
        mul = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
        sub = 0;
        break;
      default:
        throw H2O.unimpl();
    }
    normMul[col] = mul;
    normSub[col] = sub;
    assert !Double.isNaN(normMul[col]);
    assert !Double.isNaN(normSub[col]);
  }
}
private void setTransform( TransformType t, double[] normMul, double[] normSub, int vecStart, int n) { int idx = 0; // idx!=i when interactions are in play, otherwise, it's just 'i' for (int i = 0; i < n; ++i) { Vec v = _adaptedFrame.vec(vecStart + i); boolean isIWV = isInteractionVec(vecStart + i); switch (t) { case STANDARDIZE: normMul[idx] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0; if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1; normSub[idx] = v.mean(); break; case NORMALIZE: normMul[idx] = (v.max() - v.min() > 0) ? 1.0 / (v.max() - v.min()) : 1.0; if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1; normSub[idx] = v.mean(); break; case DEMEAN: normMul[idx] = 1; if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1; normSub[idx] = v.mean(); break; case DESCALE: normMul[idx] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0; if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1; normSub[idx] = 0; break; default: throw H2O.unimpl(); } assert !Double.isNaN(normMul[idx]); assert !Double.isNaN(normSub[idx]); idx = isIWV ? (idx + nextNumericIdx(i)) : (idx + 1); } }
// ------------------------------------------------------------------------ // Zipped file; no parallel decompression; decompress into local chunks, // parse local chunks; distribute chunks later. ParseWriter streamParseZip(final InputStream is, final StreamParseWriter dout, InputStream bvs) throws IOException { // All output into a fresh pile of NewChunks, one per column if (!_setup._parse_type._parallelParseSupported) throw H2O.unimpl(); StreamData din = new StreamData(is); int cidx = 0; StreamParseWriter nextChunk = dout; int zidx = bvs.read(null, 0, 0); // Back-channel read of chunk index assert zidx == 1; while (is.available() > 0) { int xidx = bvs.read(null, 0, 0); // Back-channel read of chunk index if (xidx > zidx) { // Advanced chunk index of underlying ByteVec stream? zidx = xidx; // Record advancing of chunk nextChunk.close(); // Match output chunks to input zipfile chunks if (dout != nextChunk) { dout.reduce(nextChunk); if (_jobKey != null && ((Job) DKV.getGet(_jobKey)).isCancelledOrCrashed()) break; } nextChunk = nextChunk.nextChunk(); } parseChunk(cidx++, din, nextChunk); } parseChunk(cidx, din, nextChunk); // Parse the remaining partial 32K buffer nextChunk.close(); if (dout != nextChunk) dout.reduce(nextChunk); return dout; }
// GET handler: stream a single NodePersistentStorage value back to the
// client as a file download ("<key>.flow").  URI shape:
//   .../NodePersistentStorage.bin/<category>/<key>
@Override
protected void doGet(HttpServletRequest request, HttpServletResponse response)
    throws IOException, ServletException {
  String uri = getDecodedUri(request);
  try {
    Pattern p = Pattern.compile(".*/NodePersistentStorage.bin/([^/]+)/([^/]+)");
    Matcher m = p.matcher(uri);
    if (!m.matches()) {
      setResponseStatus(response, HttpServletResponse.SC_BAD_REQUEST);
      response.getWriter().write("Improperly formatted URI");
      return;
    }
    String categoryName = m.group(1);
    String keyName = m.group(2);
    NodePersistentStorage nps = H2O.getNPS();
    AtomicLong length = new AtomicLong(); // out-param: value size in bytes
    InputStream is = nps.get(categoryName, keyName, length);
    try {
      // Content-Length is declared as an int; refuse anything larger.
      if (length.get() > (long) Integer.MAX_VALUE) {
        throw new Exception("NPS value size exceeds Integer.MAX_VALUE");
      }
      response.setContentType("application/octet-stream");
      response.setContentLength((int) length.get());
      response.addHeader("Content-Disposition", "attachment; filename=" + keyName + ".flow");
      setResponseStatus(response, HttpServletResponse.SC_OK);
      OutputStream os = response.getOutputStream();
      water.util.FileUtils.copyStream(is, os, 2048);
    } finally {
      is.close(); // BUGFIX: the NPS input stream was previously never closed
    }
  } catch (Exception e) {
    sendErrorResponse(response, e, uri);
  } finally {
    logRequest("GET", request, response);
  }
}
// Read element i as a double.  Only legal once the chunk is fully inflated.
@Override
public double atd_impl(int i) {
  if (_len2 != _len) throw H2O.unimpl();
  if (_ds != null) {
    // Double-backed storage never carries an exponent array.
    assert _xs == null;
    return _ds[i];
  }
  // Integer-backed storage: delegate to the long accessor.
  return at8_impl(i);
}
// Received an ACK @Override public void onAck() { // remove local cache but NOT in case it is already on disk // (ie memory can be reclaimed and we assume we have plenty of disk space) if (_dontCache && !_xval.isPersisted()) H2O.putIfMatch(_xkey, null, _xval); if (_xval != null) _xval.completeRemotePut(); }
// Log-and-back-off helper for transient HDFS failures: report the exception
// (optionally with its stack trace) and pause half a second before the
// caller retries.
private static void ignoreAndWait(final Exception e, boolean printException) {
  H2O.ignore(e, "Hit HDFS reset problem, retrying...", printException);
  try {
    Thread.sleep(500);
  } catch (InterruptedException ie) {
    // BUGFIX: restore the interrupt status instead of silently swallowing
    // it, so callers can still observe cancellation.
    Thread.currentThread().interrupt();
  }
}
// If running on self, just submit to queues & do locally private RPC<V> handleLocal() { assert _dt.getCompleter() == null; _dt.setCompleter( new H2O.H2OCallback<DTask>() { @Override public void callback(DTask dt) { synchronized (RPC.this) { _done = true; RPC.this.notifyAll(); } doAllCompletions(); } @Override public boolean onExceptionalCompletion(Throwable ex, CountedCompleter dt) { synchronized (RPC.this) { // Might be called several times if (_done) return true; // Filter down to 1st exceptional completion _dt.setException(ex); // must be the last set before notify call cause the waiting thread // can wake up at any moment independently on notify _done = true; RPC.this.notifyAll(); } doAllCompletions(); return true; } }); H2O.submitTask(_dt); return this; }
// Return this frame's Vec array, lazily fetching all Vec headers from the
// DKV in parallel on first use.  The result is cached in _vecs.
public final Vec[] vecs() {
  if (_vecs != null) return _vecs; // already materialized
  // Load all Vec headers; load them all in parallel by spawning F/J tasks.
  final Vec[] vecs = new Vec[_keys.length];
  Futures fs = new Futures();
  for (int i = 0; i < _keys.length; i++) {
    final int ii = i;
    final Key k = _keys[i];
    H2OCountedCompleter t =
        new H2OCountedCompleter() {
          // We need higher priority here as there is a danger of deadlock in
          // case of many calls from MRTask2 at once (e.g. frame with many
          // vectors invokes rollup tasks for all vectors in parallel). Should
          // probably be done in CPS style in the future
          @Override
          public byte priority() {
            return H2O.MIN_HI_PRIORITY;
          }

          @Override
          public void compute2() {
            vecs[ii] = DKV.get(k).get(); // fetch one Vec header
            tryComplete();
          }
        };
    H2O.submitTask(t);
    fs.add(t);
  }
  fs.blockForPending(); // wait for every fetch to finish
  return _vecs = vecs;
}
// Read up to 'len' bytes of Value. Value should already be persisted to // disk. A racing delete can trigger a failure where we get a null return, // but no crash (although one could argue that a racing load&delete is a bug // no matter what). @Override public byte[] load(Value v) { long skip = 0; Key k = v._key; // Convert an arraylet chunk into a long-offset from the base file. if (k._kb[0] == Key.ARRAYLET_CHUNK) { skip = ValueArray.getChunkOffset(k); // The offset k = ValueArray.getArrayKey(k); // From the base file key } if (k._kb[0] == Key.DVEC) { skip = water.fvec.NFSFileVec.chunkOffset(k); // The offset } try { FileInputStream s = null; try { s = new FileInputStream(getFileForKey(k)); FileChannel fc = s.getChannel(); fc.position(skip); AutoBuffer ab = new AutoBuffer(fc, true, Value.NFS); byte[] b = ab.getA1(v._max); ab.close(); assert v.isPersisted(); return b; } finally { if (s != null) s.close(); } } catch (IOException e) { // Broken disk / short-file??? H2O.ignore(e); return null; } }
// POST handler: store an uploaded multipart body as a NodePersistentStorage
// value and answer with a small JSON receipt.  URI shape:
//   ...NodePersistentStorage.bin/<category>/<key>
@Override
protected void doPost(HttpServletRequest request, HttpServletResponse response)
    throws IOException, ServletException {
  String uri = getDecodedUri(request);
  try {
    Pattern p = Pattern.compile(".*NodePersistentStorage.bin/([^/]+)/([^/]+)");
    Matcher m = p.matcher(uri);
    if (!m.matches()) {
      setResponseStatus(response, HttpServletResponse.SC_BAD_REQUEST);
      response.getWriter().write("Improperly formatted URI");
      return;
    }
    String categoryName = m.group(1);
    String keyName = m.group(2);
    InputStream is = extractPartInputStream(request, response);
    // extractPartInputStream already wrote an error response on failure.
    if (is == null) {
      return;
    }
    try {
      H2O.getNPS().put(categoryName, keyName, is);
    } finally {
      is.close(); // BUGFIX: the part input stream was previously never closed
    }
    long length = H2O.getNPS().get_length(categoryName, keyName);
    String responsePayload =
        "{ "
            + "\"category\" : "
            + "\""
            + categoryName
            + "\", "
            + "\"name\" : "
            + "\""
            + keyName
            + "\", "
            + "\"total_bytes\" : "
            + length
            + " "
            + "}\n";
    response.setContentType("application/json");
    response.getWriter().write(responsePayload);
  } catch (Exception e) {
    sendErrorResponse(response, e, uri);
  } finally {
    logRequest("POST", request, response);
  }
}
// Jetty entry point: delegate every HTTP request straight to the H2O
// server's own handler.
public void handle(
    String target,
    Request baseRequest,
    HttpServletRequest request,
    HttpServletResponse response)
    throws IOException, ServletException {
  H2O.getJetty().handle1(target, baseRequest, request, response);
}
// Evaluate 'expr', asserting that it FAILS with an EvaluationException —
// optionally at position 'errorPos' (pass -1 to skip the position check) —
// and that no keys leak across the evaluation.
protected void testExecFail(String expr, int errorPos) {
  DKV.write_barrier();
  int keys = H2O.store_size(); // baseline for the leak check below
  try {
    int i = UNIQUE.getAndIncrement();
    // Use the static factory instead of the deprecated new Integer(i) boxing
    // constructor, and build the result name once.
    String result = "result" + Integer.toString(i);
    System.err.println(result + ": " + expr);
    Key key = Exec.exec(expr, result);
    UKV.remove(key);
    assertTrue("An exception should have been thrown.", false);
  } catch (ParserException e) {
    assertTrue(false); // wrong failure mode: expected an evaluation error
  } catch (EvaluationException e) {
    if (errorPos != -1) assertEquals(errorPos, e._pos);
  }
  DKV.write_barrier();
  assertEquals("Keys were not properly deleted for expression " + expr, keys, H2O.store_size());
}
// Return the schema object for the requested API version.  Only version 3
// is supported for ModelMetrics.
protected ModelMetricsListSchemaV3 schema(int version) {
  if (version == 3) return new ModelMetricsListSchemaV3();
  throw H2O.fail("Bad version for ModelMetrics schema: " + version);
}
/**
 * On-the-fly version for varimp. After generation a new tree, its tree votes are collected on
 * shuffled OOB rows and variable importance is recomputed.
 *
 * <p>The <a
 * href="http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp">page</a> says:
 * <cite> "In every tree grown in the forest, put down the oob cases and count the number of votes
 * cast for the correct class. Now randomly permute the values of variable m in the oob cases and
 * put these cases down the tree. Subtract the number of votes for the correct class in the
 * variable-m-permuted oob data from the number of votes for the correct class in the untouched
 * oob data. The average of this number over all trees in the forest is the raw importance score
 * for variable m." </cite>
 */
@Override
protected VarImp doVarImpCalc(
    final DRFModel model, DTree[] ktrees, final int tid, final Frame fTrain, boolean scale) {
  // Check if we have already serialized 'ktrees'-trees in the model
  assert model.ntrees() - 1 == tid
      : "Cannot compute DRF varimp since 'ktrees' are not serialized in the model! tid=" + tid;
  assert _treeMeasuresOnOOB.npredictors() - 1 == tid
      : "Tree votes over OOB rows for this tree (var ktrees) were not found!";
  // Compute tree votes over shuffled data
  final CompressedTree[ /*nclass*/] theTree =
      model.ctree(tid); // get the last tree FIXME we should pass only keys
  final int nclasses = model.nclasses();
  Futures fs = new Futures();
  // Fork one measurement task per predictor column; each permutes one
  // variable and re-collects votes (classification) or SSE (regression).
  for (int var = 0; var < _ncols; var++) {
    final int variable = var;
    H2OCountedCompleter task4var =
        classification
            ? new H2OCountedCompleter() {
              @Override
              public void compute2() {
                // Compute this tree votes over all data over given variable
                TreeVotes cd =
                    TreeMeasuresCollector.collectVotes(
                        theTree, nclasses, fTrain, _ncols, sample_rate, variable);
                assert cd.npredictors() == 1;
                asVotes(_treeMeasuresOnSOOB[variable]).append(cd);
                tryComplete();
              }
            }
            : /* regression */ new H2OCountedCompleter() {
              @Override
              public void compute2() {
                // Compute this tree votes over all data over given variable
                TreeSSE cd =
                    TreeMeasuresCollector.collectSSE(
                        theTree, nclasses, fTrain, _ncols, sample_rate, variable);
                assert cd.npredictors() == 1;
                asSSE(_treeMeasuresOnSOOB[variable]).append(cd);
                tryComplete();
              }
            };
    H2O.submitTask(task4var); // Fork computation
    fs.add(task4var);
  }
  fs.blockForPending(); // Wait for results
  // Compute varimp for individual features (_ncols)
  final float[] varimp = new float[_ncols]; // output variable importance
  final float[] varimpSD = new float[_ncols]; // output variable importance sd
  for (int var = 0; var < _ncols; var++) {
    double[ /*2*/] imp =
        classification
            ? asVotes(_treeMeasuresOnSOOB[var]).imp(asVotes(_treeMeasuresOnOOB))
            : asSSE(_treeMeasuresOnSOOB[var]).imp(asSSE(_treeMeasuresOnOOB));
    varimp[var] = (float) imp[0];
    varimpSD[var] = (float) imp[1];
  }
  return new VarImp.VarImpMDA(varimp, varimpSD, model.ntrees());
}
// Compute a compressed integer buffer private byte[] bufX(long bias, int scale, int off, int log) { if (_len2 != _len) cancel_sparse(); byte[] bs = new byte[(_len2 << log) + off]; for (int i = 0; i < _len; i++) { if (isNA(i)) { switch (log) { case 0: bs[i + off] = (byte) (C1Chunk._NA); break; case 1: UDP.set2(bs, (i << 1) + off, (short) C2Chunk._NA); break; case 2: UDP.set4(bs, (i << 2) + off, (int) C4Chunk._NA); break; case 3: UDP.set8(bs, (i << 3) + off, C8Chunk._NA); break; default: H2O.fail(); } } else { int x = (_xs[i] == Integer.MIN_VALUE + 1 ? 0 : _xs[i]) - scale; long le = x >= 0 ? _ls[i] * DParseTask.pow10i(x) : _ls[i] / DParseTask.pow10i(-x); le -= bias; switch (log) { case 0: bs[i + off] = (byte) le; break; case 1: UDP.set2(bs, (i << 1) + off, (short) le); break; case 2: UDP.set4(bs, (i << 2) + off, (int) le); break; case 3: UDP.set8(bs, (i << 3) + off, le); break; default: H2O.fail(); } } } return bs; }
// -------------------------------------------------------------------------- // Build an entire layer of all K trees protected DHistogram[][][] buildLayer( final Frame fr, final int nbins, int nbins_cats, final DTree ktrees[], final int leafs[], final DHistogram hcs[][][], boolean subset, boolean build_tree_one_node) { // Build K trees, one per class. // Build up the next-generation tree splits from the current histograms. // Nearly all leaves will split one more level. This loop nest is // O( #active_splits * #bins * #ncols ) // but is NOT over all the data. ScoreBuildOneTree sb1ts[] = new ScoreBuildOneTree[_nclass]; Vec vecs[] = fr.vecs(); for (int k = 0; k < _nclass; k++) { final DTree tree = ktrees[k]; // Tree for class K if (tree == null) continue; // Build a frame with just a single tree (& work & nid) columns, so the // nested MRTask ScoreBuildHistogram in ScoreBuildOneTree does not try // to close other tree's Vecs when run in parallel. Frame fr2 = new Frame(Arrays.copyOf(fr._names, _ncols + 1), Arrays.copyOf(vecs, _ncols + 1)); fr2.add(fr._names[idx_tree(k)], vecs[idx_tree(k)]); fr2.add(fr._names[idx_work(k)], vecs[idx_work(k)]); fr2.add(fr._names[idx_nids(k)], vecs[idx_nids(k)]); if (idx_weight() >= 0) fr2.add(fr._names[idx_weight()], vecs[idx_weight()]); // Start building one of the K trees in parallel H2O.submitTask( sb1ts[k] = new ScoreBuildOneTree( this, k, nbins, nbins_cats, tree, leafs, hcs, fr2, subset, build_tree_one_node, _improvPerVar, _model._parms._distribution)); } // Block for all K trees to complete. boolean did_split = false; for (int k = 0; k < _nclass; k++) { final DTree tree = ktrees[k]; // Tree for class K if (tree == null) continue; sb1ts[k].join(); if (sb1ts[k]._did_split) did_split = true; } // The layer is done. return did_split ? hcs : null; }
// Top-level FrameSplitter task: lock the source frame, create one template
// frame per split, then fork one FrameSplitTask per split.  Worker
// exceptions are collected rather than propagated, so all splits finish.
@Override
public void compute2() {
  // Lock all possible data
  dataset.read_lock(jobKey);
  // Create a template vector for each segment
  final Vec[][] templates = makeTemplates(dataset, ratios);
  final int nsplits = templates.length;
  assert nsplits == ratios.length + 1 : "Unexpected number of split templates!";
  // Launch number of distributed FJ for each split part
  final Vec[] datasetVecs = dataset.vecs();
  splits = new Frame[nsplits];
  for (int s = 0; s < nsplits; s++) {
    Frame split = new Frame(destKeys[s], dataset.names(), templates[s]);
    split.delete_and_lock(jobKey); // claim the destination key
    splits[s] = split;
  }
  setPendingCount(1); // one child: the fan-out task below
  H2O.submitTask(
      new H2OCountedCompleter(FrameSplitter.this) {
        @Override
        public void compute2() {
          setPendingCount(nsplits); // one pending slot per split worker
          for (int s = 0; s < nsplits; s++) {
            new FrameSplitTask(
                    new H2OCountedCompleter(this) { // Completer for this task
                      @Override
                      public void compute2() {}

                      @Override
                      public boolean onExceptionalCompletion(
                          Throwable ex, CountedCompleter caller) {
                        synchronized (
                            FrameSplitter
                                .this) { // synchronized on this since can be accessed from
                          // different workers
                          workersExceptions =
                              workersExceptions != null
                                  ? Arrays.copyOf(workersExceptions, workersExceptions.length + 1)
                                  : new Throwable[1];
                          workersExceptions[workersExceptions.length - 1] = ex;
                        }
                        tryComplete(); // we handle the exception so wait perform normal
                        // completion
                        return false;
                      }
                    },
                    datasetVecs,
                    ratios,
                    s)
                .asyncExec(splits[s]);
          }
          tryComplete(); // complete the computation of nsplits-tasks
        }
      });
  tryComplete(); // complete the computation of thrown tasks
}
@Override public void dinvoke(H2ONode sender) { assert _key.home() || _val == null; // Only PUT to home for keys, or remote invalidation from home Paxos.lockCloud(); // Initialize Value for having a single known replica (the sender) if (_val != null) _val.initReplicaHome(sender, _key); // Spin, until we update something. Value old = H2O.raw_get(_key); // Raw-get: do not lazy-manifest if overwriting while (H2O.putIfMatch(_key, _val, old) != old) old = H2O.raw_get(_key); // Repeat until we update something. // Invalidate remote caches. Block, so that all invalidates are done // before we return to the remote caller. if (_key.home() && old != null) old.lockAndInvalidate(sender, new Futures()).blockForPending(); // No return result _key = null; _val = null; tryComplete(); }
// Sequentially parse an entire input stream in 32K chunks into 'dout'.
// Stops early if the owning job (when known) is cancelled or crashed.
ParseWriter streamParse(final InputStream is, final ParseWriter dout) throws IOException {
  if (!_setup._parse_type._parallelParseSupported) throw H2O.unimpl();
  StreamData din = new StreamData(is);
  int cidx = 0;
  // FIXME leaving _jobKey == null until sampling is done, this mean entire zip files
  // FIXME are parsed for parseSetup
  while (is.available() > 0
      && (_jobKey == null || !((Job) DKV.getGet(_jobKey)).isCancelledOrCrashed()))
    parseChunk(cidx++, din, dout);
  parseChunk(cidx, din, dout); // Parse the remaining partial 32K buffer
  return dout;
}
// Mark element i as missing.  Only legal once the chunk is fully inflated.
@Override
boolean setNA_impl(int i) {
  // Already missing: nothing to do.
  if (isNA(i)) return true;
  if (_len2 != _len) throw H2O.unimpl();
  // Record the NA in whichever backing array is live (doubles use NaN,
  // longs use the reserved MIN_VALUE exponent as the missing sentinel).
  if (_ds != null) _ds[i] = Double.NaN;
  if (_ls != null) {
    _ls[i] = 0;
    _xs[i] = Integer.MIN_VALUE;
  }
  return true;
}
// True iff this transform divides values by the column's standard deviation.
public boolean isSigmaScaled() {
  switch (this) {
    case STANDARDIZE:
    case DESCALE:
      return true;
    case NONE:
    case DEMEAN:
    case NORMALIZE:
      return false;
    default:
      throw H2O.unimpl();
  }
}
// Make new Keys. Optimistically attempt interning, but no guarantee. public static Key make(byte[] kb, byte rf) { if (rf == -1) throw new IllegalArgumentException(); Key key = new Key(kb); Key key2 = H2O.getk(key); // Get the interned version, if any if (key2 != null) // There is one! Return it instead return key2; // Set the cache with desired replication factor, and a fake cloud index H2O cloud = H2O.CLOUD; // Read once key._cache = build_cache(cloud._idx - 1, 0, 0, rf); key.cloud_info(cloud); // Now compute & cache the real data return key; }
// *Desired* distribution function on keys & replication factor. Replica #0 // is the master, replica #1, 2, 3, etc represent additional desired // replication nodes. Note that this function is just the distribution // function - it does not DO any replication, nor does it dictate any policy // on how fast replication occurs. Returns -1 if the desired replica // is nonsense, e.g. asking for replica #3 in a 2-Node system. int D(int repl) { int hsz = H2O.CLOUD.size(); // See if this is a specifically homed Key if (!user_allowed() && repl < _kb[1]) { // Asking for a replica# from the homed list? assert _kb[0] != Key.DVEC; H2ONode h2o = H2ONode.intern(_kb, 2 + repl * (4 + 2 /*serialized bytesize of H2OKey*/)); // Reverse the home to the index int idx = h2o.index(); if (idx >= 0) return idx; // Else homed to a node which is no longer in the cloud! // Fall back to the normal home mode } // Distribution of Fluid Vectors is a special case. // Fluid Vectors are grouped into vector groups, each of which must have // the same distribution of chunks so that MRTask2 run over group of // vectors will keep data-locality. The fluid vecs from the same group // share the same key pattern + each has 4 bytes identifying particular // vector in the group. Since we need the same chunks end up on the same // node in the group, we need to skip the 4 bytes containing vec# from the // hash. Apart from that, we keep the previous mode of operation, so that // ByteVec would have first 64MB distributed around cloud randomly and then // go round-robin in 64MB chunks. if (_kb[0] == DVEC) { // Homed Chunk? if (_kb[1] != -1) throw H2O.unimpl(); // For round-robin on Chunks in the following pattern: // 1 Chunk-per-node, until all nodes have 1 chunk (max parallelism). // Then 2 chunks-per-node, once around, then 4, then 8, then 16. 
// Getting several chunks-in-a-row on a single Node means that stencil // calculations that step off the end of one chunk into the next won't // force a chunk local - replicating the data. If all chunks round robin // exactly, then any stencil calc will double the cached volume of data // (every node will have it's own chunk, plus a cached next-chunk). // Above 16-chunks-in-a-row we hit diminishing returns. int cidx = UnsafeUtils.get4(_kb, 1 + 1 + 4); // Chunk index int x = cidx / hsz; // Multiples of cluster size // 0 -> 1st trip around the cluster; nidx= (cidx- 0*hsz)>>0 // 1,2 -> 2nd & 3rd trip; allocate in pairs: nidx= (cidx- 1*hsz)>>1 // 3,4,5,6 -> next 4 rounds; allocate in quads: nidx= (cidx- 3*hsz)>>2 // 7-14 -> next 8 rounds in octets: nidx= (cidx- 7*hsz)>>3 // 15+ -> remaining rounds in groups of 16: nidx= (cidx-15*hsz)>>4 int z = x == 0 ? 0 : (x <= 2 ? 1 : (x <= 6 ? 2 : (x <= 14 ? 3 : 4))); int nidx = (cidx - ((1 << z) - 1) * hsz) >> z; return ((nidx + repl) & 0x7FFFFFFF) % hsz; } // Easy Cheesy Stupid: return ((_hash + repl) & 0x7FFFFFFF) % hsz; }
// Write a double into this NewChunk, flipping the whole chunk from integer
// (long+exponent) storage to double storage on the first double write.
@Override
boolean set_impl(int i, double d) {
  if (_ls != null) { // Flip to using doubles
    if (_len2 != _len) throw H2O.unimpl(); // must be fully inflated first
    double ds[] = MemoryManager.malloc8d(_len);
    for (int j = 0; j < _len; j++)
      // NAs and enum/categorical values have no double representation.
      ds[j] = (isNA(j) || isEnum(j)) ? Double.NaN : _ls[j] * Math.pow(10, _xs[j]);
    _ds = ds;
    _ls = null;
    _xs = null;
  }
  _ds[i] = d;
  return true;
}