@Override
protected Response serve() {
  try {
    // pull everything local
    Log.info("ExportFiles processing (" + path + ")");
    if (DKV.get(src_key) == null)
      throw new IllegalArgumentException(src_key.toString() + " not found.");
    Object value = DKV.get(src_key).get();
    // create a stream to read the entire VA or Frame
    if (!(value instanceof ValueArray) && !(value instanceof Frame))
      throw new UnsupportedOperationException("Can only export Frames or ValueArrays.");
    InputStream csv =
        value instanceof ValueArray
            ? new ValueArray.CsvVAStream((ValueArray) value, null)
            : ((Frame) value).toCSV(true);
    String p2 = path.toLowerCase();
    if (p2.startsWith("hdfs://")) serveHdfs(csv);
    else if (p2.startsWith("s3n://")) serveHdfs(csv);
    else serveLocalDisk(csv);
    return RequestBuilders.Response.done(this);
  } catch (Throwable t) {
    return RequestBuilders.Response.error(t);
  }
}
@Override
public float progress() {
  if (DKV.get(dest()) == null) return 0;
  GLMModel m = DKV.get(dest()).get();
  float progress = (float) m.iteration() / (float) max_iter; // TODO, do something smarter here
  return progress;
}
/**
 * Score a frame with the given model and return just the metrics.
 *
 * <p>NOTE: ModelMetrics are now always being created by model.score. . .
 */
@SuppressWarnings("unused") // called through reflection by RequestServer
public ModelMetricsListSchemaV3 score(int version, ModelMetricsListSchemaV3 s) {
  // parameter checking:
  if (null == s.model) throw new H2OIllegalArgumentException("model", "predict", s.model);
  if (null == DKV.get(s.model.name))
    throw new H2OKeyNotFoundArgumentException("model", "predict", s.model.name);
  if (null == s.frame) throw new H2OIllegalArgumentException("frame", "predict", s.frame);
  if (null == DKV.get(s.frame.name))
    throw new H2OKeyNotFoundArgumentException("frame", "predict", s.frame.name);

  ModelMetricsList parms = s.createAndFillImpl();
  parms
      ._model
      .score(parms._frame, parms._predictions_name)
      .remove(); // throw away predictions, keep metrics as a side effect
  ModelMetricsListSchemaV3 mm = this.fetch(version, s);

  // TODO: for now only binary predictors write an MM object.
  // For the others cons one up here to return the predictions frame.
  if (null == mm) mm = new ModelMetricsListSchemaV3();
  if (null == mm.model_metrics || 0 == mm.model_metrics.length) {
    Log.warn(
        "Score() did not return a ModelMetrics for model: " + s.model + " on frame: " + s.frame);
  }
  return mm;
}
/** Score a frame with the given model and return the metrics AND the prediction frame. */
@SuppressWarnings("unused") // called through reflection by RequestServer
public JobV3 predict2(int version, final ModelMetricsListSchemaV3 s) {
  // parameter checking:
  if (null == s.model) throw new H2OIllegalArgumentException("model", "predict", s.model);
  if (null == DKV.get(s.model.name))
    throw new H2OKeyNotFoundArgumentException("model", "predict", s.model.name);
  if (null == s.frame) throw new H2OIllegalArgumentException("frame", "predict", s.frame);
  if (null == DKV.get(s.frame.name))
    throw new H2OKeyNotFoundArgumentException("frame", "predict", s.frame.name);

  final ModelMetricsList parms = s.createAndFillImpl();

  // predict2 does not return ModelMetrics, so it cannot handle DeepLearning:
  // reconstruction_error (anomaly) or GLRM: reconstruct and archetypes.
  // predict2 can handle DeepLearning: deepfeatures and predict.
  if (s.deep_features_hidden_layer > 0) {
    if (null == parms._predictions_name)
      parms._predictions_name =
          "deep_features"
              + Key.make().toString().substring(0, 5)
              + "_"
              + parms._model._key.toString()
              + "_on_"
              + parms._frame._key.toString();
  } else if (null == parms._predictions_name)
    parms._predictions_name =
        "predictions"
            + Key.make().toString().substring(0, 5)
            + "_"
            + parms._model._key.toString()
            + "_on_"
            + parms._frame._key.toString();

  final Job<Frame> j =
      new Job(Key.make(parms._predictions_name), Frame.class.getName(), "prediction");
  H2O.H2OCountedCompleter work =
      new H2O.H2OCountedCompleter() {
        @Override
        public void compute2() {
          if (s.deep_features_hidden_layer < 0) {
            parms._model.score(parms._frame, parms._predictions_name, j);
          } else {
            Frame predictions =
                ((Model.DeepFeatures) parms._model)
                    .scoreDeepFeatures(parms._frame, s.deep_features_hidden_layer, j);
            predictions =
                new Frame(
                    Key.make(parms._predictions_name), predictions.names(), predictions.vecs());
            DKV.put(predictions._key, predictions);
          }
          tryComplete();
        }
      };
  j.start(work, parms._frame.anyVec().nChunks());
  return new JobV3().fillFromImpl(j);
}
public static Response redirect(JsonObject fromPageResponse, Key rfModelKey) {
  RFModel rfModel = DKV.get(rfModelKey).get();
  ValueArray data = DKV.get(rfModel._dataKey).get();
  return redirect(
      fromPageResponse,
      null,
      rfModelKey,
      rfModel._dataKey,
      rfModel._totalTrees,
      data.numCols() - 1,
      null,
      true,
      false);
}
@Override
public Response serve() {
  Frame fr = DKV.get(data_key.value()).get();
  if (fr == null) return RequestServer._http404.serve();
  // Build a frame with the selected Vecs
  Frame fr2 = new Frame(new String[0], new Vec[0]);
  int[] idxs = vecs.value();
  for (int idx : idxs) // The selected frame columns
    fr2.add(fr._names[idx], fr._vecs[idx]);
  // Add the class-vec last
  Vec cvec = class_vec.value();
  fr2.add(fr._names[class_vec._colIdx.get()], cvec);
  domain = cvec.domain(); // Class/enum/factor names
  mtrys = features.value() == null ? (int) (Math.sqrt(idxs.length) + 0.5) : features.value();
  DRF drf =
      DRF.start(
          DRF.makeKey(),
          fr2,
          depth.value(),
          ntrees.value(),
          mtrys,
          sample_rate.value(),
          seed.value());
  drf.get(); // Block for result
  cm = drf.cm(); // Get CM result
  return new Response(Response.Status.done, this, -1, -1, null);
}
public static ValueArray loadAndParseKey(Key okey, String path) {
  FileIntegrityChecker c = FileIntegrityChecker.check(new File(path), false);
  Key k = c.syncDirectory(null, null, null, null);
  ParseDataset.forkParseDataset(okey, new Key[] {k}, null).get();
  UKV.remove(k);
  ValueArray res = DKV.get(okey).get();
  return res;
}
@Override
public byte[] atomic(byte[] bits1) {
  byte[] mem = DKV.get(_key).get();
  int len = Math.max(_dst_off + mem.length, bits1 == null ? 0 : bits1.length);
  byte[] bits2 = MemoryManager.malloc1(len);
  if (bits1 != null) System.arraycopy(bits1, 0, bits2, 0, bits1.length);
  System.arraycopy(mem, 0, bits2, _dst_off, mem.length);
  return bits2;
}
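// A worked illustration of the sizing logic in atomic() above; the numbers are
// hypothetical and chosen only to make the arithmetic concrete.
int dstOff = 4;   // _dst_off: where the payload lands in the destination buffer
int memLen = 3;   // length of the payload fetched from the DKV
int bits1Len = 5; // length of the existing destination buffer
int len = Math.max(dstOff + memLen, bits1Len); // == 7: old bytes copied first, payload overlaid at offset 4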
private FrameTask(Key jobKey, Key dinfoKey, int[] activeCols, long seed, int iteration) {
  super(null);
  assert dinfoKey == null || DKV.get(dinfoKey) != null;
  _jobKey = jobKey;
  _dinfoKey = dinfoKey;
  _activeCols = activeCols;
  _seed = seed;
  _iteration = iteration;
}
public Vec replace(int col, Vec nv) {
  assert col < _names.length;
  Vec rv = vecs()[col];
  assert rv.group().equals(nv.group());
  _vecs[col] = nv;
  _keys[col] = nv._key;
  if (DKV.get(nv._key) == null) // If not already in KV, put it there
    DKV.put(nv._key, nv);
  return rv;
}
/**
 * Initialize the ModelBuilder, validating all arguments and preparing the training frame. This
 * call is expected to be overridden in the subclasses and each subclass will start with
 * "super.init();". This call is made by the front-end whenever the GUI is clicked, and needs to
 * be fast; heavy-weight prep needs to wait for the trainModel() call.
 *
 * <p>Validate the requested ntrees; precompute actual ntrees. Validate the number of classes to
 * predict on; validate a checkpoint.
 */
@Override
public void init(boolean expensive) {
  super.init(expensive);
  if (H2O.ARGS.client && _parms._build_tree_one_node)
    error("_build_tree_one_node", "Cannot run on a single node in client mode");
  if (_vresponse != null) _vresponse_key = _vresponse._key;
  if (_response != null) _response_key = _response._key;
  if (_nclass > SharedTreeModel.SharedTreeParameters.MAX_SUPPORTED_LEVELS)
    error("_nclass", "Too many levels in response column!");

  if (_parms._min_rows < 0) error("_min_rows", "Requested min_rows must be greater than 0");

  if (_parms._ntrees < 0 || _parms._ntrees > 100000)
    error("_ntrees", "Requested ntrees must be between 1 and 100000");
  _ntrees = _parms._ntrees; // Total trees in final model
  if (_parms._checkpoint) { // Asking to continue from checkpoint?
    Value cv = DKV.get(_parms._model_id);
    if (cv != null) { // Look for prior model
      M checkpointModel = cv.get();
      if (_parms._ntrees < checkpointModel._output._ntrees + 1)
        error(
            "_ntrees",
            "Requested ntrees must be between "
                + (checkpointModel._output._ntrees + 1)
                + " and 100000");
      _ntrees = _parms._ntrees - checkpointModel._output._ntrees; // Needed trees
    }
  }
  if (_parms._nbins <= 1) error("_nbins", "_nbins must be > 1.");
  if (_parms._nbins >= 1 << 16) error("_nbins", "_nbins must be < " + (1 << 16));
  if (_parms._nbins_cats <= 1) error("_nbins_cats", "_nbins_cats must be > 1.");
  if (_parms._nbins_cats >= 1 << 16) error("_nbins_cats", "_nbins_cats must be < " + (1 << 16));
  if (_parms._max_depth <= 0) error("_max_depth", "_max_depth must be > 0.");
  if (_parms._min_rows <= 0) error("_min_rows", "_min_rows must be > 0.");
  if (_parms._distribution == Distributions.Family.tweedie) {
    _parms._distribution.tweedie.p = _parms._tweedie_power;
  }
  if (_train != null) {
    double sumWeights =
        _train.numRows() * (hasWeightCol() ? _train.vec(_parms._weights_column).mean() : 1);
    if (sumWeights < 2 * _parms._min_rows) // Need at least 2*min_rows weighted rows to split even once
      error(
          "_min_rows",
          "The dataset size is too small to split for min_rows="
              + _parms._min_rows
              + ": must have at least "
              + 2 * _parms._min_rows
              + " (weighted) rows, but have only "
              + sumWeights
              + ".");
  }
  if (_train != null) _ncols = _train.numCols() - 1 - numSpecialCols();
}
/**
 * Initialize the ModelBuilder, validating all arguments and preparing the training frame. This
 * call is expected to be overridden in the subclasses and each subclass will start with
 * "super.init();". This call is made by the front-end whenever the GUI is clicked, and needs to
 * be fast; heavy-weight prep needs to wait for the trainModel() call.
 *
 * <p>Validate the requested ntrees; precompute actual ntrees. Validate the number of classes to
 * predict on; validate a checkpoint.
 */
@Override
public void init(boolean expensive) {
  super.init(expensive);
  if (H2O.ARGS.client && _parms._build_tree_one_node)
    error("_build_tree_one_node", "Cannot run on a single node in client mode");
  if (_vresponse != null) _vresponse_key = _vresponse._key;
  if (_response != null) _response_key = _response._key;
  if (_parms._min_rows < 0) error("_min_rows", "Requested min_rows must be greater than 0");

  if (_parms._ntrees < 0 || _parms._ntrees > MAX_NTREES)
    error("_ntrees", "Requested ntrees must be between 1 and " + MAX_NTREES);
  _ntrees = _parms._ntrees; // Total trees in final model
  if (_parms.hasCheckpoint()) { // Asking to continue from checkpoint?
    Value cv = DKV.get(_parms._checkpoint);
    if (cv != null) { // Look for prior model
      M checkpointModel = cv.get();
      try {
        _parms.validateWithCheckpoint(checkpointModel._parms);
      } catch (H2OIllegalArgumentException e) {
        error(e.values.get("argument").toString(), e.values.get("value").toString());
      }
      if (_parms._ntrees < checkpointModel._output._ntrees + 1)
        error(
            "_ntrees",
            "If checkpoint is specified then requested ntrees must be higher than "
                + (checkpointModel._output._ntrees + 1));
      // Compute number of trees to build for this checkpoint
      _ntrees = _parms._ntrees - checkpointModel._output._ntrees; // Needed trees
    }
  }
  if (_parms._nbins <= 1) error("_nbins", "_nbins must be > 1.");
  if (_parms._nbins >= 1 << 16) error("_nbins", "_nbins must be < " + (1 << 16));
  if (_parms._nbins_cats <= 1) error("_nbins_cats", "_nbins_cats must be > 1.");
  if (_parms._nbins_cats >= 1 << 16) error("_nbins_cats", "_nbins_cats must be < " + (1 << 16));
  if (_parms._max_depth <= 0) error("_max_depth", "_max_depth must be > 0.");
  if (_parms._min_rows <= 0) error("_min_rows", "_min_rows must be > 0.");
  if (_train != null) {
    double sumWeights =
        _train.numRows() * (hasWeightCol() ? _train.vec(_parms._weights_column).mean() : 1);
    if (sumWeights < 2 * _parms._min_rows) // Need at least 2*min_rows weighted rows to split even once
      error(
          "_min_rows",
          "The dataset size is too small to split for min_rows="
              + _parms._min_rows
              + ": must have at least "
              + 2 * _parms._min_rows
              + " (weighted) rows, but have only "
              + sumWeights
              + ".");
  }
  if (_train != null) _ncols = _train.numCols() - 1 - numSpecialCols();
}
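// Hypothetical illustration of the checkpoint arithmetic in init() above (the
// numbers are invented for the example): resuming a checkpointed model that
// already holds 50 trees with a request for ntrees = 70 builds only the 20
// remaining trees.
int requestedNtrees = 70;  // _parms._ntrees
int checkpointNtrees = 50; // checkpointModel._output._ntrees
int treesToBuild = requestedNtrees - checkpointNtrees; // == 20, the value stored in _ntrees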
// Test kaggle/creditsample-test data
@org.junit.Test
public void kaggle_credit() {
  Key okey = loadAndParseFile("credit.hex", "smalldata/kaggle/creditsample-training.csv.gz");
  UKV.remove(Key.make("smalldata/kaggle/creditsample-training.csv.gz_UNZIPPED"));
  UKV.remove(Key.make("smalldata\\kaggle\\creditsample-training.csv.gz_UNZIPPED"));
  ValueArray val = DKV.get(okey).get();

  // Check parsed dataset
  final int n = new int[] {4, 2, 1}[ValueArray.LOG_CHK - 20];
  assertEquals("Number of chunks", n, val.chunks());
  assertEquals("Number of rows", 150000, val.numRows());
  assertEquals("Number of cols", 12, val.numCols());

  // setup default values for DRF
  int ntrees = 3;
  int depth = 30;
  int gini = StatType.GINI.ordinal();
  int seed = 42;
  StatType statType = StatType.values()[gini];
  final int cols[] = new int[] {0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 1}; // ignore column 6, classify column 1

  // Start the distributed Random Forest
  final Key modelKey = Key.make("model");
  DRFJob result =
      hex.rf.DRF.execute(
          modelKey,
          cols,
          val,
          ntrees,
          depth,
          1024,
          statType,
          seed,
          true,
          null,
          -1,
          Sampling.Strategy.RANDOM,
          1.0f,
          null,
          0,
          0,
          false);
  // Wait for completion on all nodes
  RFModel model = result.get();

  assertEquals("Number of classes", 2, model.classes());
  assertEquals("Number of trees", ntrees, model.size());

  model.deleteKeys();
  UKV.remove(modelKey);
  UKV.remove(okey);
}
@Override
public void compute2() {
  String path = null; // getPathFromValue(val);
  ValueArray ary = DKV.get(_arykey).get();
  Key self = selfKey();

  while (_indexFrom < ary.chunks()) {
    Key ckey = ary.getChunkKey(_indexFrom++);
    if (!ckey.home()) { // Next chunk not At Home?
      RPC.call(chunkHome(), this); // Hand the baton off to the next node/chunk
      return;
    }
    Value val = DKV.get(ckey); // It IS home, so get the data
    _err = PersistHdfs.appendChunk(_arykey, val);
    if (_err != null) return;
    UKV.put(self, this); // Update the progress/self key
  }
  // We did the last chunk. Removing the selfKey is the signal to the web
  // thread that All Done.
  UKV.remove(self);
}
// TODO: almost identical to ModelsHandler; refactor
public static ModelMetrics getFromDKV(Key key) {
  if (null == key) throw new IllegalArgumentException("Got null key.");

  Value v = DKV.get(key);
  if (null == v) throw new IllegalArgumentException("Did not find key: " + key.toString());

  Iced ice = v.get();
  if (!(ice instanceof ModelMetrics))
    throw new IllegalArgumentException(
        "Expected a ModelMetrics for key: " + key.toString() + "; got a: " + ice.getClass());

  return (ModelMetrics) ice;
}
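// Hypothetical usage of getFromDKV() above; the key name is invented for
// illustration and assumes a ModelMetrics object was already published to the
// DKV under that key.
ModelMetrics mm = getFromDKV(Key.make("example_model_metrics"));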
@Override
protected void setupLocal() {
  _model_mem_size = 0;
  for (int i = 0; i < trees_so_far; ++i) {
    Key<CompressedTree>[] per_class = _treeKeys[i];
    for (int j = 0; j < per_class.length; ++j) {
      if (per_class[j] == null) continue;
      if (!per_class[j].home()) continue; // only look at homed tree keys
      _model_mem_size += DKV.get(per_class[j])._max;
    }
  }
}
public static Frame createFrame(String fname, long[] chunkLayout, String[][] data) {
  Frame f = new Frame(Key.make(fname));
  f.preparePartialFrame(new String[] {"C0"});
  f.update(null);
  // Create chunks
  for (int i = 0; i < chunkLayout.length; i++) {
    createNC(fname, data[i], i, (int) chunkLayout[i]);
  }
  // Reload frame from DKV
  f = DKV.get(fname).get();
  // Finalize frame
  f.finalizePartialFrame(chunkLayout, new String[][] {null}, new byte[] {Vec.T_STR});
  return f;
}
@Test
public void testChunks() {
  Frame frame = parse_test_file("smalldata/covtype/covtype.20k.data");
  AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
  parms._train = frame._key;
  parms._radius_scale = 3.0;
  long start = System.currentTimeMillis();
  AggregatorModel agg = new Aggregator(parms).trainModel().get(); // 0.418
  System.out.println(
      "AggregatorModel finished in: " + (System.currentTimeMillis() - start) / 1000. + " seconds");
  agg.checkConsistency();
  Frame output = agg._output._output_frame.get();
  Log.info("Number of exemplars: " + agg._exemplars.length);
  // Assert.assertTrue(agg._exemplars.length==1993);
  output.remove();
  agg.remove();

  for (int i : new int[] {1, 2, 5, 10, 50, 100}) {
    Key key = Key.make();
    RebalanceDataSet rb = new RebalanceDataSet(frame, key, i);
    H2O.submitTask(rb);
    rb.join();
    Frame rebalanced = DKV.get(key).get();

    parms = new AggregatorModel.AggregatorParameters();
    parms._train = frame._key;
    parms._radius_scale = 3.0;
    start = System.currentTimeMillis();
    AggregatorModel agg2 = new Aggregator(parms).trainModel().get(); // 0.373 0.504 0.357 0.454 0.368 0.355
    System.out.println(
        "AggregatorModel finished in: " + (System.currentTimeMillis() - start) / 1000. + " seconds");
    agg2.checkConsistency();
    Log.info("Number of exemplars for " + i + " chunks: " + agg2._exemplars.length);
    rebalanced.delete();
    Assert.assertTrue(
        Math.abs(agg._exemplars.length - agg2._exemplars.length) == 0); // < agg._exemplars.length*0);
    output = agg2._output._output_frame.get();
    output.remove();
    agg2.remove();
  }
  frame.delete();
}
@Override
public byte[] load(final Value v) {
  final byte[] b = MemoryManager.malloc1(v._max);
  long skip = 0;
  Key k = v._key;
  final Path p;
  if (_iceRoot != null) {
    p = new Path(_iceRoot, getIceName(v));
  } else {
    // Convert an arraylet chunk into a long-offset from the base file.
    if (k._kb[0] == Key.ARRAYLET_CHUNK) {
      skip = ValueArray.getChunkOffset(k); // The offset
      k = ValueArray.getArrayKey(k); // From the base file key
      if (k.toString().endsWith(Extensions.HEX)) { // Hex file?
        int value_len = DKV.get(k).memOrLoad().length; // How long is the ValueArray header?
        skip += value_len;
      }
    }
    p = new Path(k.toString());
  }
  final long skip_ = skip;
  run(
      new Callable() {
        @Override
        public Object call() throws Exception {
          FileSystem fs = FileSystem.get(p.toUri(), CONF);
          FSDataInputStream s = null;
          try {
            s = fs.open(p);
            // NOTE:
            // The following line degrades performance of HDFS load from S3 API:
            //   s.readFully(skip, b, 0, b.length);
            // Google API's simple seek has better performance.
            // Load of 300MB file via Google API ~ 14sec, via s.readFully ~ 5min
            // (under the same condition).
            ByteStreams.skipFully(s, skip_);
            ByteStreams.readFully(s, b);
            assert v.isPersisted();
          } finally {
            Utils.close(s);
          }
          return null;
        }
      },
      true,
      v._max);
  return b;
}
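// Hypothetical illustration of the skip computation in load() above for an
// arraylet chunk of a ".hex" ValueArray (both values are made up): the byte
// offset of the chunk within the array plus the length of the serialized
// ValueArray header that precedes the data in the .hex file.
long chunkOffset = 2L * ValueArray.CHUNK_SZ; // e.g. what ValueArray.getChunkOffset(k) might return
int headerLen = 4096;                        // e.g. DKV.get(baseKey).memOrLoad().length
long skip = chunkOffset + headerLen;         // bytes to skip before reading this chunk's data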
// Convert a chunk# into a chunk - does lazy-chunk creation. As chunks are
// asked-for the first time, we make the Key and an empty backing DVec.
// Touching the DVec will force the file load.
@Override
public Value chunkIdx(int cidx) {
  final long nchk = nChunks();
  assert 0 <= cidx && cidx < nchk;
  Key dkey = chunkKey(cidx);
  Value val1 = DKV.get(dkey); // Check for an existing one... will fetch data as needed
  if (val1 != null) return val1; // Found an existing one?
  // Lazily create a DVec for this chunk
  int len = (int) (cidx < nchk - 1 ? ValueArray.CHUNK_SZ : (_len - chunk2StartElem(cidx)));
  // DVec is just the raw file data with a null-compression scheme
  Value val2 = new Value(dkey, len, null, TypeMap.C1CHUNK, Value.NFS);
  val2.setdsk(); // It is already on disk.
  // Atomically insert: fails on a race, but then return the old version
  Value val3 = DKV.DputIfMatch(dkey, val2, null, null);
  return val3 == null ? val2 : val3;
}
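// Minimal sketch of the atomic insert-or-fetch idiom used by chunkIdx() above,
// assuming a chunk Key `dkey` and length `len` are already in scope: publish
// our Value only if nothing is mapped yet; if another thread or node won the
// race, DputIfMatch returns the existing Value and we use that instead.
Value mine = new Value(dkey, len, null, TypeMap.C1CHUNK, Value.NFS);
Value existing = DKV.DputIfMatch(dkey, mine, null, null); // expected old value is null
Value winner = (existing == null) ? mine : existing;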
public Frame(String[] names, Vec[] vecs) {
  // assert names == null || names.length == vecs.length
  //     : "Number of columns does not match the number of column names.";
  _names = names;
  _vecs = vecs;
  _keys = new Key[vecs.length];
  for (int i = 0; i < vecs.length; i++) {
    Key k = _keys[i] = vecs[i]._key;
    if (DKV.get(k) == null) // If not already in KV, put it there
      DKV.put(k, vecs[i]);
  }
  Vec v0 = anyVec();
  if (v0 == null) return;
  VectorGroup grp = v0.group();
  for (int i = 0; i < vecs.length; i++) assert grp.equals(vecs[i].group());
}
public static String store2Hdfs(Key srcKey) {
  assert srcKey._kb[0] != Key.ARRAYLET_CHUNK;
  assert PersistHdfs.getPathForKey(srcKey) != null; // Validate key name

  Value v = DKV.get(srcKey);
  if (v == null) return "Key " + srcKey + " not found";
  if (v._isArray == 0) { // Simple chunk?
    v.setHdfs(); // Set to HDFS and be done
    return null; // Success
  }

  // For ValueArrays, make the .hex header
  ValueArray ary = ValueArray.value(v);
  String err = PersistHdfs.freeze(srcKey, ary);
  if (err != null) return err;

  // The task managing which chunks to write next,
  // store in a known key
  TaskStore2HDFS ts = new TaskStore2HDFS(srcKey);
  Key selfKey = ts.selfKey();
  UKV.put(selfKey, ts);

  // Then start writing chunks in-order with the zero chunk
  H2ONode chk0_home = ValueArray.getChunkKey(0, srcKey).home_node();
  RPC.call(ts.chunkHome(), ts);

  // Watch the progress key until it gets removed or an error appears
  long idx = 0;
  while (UKV.get(selfKey, ts) != null) {
    if (ts._indexFrom != idx) {
      System.out.print(" " + idx + "/" + ary.chunks());
      idx = ts._indexFrom;
    }
    if (ts._err != null) { // Found an error?
      UKV.remove(selfKey); // Cleanup & report
      return ts._err;
    }
    try {
      Thread.sleep(100);
    } catch (InterruptedException e) {
    }
  }
  System.out.println(" " + ary.chunks() + "/" + ary.chunks());
  // PersistHdfs.refreshHDFSKeys();
  return null;
}
/**
 * Performs a deep clone of the given model.
 *
 * <p>FIXME: fetch all data to the caller node
 */
protected M getModelDeepClone(M model) {
  M newModel = IcedUtils.clone(model, _dest);
  // Do not clone model metrics
  newModel._output._model_metrics = new Key[0];
  newModel._output._training_metrics = null;
  newModel._output._validation_metrics = null;
  // Clone trees
  Key[][] treeKeys = newModel._output._treeKeys;
  for (int i = 0; i < treeKeys.length; i++) {
    for (int j = 0; j < treeKeys[i].length; j++) {
      if (treeKeys[i][j] == null) continue;
      CompressedTree ct = DKV.get(treeKeys[i][j]).get();
      CompressedTree newCt = IcedUtils.clone(ct, CompressedTree.makeTreeKey(i, j), true);
      treeKeys[i][j] = newCt._key;
    }
  }
  return newModel;
}
public Key importFile(int i, Futures fs) {
  if (_ok[i] < H2O.CLOUD.size()) return null;
  File f = new File(_files[i]);
  Key k;
  if (_newApi) {
    k = PersistNFS.decodeFile(f);
    NFSFileVec nfs = DKV.get(NFSFileVec.make(f, fs)).get();
    UKV.put(k, new Frame(new String[] {"0"}, new Vec[] {nfs}), fs);
  } else {
    k = PersistNFS.decodeFile(f);
    long size = f.length();
    Value val =
        (size < 2 * ValueArray.CHUNK_SZ)
            ? new Value(k, (int) size, Value.NFS)
            : new Value(k, new ValueArray(k, size), Value.NFS);
    val.setdsk();
    UKV.put(k, val, fs);
  }
  return k;
}
public ModelMetricsGLRM scoreMetricsOnly(Frame frame) {
  final int ncols = _output._names.length;

  // Need [A,X] where A = adapted test frame, X = loading frame
  // Note: A is adapted to original training frame
  Frame adaptedFr = new Frame(frame);
  adaptTestForTrain(adaptedFr, true, false);
  assert ncols == adaptedFr.numCols();

  // Append loading frame X for calculating XY
  Frame fullFrm = new Frame(adaptedFr);
  Frame loadingFrm = DKV.get(_output._representation_key).get();
  fullFrm.add(loadingFrm);

  GLRMScore gs = new GLRMScore(ncols, _parms._k, false).doAll(fullFrm);
  ModelMetrics mm =
      gs._mb.makeModelMetrics(
          GLRMModel.this, adaptedFr, null, null); // save error metrics based on imputed data
  return (ModelMetricsGLRM) mm;
}
@Override
public void map(Key key) {
  _rows = new long[_clusters.length];
  _dist = new double[_clusters.length];
  assert key.home();
  ValueArray va = DKV.get(_arykey).get();
  AutoBuffer bits = va.getChunk(key);
  int rows = va.rpc(ValueArray.getChunkIndex(key));
  double[] values = new double[_cols.length - 1];
  ClusterDist cd = new ClusterDist();
  for (int row = 0; row < rows; row++) {
    KMeans.datad(va, bits, row, _cols, _normalized, values);
    KMeans.closest(_clusters, values, cd);
    _rows[cd._cluster]++;
    _dist[cd._cluster] += cd._dist;
  }
  _arykey = null;
  _cols = null;
  _clusters = null;
}
@Override
public void parse(Key key) throws IOException {
  _firstRow = true;
  InputStream is = DKV.get(key).openStream();
  try {
    _fs = new POIFSFileSystem(is);
    MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
    _formatListener = new FormatTrackingHSSFListener(listener);
    HSSFEventFactory factory = new HSSFEventFactory();
    HSSFRequest request = new HSSFRequest();
    request.addListenerForAllRecords(_formatListener);
    factory.processWorkbookEvents(request, _fs);
  } finally {
    try {
      is.close();
    } catch (IOException e) {
    }
  }
}
@SuppressWarnings("unused") // called through reflection by RequestServer public JobsV3 fetch(int version, JobsV3 s) { Key key = s.job_id.key(); Value val = DKV.get(key); if (null == val) throw new IllegalArgumentException("Job is missing"); Iced ice = val.get(); if (!(ice instanceof Job)) throw new IllegalArgumentException("Must be a Job not a " + ice.getClass()); Job j = (Job) ice; s.jobs = new JobV3[1]; // s.fillFromImpl(jobs); try { s.jobs[0] = (JobV3) Schema.schema(version, j).fillFromImpl(j); } // no special schema for this job subclass, so fall back to JobV3 catch (H2ONotFoundArgumentException e) { s.jobs[0] = new JobV3().fillFromImpl(j); } return s; }
// Convert a chunk# into a chunk - does lazy-chunk creation. As chunks are
// asked-for the first time, we make the Key and an empty backing DVec.
// Touching the DVec will force the file load.
@Override
public Value chunkIdx(int cidx) {
  final long nchk = nChunks();
  assert 0 <= cidx && cidx < nchk;
  Key dkey = chunkKey(cidx);
  Value val1 = DKV.get(dkey); // Check for an existing one... will fetch data as needed
  if (val1 != null) return val1; // Found an existing one?
  // Lazily create a DVec for this chunk
  int len = (int) (cidx < nchk - 1 ? CHUNK_SZ : (_len - chunk2StartElem(cidx)));
  // DVec is just the raw file data with a null-compression scheme
  Value val2 = new Value(dkey, len, null, TypeMap.C1NCHUNK, _be);
  val2.setdsk(); // It is already on disk.
  // If not-home, then block till the Key is everywhere. Most calls here are
  // from the parser loading a text file, and the parser splits the work such
  // that most puts here are on home - so this is a simple speed optimization:
  // do not make a Futures nor block on it on home.
  Futures fs = dkey.home() ? null : new Futures();
  // Atomically insert: fails on a race, but then return the old version
  Value val3 = DKV.DputIfMatch(dkey, val2, null, fs);
  if (!dkey.home() && fs != null) fs.blockForPending();
  return val3 == null ? val2 : val3;
}
static boolean checkSaneFrame_impl() {
  for (Key k : H2O.localKeySet()) {
    Value val = H2O.raw_get(k);
    if (val.isFrame()) {
      Frame fr = val.get();
      Vec vecs[] = fr.vecs();
      for (int i = 0; i < vecs.length; i++) {
        Vec v = vecs[i];
        if (DKV.get(v._key) == null) {
          System.err.println(
              "Frame " + fr._key + " in the DKV, is missing Vec " + v._key + ", name=" + fr._names[i]);
          return false;
        }
      }
    }
  }
  return true;
}