public static String store2Hdfs(Key srcKey) {
  assert srcKey._kb[0] != Key.ARRAYLET_CHUNK;
  assert PersistHdfs.getPathForKey(srcKey) != null; // Validate key name
  Value v = DKV.get(srcKey);
  if (v == null) return "Key " + srcKey + " not found";
  if (v._isArray == 0) { // Simple chunk?
    v.setHdfs(); // Set to HDFS and be done
    return null; // Success
  }

  // For ValueArrays, make the .hex header
  ValueArray ary = ValueArray.value(v);
  String err = PersistHdfs.freeze(srcKey, ary);
  if (err != null) return err;

  // The task managing which chunks to write next, stored in a known key
  TaskStore2HDFS ts = new TaskStore2HDFS(srcKey);
  Key selfKey = ts.selfKey();
  UKV.put(selfKey, ts);

  // Then start writing chunks in-order with the zero chunk
  H2ONode chk0_home = ValueArray.getChunkKey(0, srcKey).home_node();
  RPC.call(ts.chunkHome(), ts);

  // Watch the progress key until it gets removed or an error appears
  long idx = 0;
  while (UKV.get(selfKey, ts) != null) {
    if (ts._indexFrom != idx) {
      System.out.print(" " + idx + "/" + ary.chunks());
      idx = ts._indexFrom;
    }
    if (ts._err != null) { // Found an error?
      UKV.remove(selfKey); // Cleanup & report
      return ts._err;
    }
    try {
      Thread.sleep(100);
    } catch (InterruptedException e) {
    }
  }
  System.out.println(" " + ary.chunks() + "/" + ary.chunks());
  // PersistHdfs.refreshHDFSKeys();
  return null;
}
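// --- Hedged usage sketch (not part of the original source) ---
// Shows how a caller might drive store2Hdfs() above and report the outcome.
// The method name reportStore and the log messages are assumptions for
// illustration only; store2Hdfs() blocks until every chunk has been written
// to HDFS or an error string is returned.
public static void reportStore(Key srcKey) {
  String err = store2Hdfs(srcKey);
  if (err == null) System.out.println("Stored " + srcKey + " to HDFS");
  else System.err.println("HDFS store of " + srcKey + " failed: " + err);
}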
// Test kaggle/creditsample-test data
@org.junit.Test
public void kaggle_credit() {
  Key okey = loadAndParseFile("credit.hex", "smalldata/kaggle/creditsample-training.csv.gz");
  UKV.remove(Key.make("smalldata/kaggle/creditsample-training.csv.gz_UNZIPPED"));
  UKV.remove(Key.make("smalldata\\kaggle\\creditsample-training.csv.gz_UNZIPPED"));
  ValueArray val = DKV.get(okey).get();

  // Check parsed dataset
  final int n = new int[] {4, 2, 1}[ValueArray.LOG_CHK - 20];
  assertEquals("Number of chunks", n, val.chunks());
  assertEquals("Number of rows", 150000, val.numRows());
  assertEquals("Number of cols", 12, val.numCols());

  // Setup default values for DRF
  int ntrees = 3;
  int depth = 30;
  int gini = StatType.GINI.ordinal();
  int seed = 42;
  StatType statType = StatType.values()[gini];
  final int cols[] = new int[] {0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 1}; // ignore column 6, classify column 1

  // Start the distributed Random Forest
  final Key modelKey = Key.make("model");
  DRFJob result =
      hex.rf.DRF.execute(
          modelKey, cols, val, ntrees, depth, 1024, statType, seed, true, null, -1,
          Sampling.Strategy.RANDOM, 1.0f, null, 0, 0, false);

  // Wait for completion on all nodes
  RFModel model = result.get();

  assertEquals("Number of classes", 2, model.classes());
  assertEquals("Number of trees", ntrees, model.size());

  model.deleteKeys();
  UKV.remove(modelKey);
  UKV.remove(okey);
}
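// --- Illustrative helper (an assumption, not part of the original test) ---
// The expected chunk count above is indexed by ValueArray.LOG_CHK - 20, i.e.
// the 150,000-row dataset splits into 4, 2 or 1 chunks as the chunk size
// doubles. expectedChunks is a hypothetical name used only to make that
// arithmetic explicit.
static int expectedChunks(int logChk) {
  final int[] byLogChk = {4, 2, 1}; // counts for LOG_CHK = 20, 21, 22
  return byLogChk[logChk - 20];
}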
/**
 * Executes phase one of the parser.
 *
 * <p>The first phase detects the encoding and basic statistics of the parsed dataset.
 *
 * <p>For CSV parsers it detects the parser setup and then launches the distributed computation
 * on a per-chunk basis.
 *
 * <p>XLS and XLSX parsers do not work in a distributed way, so they parse the whole dataset.
 *
 * @throws Exception
 */
public void passOne(CsvParser.Setup setup) throws Exception {
  switch (_parserType) {
    case CSV:
      // Precompute the parser setup, column setup and other settings
      byte[] bits = _sourceDataset.getFirstBytes(); // Can limit to eg 256*1024
      if (setup == null) setup = CsvParser.guessCsvSetup(bits);
      if (setup._data == null) {
        _error = "Unable to determine the separator or number of columns on the dataset";
        return;
      }
      _colNames = setup._data[0];
      setColumnNames(_colNames);
      _skipFirstLine = setup._header;
      // Set the separator
      this._sep = setup._separator;
      // If parsing a value array, initialize the nrows array
      if (_sourceDataset._isArray != 0) {
        ValueArray ary = ValueArray.value(_sourceDataset);
        _nrows = new int[(int) ary.chunks()];
      }
      // Launch the distributed parser on its chunks
      this.invoke(_sourceDataset._key);
      break;
    case XLS:
      // XLS parsing is not distributed, just obtain the value stream and run the parser
      CustomParser p = new XlsParser(this);
      p.parse(_sourceDataset._key);
      --_myrows; // Do not count the header
      break;
    case XLSX:
      // XLSX parsing is not distributed, just obtain the value stream and run the parser
      CustomParser px = new XlsxParser(this);
      px.parse(_sourceDataset._key);
      break;
    default:
      throw new Error("NOT IMPLEMENTED");
  }
  // Calculate the proper number of rows for each chunk
  if (_nrows != null) {
    _numRows = 0;
    for (int i = 0; i < _nrows.length; ++i) {
      _numRows += _nrows[i];
      _nrows[i] = _numRows;
    }
  } else {
    _numRows = _myrows;
  }
  // Normalize the mean
  for (int i = 0; i < _ncolumns; ++i) _mean[i] = _mean[i] / (_numRows - _invalidValues[i]);
}
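// --- Minimal driver sketch (an assumption, not part of the original parser) ---
// Mirrors the CSV branch of passOne() above: sample the head of the file,
// guess the separator and column layout, then run the distributed first pass.
// runPassOne and the local variable names are hypothetical.
void runPassOne() throws Exception {
  byte[] head = _sourceDataset.getFirstBytes();            // sample the start of the dataset
  CsvParser.Setup guessed = CsvParser.guessCsvSetup(head); // separator + column guess
  passOne(guessed);                                        // per-chunk distributed pass
  // After passOne() returns, _numRows and the per-column means are populated.
}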
public static Job run(final Key dest, final KMeansModel model, final ValueArray ary) {
  final ChunkProgressJob job = new ChunkProgressJob(ary.chunks(), dest);
  new ValueArray(dest, 0).delete_and_lock(job.self());
  final H2OCountedCompleter fjtask =
      new H2OCountedCompleter() {
        @Override
        public void compute2() {
          KMeansApply kms = new KMeansApply();
          kms._job = job;
          kms._arykey = ary._key;
          kms._cols = model.columnMapping(ary.colNames());
          kms._clusters = model._clusters;
          kms._normalized = model._normalized;
          kms.invoke(ary._key);

          Column c = new Column();
          c._name = Constants.RESPONSE;
          c._size = ROW_SIZE;
          c._scale = 1;
          c._min = 0;
          c._max = model._clusters.length;
          c._mean = Double.NaN;
          c._sigma = Double.NaN;
          c._domain = null;
          c._n = ary.numRows();
          ValueArray res = new ValueArray(dest, ary.numRows(), c._size, new Column[] {c});
          res.unlock(job.self());

          job.remove();
          tryComplete();
        }

        @Override
        public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
          job.onException(ex);
          return super.onExceptionalCompletion(ex, caller);
        }
      };
  job.start(fjtask);
  H2O.submitTask(fjtask);
  return job;
}
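// --- Hedged usage sketch (waitForKMeans is an assumption, not in the source) ---
// Starts the apply job and polls its self key, mirroring the progress-key
// pattern used by TaskStore2HDFS. This assumes job.remove() above also clears
// the job's self key; the real Job API may offer a proper blocking call
// instead of this polling loop.
public static void waitForKMeans(Key dest, KMeansModel model, ValueArray ary)
    throws InterruptedException {
  Job job = run(dest, model, ary);
  while (DKV.get(job.self()) != null) // assumed to be cleared when compute2() finishes
    Thread.sleep(100);
}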
@Override
public void compute() {
  String path = null; // getPathFromValue(val);
  ValueArray ary = ValueArray.value(_arykey);
  Key self = selfKey();

  while (_indexFrom < ary.chunks()) {
    Key ckey = ary.getChunkKey(_indexFrom++);
    if (!ckey.home()) { // Next chunk not At Home?
      RPC.call(chunkHome(), this); // Hand the baton off to the next node/chunk
      return;
    }
    Value val = DKV.get(ckey); // It IS home, so get the data
    _err = PersistHdfs.appendChunk(_arykey, val);
    if (_err != null) return;
    UKV.put(self, this); // Update the progress/self key
  }
  // We did the last chunk. Removing the selfKey is the signal to the web
  // thread that All Done.
  UKV.remove(self);
}
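// --- Sketch of the chunkHome() helper used above (an assumption; its real
// body is not shown in this excerpt) ---
// The task is handed to the node that owns the next chunk, so every chunk is
// appended to the HDFS file by its home node, in order.
H2ONode chunkHome() {
  return ValueArray.value(_arykey).getChunkKey(_indexFrom).home_node();
}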