/**
 * Creates the value header based on the calculated columns.
 *
 * <p>Also stores the header to its appropriate key. This will be the VA header of the parsed
 * dataset.
 */
private void createValueArrayHeader() {
  assert (_phase == Pass.TWO);
  Column[] cols = new Column[_ncolumns];
  int off = 0;
  for (int i = 0; i < cols.length; ++i) {
    cols[i] = new Column();
    cols[i]._n = _numRows - _invalidValues[i];
    cols[i]._base = _bases[i];
    assert (char) pow10i(-_scale[i]) == pow10i(-_scale[i])
        : "scale out of bounds!, col = " + i + ", scale = " + _scale[i];
    cols[i]._scale = (char) pow10i(-_scale[i]);
    cols[i]._off = (char) off;
    cols[i]._size = (byte) COL_SIZES[_colTypes[i]];
    cols[i]._domain = _colDomains[i];
    cols[i]._max = _max[i];
    cols[i]._min = _min[i];
    cols[i]._mean = _mean[i];
    cols[i]._sigma = _sigma[i];
    cols[i]._name = _colNames[i];
    off += Math.abs(cols[i]._size);
  }
  // let any pending progress reports finish
  DKV.write_barrier();
  // finally make the value array header
  ValueArray ary = new ValueArray(_resultKey, _numRows, off, cols);
  UKV.put(_resultKey, ary.value());
}
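// Hedged usage sketch (not part of the original source): once createValueArrayHeader()
// has stored the header under _resultKey, other code can read it back through the same
// ValueArray API used by the tests below. The method name and key parameter are
// illustrative assumptions; ValueArray.value(Key), numRows(), numCols() and
// datad(row, col) are the calls already used elsewhere in this section.
private static void printParsedShape(Key resultKey) {
  ValueArray ary = ValueArray.value(resultKey); // fetch the stored VA header
  System.out.println("rows = " + ary.numRows() + ", cols = " + ary.numCols());
  if (ary.numRows() > 0 && ary.numCols() > 0)
    System.out.println("cell(0,0) = " + ary.datad(0, 0)); // decoded value of row 0, col 0
}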
protected void testScalarExpression(String expr, double result) {
  Key key = executeExpression(expr);
  ValueArray va = ValueArray.value(key);
  assertEquals(va.numRows(), 1);
  assertEquals(va.numCols(), 1);
  assertEquals(result, va.datad(0, 0), 0.0);
  UKV.remove(key);
}
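// Hedged usage sketch: exercising the helper above from the same test class. The
// expression string is an assumption about the syntax accepted by executeExpression();
// the 1x1-result contract checked against a double comes from testScalarExpression()
// itself.
public void testSimpleArithmetic() {
  testScalarExpression("1 + 2", 3.0);
}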
protected void testKeyValues(
    Key k, double n1, double n2, double n3, double nx3, double nx2, double nx1) {
  ValueArray v = ValueArray.value(k);
  assertEquals(v.datad(0, 0), n1, 0.0);
  assertEquals(v.datad(1, 0), n2, 0.0);
  assertEquals(v.datad(2, 0), n3, 0.0);
  assertEquals(v.datad(v.numRows() - 3, 0), nx3, 0.0);
  assertEquals(v.datad(v.numRows() - 2, 0), nx2, 0.0);
  assertEquals(v.datad(v.numRows() - 1, 0), nx1, 0.0);
}
/**
 * Executes phase one of the parser.
 *
 * <p>The first phase detects the encoding and basic statistics of the parsed dataset.
 *
 * <p>For CSV parsers it detects the parser setup and then launches the distributed computation
 * on a per-chunk basis.
 *
 * <p>XLS and XLSX parsers do not work in a distributed way, so they parse the whole dataset.
 *
 * @throws Exception
 */
public void passOne(CsvParser.Setup setup) throws Exception {
  switch (_parserType) {
    case CSV:
      // precompute the parser setup, column setup and other settings
      byte[] bits = _sourceDataset.getFirstBytes(); // Can limit to eg 256*1024
      if (setup == null) setup = CsvParser.guessCsvSetup(bits);
      if (setup._data == null) {
        _error = "Unable to determine the separator or number of columns on the dataset";
        return;
      }
      _colNames = setup._data[0];
      setColumnNames(_colNames);
      _skipFirstLine = setup._header;
      // set the separator
      this._sep = setup._separator;
      // if parsing a value array, initialize the nrows array
      if (_sourceDataset._isArray != 0) {
        ValueArray ary = ValueArray.value(_sourceDataset);
        _nrows = new int[(int) ary.chunks()];
      }
      // launch the distributed parser on its chunks.
      this.invoke(_sourceDataset._key);
      break;
    case XLS:
      // XLS parsing is not distributed, just obtain the value stream and
      // run the parser
      CustomParser p = new XlsParser(this);
      p.parse(_sourceDataset._key);
      --_myrows; // do not count the header
      break;
    case XLSX:
      // XLSX parsing is not distributed, just obtain the value stream and
      // run the parser
      CustomParser px = new XlsxParser(this);
      px.parse(_sourceDataset._key);
      break;
    default:
      throw new Error("NOT IMPLEMENTED");
  }
  // calculate the proper number of rows for each chunk
  if (_nrows != null) {
    _numRows = 0;
    for (int i = 0; i < _nrows.length; ++i) {
      _numRows += _nrows[i];
      _nrows[i] = _numRows;
    }
  } else {
    _numRows = _myrows;
  }
  // normalize the mean
  for (int i = 0; i < _ncolumns; ++i) _mean[i] = _mean[i] / (_numRows - _invalidValues[i]);
}
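// Hedged illustration (standalone, not H2O code): the loop at the end of passOne()
// rewrites the per-chunk row counts into a cumulative row index, so _nrows[i] ends up
// holding the total number of rows in chunks 0..i. The numbers below are illustrative;
// only the prefix-sum rewrite itself is taken from the code above.
public class NrowsPrefixSumDemo {
  public static void main(String[] args) {
    int[] nrows = {10, 7, 12}; // rows parsed per chunk (illustrative)
    long numRows = 0;
    for (int i = 0; i < nrows.length; ++i) {
      numRows += nrows[i];
      nrows[i] = (int) numRows; // same cumulative rewrite as in passOne()
    }
    System.out.println(java.util.Arrays.toString(nrows) + ", total = " + numRows);
    // prints [10, 17, 29], total = 29
  }
}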
public static String store2Hdfs(Key srcKey) {
  assert srcKey._kb[0] != Key.ARRAYLET_CHUNK;
  assert PersistHdfs.getPathForKey(srcKey) != null; // Validate key name
  Value v = DKV.get(srcKey);
  if (v == null) return "Key " + srcKey + " not found";
  if (v._isArray == 0) { // Simple chunk?
    v.setHdfs(); // Set to HDFS and be done
    return null; // Success
  }

  // For ValueArrays, make the .hex header
  ValueArray ary = ValueArray.value(v);
  String err = PersistHdfs.freeze(srcKey, ary);
  if (err != null) return err;

  // The task managing which chunks to write next,
  // store in a known key
  TaskStore2HDFS ts = new TaskStore2HDFS(srcKey);
  Key selfKey = ts.selfKey();
  UKV.put(selfKey, ts);

  // Then start writing chunks in-order with the zero chunk
  H2ONode chk0_home = ValueArray.getChunkKey(0, srcKey).home_node();
  RPC.call(ts.chunkHome(), ts);

  // Watch the progress key until it gets removed or an error appears
  long idx = 0;
  while (UKV.get(selfKey, ts) != null) {
    if (ts._indexFrom != idx) {
      System.out.print(" " + idx + "/" + ary.chunks());
      idx = ts._indexFrom;
    }
    if (ts._err != null) { // Found an error?
      UKV.remove(selfKey); // Cleanup & report
      return ts._err;
    }
    try {
      Thread.sleep(100);
    } catch (InterruptedException e) {
    }
  }
  System.out.println(" " + ary.chunks() + "/" + ary.chunks());
  // PersistHdfs.refreshHDFSKeys();
  return null;
}
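// Hedged usage sketch: store2Hdfs() above blocks while polling the progress key and
// returns null on success or an error message on failure, so a caller can report the
// result directly. The key name and this wrapper method are illustrative; TaskStore2HDFS
// as the enclosing class is inferred from the constructor call in the method above.
static void exportToHdfs(Key srcKey) {
  String err = TaskStore2HDFS.store2Hdfs(srcKey);
  if (err == null) System.out.println("Export of " + srcKey + " to HDFS finished");
  else System.err.println("Export of " + srcKey + " failed: " + err);
}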
@Override
public void compute() {
  String path = null; // getPathFromValue(val);
  ValueArray ary = ValueArray.value(_arykey);
  Key self = selfKey();

  while (_indexFrom < ary.chunks()) {
    Key ckey = ary.getChunkKey(_indexFrom++);
    if (!ckey.home()) { // Next chunk not At Home?
      RPC.call(chunkHome(), this); // Hand the baton off to the next node/chunk
      return;
    }

    Value val = DKV.get(ckey); // It IS home, so get the data
    _err = PersistHdfs.appendChunk(_arykey, val);
    if (_err != null) return;
    UKV.put(self, this); // Update the progress/self key
  }
  // We did the last chunk. Removing the selfKey is the signal to the web
  // thread that All Done.
  UKV.remove(self);
}
public void testDataFrameStructure(Key k, int rows, int cols) {
  ValueArray v = ValueArray.value(k);
  assertEquals(v.numRows(), rows);
  assertEquals(v.numCols(), cols);
}
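// Hedged usage sketch: checking the shape of a parsed dataset with the helper above.
// The key name and the expected 100x5 shape are illustrative assumptions; only
// testDataFrameStructure() and its (key, rows, cols) argument order come from this
// section.
public void testParsedShape() {
  Key parsed = Key.make("parsed.hex"); // assumed to already hold a parsed ValueArray
  testDataFrameStructure(parsed, 100, 5);
}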