@Override
public Value lazyArrayChunk(final Key key) {
  final Key arykey = ValueArray.getArrayKey(key); // From the base file key
  final long off = (_iceRoot != null) ? 0 : ValueArray.getChunkOffset(key); // The offset
  final Path p = (_iceRoot != null)
      ? new Path(_iceRoot, getIceName(key, (byte) 'V'))
      : new Path(arykey.toString());
  final Size sz = new Size();
  run(new Callable() {
    @Override
    public Object call() throws Exception {
      FileSystem fs = FileSystem.get(p.toUri(), CONF);
      long rem = fs.getFileStatus(p).getLen() - off;
      // Every chunk is CHUNK_SZ bytes except the last one, which absorbs the tail
      // of the file and may therefore be up to 2*CHUNK_SZ long.
      sz._value = (rem > ValueArray.CHUNK_SZ * 2) ? (int) ValueArray.CHUNK_SZ : (int) rem;
      return null;
    }
  }, true, 0);
  Value val = new Value(key, sz._value, Value.HDFS);
  val.setdsk(); // But it's already on disk.
  return val;
}
public static Response redirect(JsonObject fromPageResponse, Key rfModelKey) {
  RFModel rfModel = DKV.get(rfModelKey).get();
  ValueArray data = DKV.get(rfModel._dataKey).get();
  return redirect(
      fromPageResponse,
      null,
      rfModelKey,
      rfModel._dataKey,
      rfModel._totalTrees,
      data.numCols() - 1,
      null,
      true,
      false);
}
@Override
public byte[] load(final Value v) {
  final byte[] b = MemoryManager.malloc1(v._max);
  long skip = 0;
  Key k = v._key;
  final Path p;
  if (_iceRoot != null) {
    p = new Path(_iceRoot, getIceName(v));
  } else {
    // Convert an arraylet chunk into a long offset from the base file.
    if (k._kb[0] == Key.ARRAYLET_CHUNK) {
      skip = ValueArray.getChunkOffset(k); // The offset
      k = ValueArray.getArrayKey(k); // From the base file key
      if (k.toString().endsWith(Extensions.HEX)) { // Hex file?
        int value_len = DKV.get(k).memOrLoad().length; // How long is the ValueArray header?
        skip += value_len;
      }
    }
    p = new Path(k.toString());
  }
  final long skip_ = skip;
  run(new Callable() {
    @Override
    public Object call() throws Exception {
      FileSystem fs = FileSystem.get(p.toUri(), CONF);
      FSDataInputStream s = null;
      try {
        s = fs.open(p);
        // NOTE: the following call degrades performance of HDFS loads through the S3 API:
        //   s.readFully(skip, b, 0, b.length);
        // Guava's (Google API) simple skip-then-read performs much better: loading a 300MB
        // file takes ~14s this way versus ~5min with readFully under the same conditions.
        ByteStreams.skipFully(s, skip_);
        ByteStreams.readFully(s, b);
        assert v.isPersisted();
      } finally {
        Utils.close(s);
      }
      return null;
    }
  }, true, v._max);
  return b;
}
public static KMeansScore score(KMeansModel model, ValueArray ary) {
  KMeansScore kms = new KMeansScore();
  kms._arykey = ary._key;
  kms._cols = model.columnMapping(ary.colNames());
  kms._clusters = model._clusters;
  kms._normalized = model._normalized;
  kms.invoke(ary._key);
  return kms;
}
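// Hypothetical usage sketch (not from the original source): how the score() helper above
// might be driven. 'modelKey' and 'dataKey' are assumed to name a trained KMeansModel and
// the ValueArray to score, both already in the DKV; _rows and _dist are the per-cluster
// row counts and summed distances accumulated by the map() that follows.
public static void printScoreSketch(Key modelKey, Key dataKey) {
  KMeansModel model = DKV.get(modelKey).get();
  ValueArray ary = DKV.get(dataKey).get();
  KMeansScore kms = KMeansScore.score(model, ary);
  for (int c = 0; c < kms._rows.length; c++)
    System.out.println("cluster " + c + ": " + kms._rows[c] + " rows, "
        + kms._dist[c] + " summed distance");
}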
@Override
public void map(Key key) {
  _rows = new long[_clusters.length];
  _dist = new double[_clusters.length];
  assert key.home();
  ValueArray va = DKV.get(_arykey).get();
  AutoBuffer bits = va.getChunk(key);
  int rows = va.rpc(ValueArray.getChunkIndex(key));
  double[] values = new double[_cols.length - 1];
  ClusterDist cd = new ClusterDist();
  for (int row = 0; row < rows; row++) {
    KMeans.datad(va, bits, row, _cols, _normalized, values);
    KMeans.closest(_clusters, values, cd);
    _rows[cd._cluster]++;
    _dist[cd._cluster] += cd._dist;
  }
  // Input fields are no longer needed; clear them so they are not shipped back with the results.
  _arykey = null;
  _cols = null;
  _clusters = null;
}
@Override
public byte[] load(final Value v) {
  final byte[] b = MemoryManager.malloc1(v._max);
  long skip = 0;
  Key k = v._key;
  // Convert an arraylet or fluid-vec chunk key into a long offset from the base file.
  if (k._kb[0] == Key.ARRAYLET_CHUNK) {
    skip = ValueArray.getChunkOffset(k); // The offset
    k = ValueArray.getArrayKey(k); // From the base file key
  } else if (k._kb[0] == Key.DVEC) {
    skip = water.fvec.NFSFileVec.chunkOffset(k); // The offset
  }
  final Path p = (_iceRoot == null)
      ? new Path(getPathForKey(k))
      : new Path(_iceRoot, getIceName(v));
  final long skip_ = skip;
  run(new Callable() {
    @Override
    public Object call() throws Exception {
      FileSystem fs = FileSystem.get(p.toUri(), CONF);
      FSDataInputStream s = null;
      try {
        s = fs.open(p);
        // NOTE: the following call degrades performance of HDFS loads through the S3 API:
        //   s.readFully(skip, b, 0, b.length);
        // Guava's (Google API) simple skip-then-read performs much better: loading a 300MB
        // file takes ~14s this way versus ~5min with readFully under the same conditions.
        ByteStreams.skipFully(s, skip_);
        ByteStreams.readFully(s, b);
        assert v.isPersisted();
      } finally {
        Utils.close(s);
      }
      return null;
    }
  }, true, v._max);
  return b;
}
private void updateClusters(
    int[] clusters, int count, long chunk, long numrows, int rpc, long updatedRow) {
  final int offset = (int) (updatedRow - (rpc * chunk)); // Row offset within the target chunk
  final Key chunkKey = ValueArray.getChunkKey(chunk, _job.dest());
  final int[] message;
  if (count == clusters.length) {
    message = clusters;
  } else {
    message = new int[count];
    System.arraycopy(clusters, 0, message, 0, message.length);
  }
  final int rows = ValueArray.rpc(chunk, rpc, numrows);
  new Atomic() {
    @Override
    public Value atomic(Value val) {
      assert val == null || val._key.equals(chunkKey);
      AutoBuffer b = new AutoBuffer(rows * ROW_SIZE);
      if (val != null) b._bb.put(val.memOrLoad());
      for (int i = 0; i < message.length; i++)
        b.put4((offset + i) * 4, message[i]);
      b.position(b.limit());
      return new Value(chunkKey, b.buf());
    }
  }.invoke(chunkKey);
}
public static Job run(final Key dest, final KMeansModel model, final ValueArray ary) {
  final ChunkProgressJob job = new ChunkProgressJob(ary.chunks(), dest);
  new ValueArray(dest, 0).delete_and_lock(job.self());
  final H2OCountedCompleter fjtask = new H2OCountedCompleter() {
    @Override
    public void compute2() {
      KMeansApply kms = new KMeansApply();
      kms._job = job;
      kms._arykey = ary._key;
      kms._cols = model.columnMapping(ary.colNames());
      kms._clusters = model._clusters;
      kms._normalized = model._normalized;
      kms.invoke(ary._key);
      // Describe the single response column holding the per-row cluster assignment.
      Column c = new Column();
      c._name = Constants.RESPONSE;
      c._size = ROW_SIZE;
      c._scale = 1;
      c._min = 0;
      c._max = model._clusters.length;
      c._mean = Double.NaN;
      c._sigma = Double.NaN;
      c._domain = null;
      c._n = ary.numRows();
      ValueArray res = new ValueArray(dest, ary.numRows(), c._size, new Column[] {c});
      res.unlock(job.self());
      job.remove();
      tryComplete();
    }

    @Override
    public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
      job.onException(ex);
      return super.onExceptionalCompletion(ex, caller);
    }
  };
  job.start(fjtask);
  H2O.submitTask(fjtask);
  return job;
}
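// Hypothetical usage sketch (not from the original source): launch the apply job above,
// which writes one int-sized column of cluster assignments under 'dest', and poll until it
// finishes. 'model' and 'ary' are assumed to be a trained KMeansModel and the ValueArray it
// was built on; the destination key name and the polling loop are assumptions, the loop
// reusing the Job.isRunning(...) check seen in map() below.
public static ValueArray applyAndWait(KMeansModel model, ValueArray ary) throws InterruptedException {
  Key dest = Key.make("kmeans_assignments"); // assumed destination key name
  Job job = run(dest, model, ary);
  while (Job.isRunning(job.self()))
    Thread.sleep(100); // crude wait; a real caller would likely hook the job's completion instead
  return DKV.get(dest).get();
}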
/**
 * Creates a new ValueArray with classes. The new ValueArray is unfortunately not aligned with
 * the source one, so results have to be sent to each chunk owner using Atomic.
 */
@Override
public void map(Key key) {
  assert key.home();
  if (Job.isRunning(_job.self())) {
    ValueArray va = DKV.get(_arykey).get();
    AutoBuffer bits = va.getChunk(key);
    long startRow = va.startRow(ValueArray.getChunkIndex(key));
    int rows = va.rpc(ValueArray.getChunkIndex(key));
    int rpc = (int) (ValueArray.CHUNK_SZ / ROW_SIZE);
    long chunk = ValueArray.chknum(startRow, va.numRows(), ROW_SIZE);
    long updatedChk = chunk;
    long updatedRow = startRow;
    double[] values = new double[_cols.length - 1];
    ClusterDist cd = new ClusterDist();
    int[] clusters = new int[rows];
    int count = 0;
    for (int row = 0; row < rows; row++) {
      KMeans.datad(va, bits, row, _cols, _normalized, values);
      KMeans.closest(_clusters, values, cd);
      chunk = ValueArray.chknum(startRow + row, va.numRows(), ROW_SIZE);
      if (chunk != updatedChk) {
        // Crossed into a new destination chunk: flush the assignments buffered so far.
        updateClusters(clusters, count, updatedChk, va.numRows(), rpc, updatedRow);
        updatedChk = chunk;
        updatedRow = startRow + row;
        count = 0;
      }
      clusters[count++] = cd._cluster;
    }
    if (count > 0) updateClusters(clusters, count, chunk, va.numRows(), rpc, updatedRow);
    _job.updateProgress(1);
  }
  // Input fields are no longer needed; clear them so they are not shipped back with the results.
  _job = null;
  _arykey = null;
  _cols = null;
  _clusters = null;
}