@Override protected void execImpl() {
  Vec va = null, vp = null, avp = null;
  try {
    if (classification) {
      // Create new vectors - this is cheap since they are only adaptation vectors
      va = vactual.toEnum();  // always returns TransfVec
      actual_domain = va._domain;
      vp = vpredict.toEnum(); // always returns TransfVec
      predicted_domain = vp._domain;
      if (!Arrays.equals(actual_domain, predicted_domain)) {
        domain = Utils.domainUnion(actual_domain, predicted_domain);
        int[][] vamap = Model.getDomainMapping(domain, actual_domain, true);
        va = TransfVec.compose((TransfVec) va, vamap, domain, false); // delete original va
        int[][] vpmap = Model.getDomainMapping(domain, predicted_domain, true);
        vp = TransfVec.compose((TransfVec) vp, vpmap, domain, false); // delete original vp
      } else domain = actual_domain;
      // The vectors are from different groups => align them, but properly delete the
      // aligned copy after the computation
      if (!va.group().equals(vp.group())) {
        avp = vp;
        vp = va.align(vp);
      }
      cm = new CM(domain.length).doAll(va, vp)._cm;
    } else {
      mse = new CM(1).doAll(vactual, vpredict).mse();
    }
    return;
  } finally {
    // Delete adaptation vectors
    if (va != null) UKV.remove(va._key);
    if (vp != null) UKV.remove(vp._key);
    if (avp != null) UKV.remove(avp._key);
  }
}
// Adapt a trained model to a test dataset with different enums
/*@Test*/ public void testModelAdapt() {
  File file1 = TestUtil.find_test_file("./smalldata/kaggle/KDDTrain.arff.gz");
  Key fkey1 = NFSFileVec.make(file1);
  Key dest1 = Key.make("KDDTrain.hex");
  File file2 = TestUtil.find_test_file("./smalldata/kaggle/KDDTest.arff.gz");
  Key fkey2 = NFSFileVec.make(file2);
  Key dest2 = Key.make("KDDTest.hex");
  GBM gbm = null;
  Frame fr = null;
  try {
    gbm = new GBM();
    gbm.source = ParseDataset2.parse(dest1, new Key[] {fkey1});
    UKV.remove(fkey1);
    gbm.response = gbm.source.remove(41); // Response is col 41
    gbm.ntrees = 2;
    gbm.max_depth = 8;
    gbm.learn_rate = 0.2f;
    gbm.min_rows = 10;
    gbm.nbins = 50;
    gbm.invoke();

    // The test data set has a few more enums than the training set
    Frame ftest = ParseDataset2.parse(dest2, new Key[] {fkey2});
    Frame preds = gbm.score(ftest);
  } finally {
    UKV.remove(dest1); // Remove original hex frame key
    if (gbm != null) {
      UKV.remove(gbm.dest());        // Remove the model
      UKV.remove(gbm.response._key);
      gbm.remove();                  // Remove GBM Job
      if (fr != null) fr.remove();
    }
  }
}
/** Returns a list of all jobs in the system.
 *  @return list of all jobs, including running, done, cancelled, and crashed jobs. */
public static Job[] all() {
  List list = UKV.get(LIST);
  Job[] jobs = new Job[list == null ? 0 : list._jobs.length];
  int j = 0;
  for (int i = 0; i < jobs.length; i++) {
    Job job = UKV.get(list._jobs[i]);
    if (job != null) jobs[j++] = job;
  }
  if (j < jobs.length) jobs = Arrays.copyOf(jobs, j);
  return jobs;
}
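// Illustrative usage sketch, not from the original source: countJobs is a hypothetical
// helper that polls Job.all() and reports how many jobs the cloud currently tracks.
// It assumes it runs inside a live H2O node where the LIST key is populated.
static int countJobs() {
  Job[] jobs = Job.all();
  return jobs.length;
}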
@Test public void testFullVectAssignment() {
  Key k = loadAndParseKey("cars.hex", "smalldata/cars.csv");
  Key k2 = executeExpression("cars.hex");
  testDataFrameStructure(k2, 406, 8);
  UKV.remove(k2);
  k2 = executeExpression("a5 = cars.hex[2]");
  testVectorExpression("a5", 8, 8, 8, 4, 6, 6);
  UKV.remove(k2);
  UKV.remove(k);
  UKV.remove(Key.make("a5"));
}
// Test kaggle/creditsample-test data
@org.junit.Test public void kaggle_credit() {
  Key okey = loadAndParseFile("credit.hex", "smalldata/kaggle/creditsample-training.csv.gz");
  UKV.remove(Key.make("smalldata/kaggle/creditsample-training.csv.gz_UNZIPPED"));
  UKV.remove(Key.make("smalldata\\kaggle\\creditsample-training.csv.gz_UNZIPPED"));
  ValueArray val = DKV.get(okey).get();

  // Check parsed dataset
  final int n = new int[] {4, 2, 1}[ValueArray.LOG_CHK - 20];
  assertEquals("Number of chunks", n, val.chunks());
  assertEquals("Number of rows", 150000, val.numRows());
  assertEquals("Number of cols", 12, val.numCols());

  // Setup default values for DRF
  int ntrees = 3;
  int depth = 30;
  int gini = StatType.GINI.ordinal();
  int seed = 42;
  StatType statType = StatType.values()[gini];
  final int cols[] = new int[] {0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 1}; // ignore column 6, classify column 1

  // Start the distributed Random Forest
  final Key modelKey = Key.make("model");
  DRFJob result = hex.rf.DRF.execute(
      modelKey, cols, val, ntrees, depth, 1024, statType, seed, true, null, -1,
      Sampling.Strategy.RANDOM, 1.0f, null, 0, 0, false);
  // Wait for completion on all nodes
  RFModel model = result.get();

  assertEquals("Number of classes", 2, model.classes());
  assertEquals("Number of trees", ntrees, model.size());

  model.deleteKeys();
  UKV.remove(modelKey);
  UKV.remove(okey);
}
@Test public void testColumnSelectors() {
  Key k = loadAndParseKey("cars.hex", "smalldata/cars.csv");
  Key k2 = executeExpression("cars.hex[2]");
  testDataFrameStructure(k2, 406, 1);
  testKeyValues(k2, 8, 8, 8, 4, 6, 6);
  UKV.remove(k2);
  k2 = executeExpression("cars.hex$year");
  testDataFrameStructure(k2, 406, 1);
  testKeyValues(k2, 73, 70, 72, 76, 78, 81);
  UKV.remove(k2);
  UKV.remove(k);
}
/*@org.junit.Test*/ public void covtype() {
  // Key okey = loadAndParseFile("covtype.hex", "smalldata/covtype/covtype.20k.data");
  // Key okey = loadAndParseFile("covtype.hex", "../datasets/UCI/UCI-large/covtype/covtype.data");
  // Key okey = loadAndParseFile("covtype.hex", "/home/0xdiag/datasets/standard/covtype.data");
  Key okey = loadAndParseFile("mnist.hex", "smalldata/mnist/mnist8m.10k.csv.gz");
  // Key okey = loadAndParseFile("mnist.hex", "/home/0xdiag/datasets/mnist/mnist8m.csv");
  ValueArray val = UKV.get(okey);

  // Setup default values for DRF
  int ntrees = 8;
  int depth = 999;
  int gini = StatType.ENTROPY.ordinal();
  int seed = 42;
  StatType statType = StatType.values()[gini];
  // Use all other columns as features and put the class column last
  final int cols[] = new int[val.numCols()];
  for (int i = 1; i < cols.length; i++) cols[i - 1] = i;
  cols[cols.length - 1] = 0; // Class is in column 0 for mnist

  // Start the distributed Random Forest
  final Key modelKey = Key.make("model");
  DRFJob result = hex.rf.DRF.execute(
      modelKey, cols, val, ntrees, depth, 1024, statType, seed, true, null, -1,
      Sampling.Strategy.RANDOM, 1.0f, null, 0, 0, false);
  // Wait for completion on all nodes
  RFModel model = result.get();

  assertEquals("Number of classes", 10, model.classes());
  assertEquals("Number of trees", ntrees, model.size());

  model.deleteKeys();
  UKV.remove(modelKey);
  UKV.remove(okey);
}
/**
 * Creates the value header based on the calculated columns.
 *
 * <p>Also stores the header to its appropriate key. This will be the VA header of the parsed
 * dataset.
 */
private void createValueArrayHeader() {
  assert (_phase == Pass.TWO);
  Column[] cols = new Column[_ncolumns];
  int off = 0;
  for (int i = 0; i < cols.length; ++i) {
    cols[i] = new Column();
    cols[i]._n = _numRows - _invalidValues[i];
    cols[i]._base = _bases[i];
    assert (char) pow10i(-_scale[i]) == pow10i(-_scale[i])
        : "scale out of bounds!, col = " + i + ", scale = " + _scale[i];
    cols[i]._scale = (char) pow10i(-_scale[i]);
    cols[i]._off = (char) off;
    cols[i]._size = (byte) COL_SIZES[_colTypes[i]];
    cols[i]._domain = _colDomains[i];
    cols[i]._max = _max[i];
    cols[i]._min = _min[i];
    cols[i]._mean = _mean[i];
    cols[i]._sigma = _sigma[i];
    cols[i]._name = _colNames[i];
    off += Math.abs(cols[i]._size);
  }
  // Let any pending progress reports finish
  DKV.write_barrier();
  // Finally make the value array header
  ValueArray ary = new ValueArray(_resultKey, _numRows, off, cols);
  UKV.put(_resultKey, ary.value());
}
@Test public void testDifferentSizeOps() {
  Key cars = loadAndParseKey("cars.hex", "smalldata/cars.csv");
  Key poker = loadAndParseKey("p.hex", "smalldata/poker/poker-hand-testing.data");
  testVectorExpression("cars.hex$year + p.hex[1]", 74, 82, 81, 84, 86, 81);
  testVectorExpression("cars.hex$year - p.hex[1]", 72, 58, 63, 62, 64, 71);
  testVectorExpression("cars.hex$year * p.hex[1]", 73, 840, 648, 803, 825, 380);
  // testVectorExpression("cars.hex$year / p.hex[1]", 73, 70/12, 8, 76/11, 78/11, 15.2);
  // hard to get the numbers right, and not needed - adds no new coverage
  testVectorExpression("p.hex[1] + cars.hex$year", 74, 82, 81, 84, 86, 81);
  testVectorExpression("p.hex[1] - cars.hex$year", -72, -58, -63, -62, -64, -71);
  testVectorExpression("p.hex[1] * cars.hex$year", 73, 840, 648, 803, 825, 380);
  // testVectorExpression("p.hex[1] / cars.hex$year", 1/73, 12/70, 0.125, 11/76, 11/78, 5/81);
  UKV.remove(poker);
  UKV.remove(cars);
}
// Write-lock & delete 'k'. Will fail if 'k' is locked by anybody other than 'job_key' public static void delete(Key k, Key job_key) { if (k == null) return; Value val = DKV.get(k); if (val == null) return; // Or just nothing there to delete if (!val.isLockable()) UKV.remove(k); // Simple things being deleted else ((Lockable) val.get()).delete(job_key, 0.0f); // Lockable being deleted }
public void set(Argument arg, String input, Object value) {
  if (arg._field.getType() != Key.class && value instanceof Key) value = UKV.get((Key) value);
  try {
    // Coerce the parsed value into the type the target field expects
    if (arg._field.getType() == Key.class && value instanceof ValueArray)
      value = ((ValueArray) value)._key;
    else if (arg._field.getType() == int.class && value instanceof Long)
      value = ((Long) value).intValue();
    else if (arg._field.getType() == float.class && value instanceof Double)
      value = ((Double) value).floatValue();
    else if (arg._field.getType() == Frame.class && value instanceof ValueArray)
      value = ((ValueArray) value).asFrame(input);
    else if (value instanceof NumberSequence) {
      // Expand a number sequence into a primitive array of the expected element type
      double[] ds = ((NumberSequence) value)._arr;
      if (arg._field.getType() == int[].class) {
        int[] is = new int[ds.length];
        for (int i = 0; i < is.length; i++) is[i] = (int) ds[i];
        value = is;
      } else value = ds;
    }
    arg._field.set(this, value);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
/** Actually remove/delete all Vecs from memory, not just from the Frame. */
public void remove(Futures fs) {
  if (vecs().length > 0) {
    for (Vec v : _vecs) UKV.remove(v._key, fs);
  }
  _names = new String[0];
  _vecs = new Vec[0];
  _keys = new Key[0];
}
protected void testScalarExpression(String expr, double result) {
  Key key = executeExpression(expr);
  ValueArray va = ValueArray.value(key);
  assertEquals(va.numRows(), 1);
  assertEquals(va.numCols(), 1);
  assertEquals(result, va.datad(0, 0), 0.0);
  UKV.remove(key);
}
public static ValueArray loadAndParseKey(Key okey, String path) {
  FileIntegrityChecker c = FileIntegrityChecker.check(new File(path), false);
  Key k = c.syncDirectory(null, null, null, null);
  ParseDataset.forkParseDataset(okey, new Key[] {k}, null).get();
  UKV.remove(k);
  ValueArray res = DKV.get(okey).get();
  return res;
}
@Override public boolean toHTML(StringBuilder sb) {
  Job jjob = Job.findJob(job_key);
  DRFModel m = UKV.get(jjob.dest());
  if (m != null) m.generateHTML("DRF Model", sb);
  else DocGen.HTML.paragraph(sb, "Pending...");
  return true;
}
public static String store2Hdfs(Key srcKey) {
  assert srcKey._kb[0] != Key.ARRAYLET_CHUNK;
  assert PersistHdfs.getPathForKey(srcKey) != null; // Validate key name
  Value v = DKV.get(srcKey);
  if (v == null) return "Key " + srcKey + " not found";
  if (v._isArray == 0) { // Simple chunk?
    v.setHdfs();         // Set to HDFS and be done
    return null;         // Success
  }

  // For ValueArrays, make the .hex header
  ValueArray ary = ValueArray.value(v);
  String err = PersistHdfs.freeze(srcKey, ary);
  if (err != null) return err;

  // The task managing which chunks to write next, stored in a known key
  TaskStore2HDFS ts = new TaskStore2HDFS(srcKey);
  Key selfKey = ts.selfKey();
  UKV.put(selfKey, ts);

  // Then start writing chunks in-order with the zero chunk
  H2ONode chk0_home = ValueArray.getChunkKey(0, srcKey).home_node();
  RPC.call(ts.chunkHome(), ts);

  // Watch the progress key until it gets removed or an error appears
  long idx = 0;
  while (UKV.get(selfKey, ts) != null) {
    if (ts._indexFrom != idx) {
      System.out.print(" " + idx + "/" + ary.chunks());
      idx = ts._indexFrom;
    }
    if (ts._err != null) { // Found an error?
      UKV.remove(selfKey); // Cleanup & report
      return ts._err;
    }
    try {
      Thread.sleep(100);
    } catch (InterruptedException e) {
    }
  }
  System.out.println(" " + ary.chunks() + "/" + ary.chunks());
  // PersistHdfs.refreshHDFSKeys();
  return null;
}
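// Illustrative caller sketch, not from the original source: exportOrThrow is a hypothetical
// helper that wraps store2Hdfs (defined above) and turns its error-string contract into an
// exception, so callers do not have to remember that null means success.
static void exportOrThrow(Key srcKey) {
  String err = store2Hdfs(srcKey);
  if (err != null) throw new RuntimeException("HDFS export failed for " + srcKey + ": " + err);
}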
private DRFModel runDRF(Frame data, PrepData dprep) {
  DRF drf = new DRF();
  drf.source = data;
  drf.response = dprep.prep(data);
  drf.ntrees = 1;
  drf.invoke();
  return UKV.get(drf.dest());
}
@Override protected Response serve() {
  Response response = super.serve();
  if (destination_key != null) {
    GridSearch grid = UKV.get(destination_key);
    if (grid != null) jobs = grid.jobs;
  }
  return response;
}
@Test public void testLargeDataOps() {
  Key poker = loadAndParseKey("p.hex", "smalldata/poker/poker-hand-testing.data");
  testVectorExpression("p.hex[1] + p.hex[2]", 2, 15, 13, 15, 12, 7);
  testVectorExpression("p.hex[1] - p.hex[2]", 0, 9, 5, 7, 10, 3);
  testVectorExpression("p.hex[1] * p.hex[2]", 1, 36, 36, 44, 11, 10);
  testVectorExpression("p.hex[1] / p.hex[2]", 1.0, 4.0, 2.25, 2.75, 11.0, 2.5);
  UKV.remove(poker);
}
@Test public void testVectorOperators() {
  Key k = loadAndParseKey("cars.hex", "smalldata/cars.csv");
  testVectorExpression("cars.hex[2] + cars.hex$year", 81, 78, 80, 80, 84, 87);
  testVectorExpression("cars.hex[2] - cars.hex$year", -65, -62, -64, -72, -72, -75);
  testVectorExpression("cars.hex[2] * cars.hex$year", 584, 560, 576, 304, 468, 486);
  testVectorExpression("cars.hex$year / cars.hex[2]", 9.125, 8.75, 9.0, 19.0, 13.0, 13.5);
  UKV.remove(k);
}
static final int findResponseIdx(RFModel model) {
  String nresponse = model.responseName();
  ValueArray ary = UKV.get(model._dataKey);
  int idx = 0;
  for (ValueArray.Column col : ary._cols)
    if (nresponse.equals(col._name)) return idx;
    else idx++;
  return -1;
}
/** Actually remove/delete all Vecs from memory, not just from the Frame. */
public void remove(Futures fs) {
  if (_vecs.length > 0) {
    VectorGroup vg = _vecs[0].group();
    for (Vec v : _vecs) UKV.remove(v._key, fs);
    DKV.remove(vg._key);
  }
  _names = new String[0];
  _vecs = new Vec[0];
}
// ==========================================================================
public void basicGBM(String fname, String hexname, PrepData prep) {
  File file = TestUtil.find_test_file(fname);
  if (file == null) return; // Silently abort test if the file is missing
  Key fkey = NFSFileVec.make(file);
  Key dest = Key.make(hexname);
  GBM gbm = null;
  Frame fr = null;
  try {
    gbm = new GBM();
    gbm.source = fr = ParseDataset2.parse(dest, new Key[] {fkey});
    UKV.remove(fkey);
    int idx = prep.prep(fr);
    if (idx < 0) {
      gbm.classification = false;
      idx = ~idx;
    }
    String rname = fr._names[idx];
    gbm.response = fr.vecs()[idx];
    fr.remove(idx); // Move response to the end
    fr.add(rname, gbm.response);
    gbm.ntrees = 4;
    gbm.max_depth = 4;
    gbm.min_rows = 1;
    gbm.nbins = 50;
    gbm.cols = new int[fr.numCols()];
    for (int i = 0; i < gbm.cols.length; i++) gbm.cols[i] = i;
    gbm.learn_rate = .2f;
    gbm.invoke();

    fr = gbm.score(gbm.source);
    GBM.GBMModel gbmmodel = UKV.get(gbm.dest());
    // System.out.println(gbmmodel.toJava());
  } finally {
    UKV.remove(dest); // Remove original hex frame key
    if (gbm != null) {
      UKV.remove(gbm.dest());        // Remove the model
      UKV.remove(gbm.response._key);
      gbm.remove();                  // Remove GBM Job
      if (fr != null) fr.remove();
    }
  }
}
public void onException(Throwable ex) {
  UKV.remove(dest());
  Value v = DKV.get(progressKey());
  if (v != null) {
    ChunkProgress p = v.get();
    p = p.error(ex.getMessage());
    DKV.put(progressKey(), p);
  }
  cancel(ex);
}
// Block until the Job finishes.
// NOT F/J FRIENDLY, EATS THE THREAD until the job completes. Only use for web threads.
public <T> T get() {
  // TODO through notifications?
  while (DKV.get(_self) != null) {
    try {
      Thread.sleep(10);
    } catch (InterruptedException e) {
      throw new RuntimeException(e);
    }
  }
  return (T) UKV.get(_dest);
}
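// Illustrative usage sketch, not from the original source: trainAndWait is a hypothetical
// helper showing the invoke()/get() pattern, assuming GBM is a Job whose destination key
// holds a GBM.GBMModel once training completes (as the GBM tests above suggest).
static GBM.GBMModel trainAndWait(GBM gbm) {
  gbm.invoke();     // submit the job
  return gbm.get(); // block this (web) thread until the model is ready
}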
public Key importFile(int i, Futures fs) {
  if (_ok[i] < H2O.CLOUD.size()) return null;
  File f = new File(_files[i]);
  Key k;
  if (_newApi) {
    k = PersistNFS.decodeFile(f);
    NFSFileVec nfs = DKV.get(NFSFileVec.make(f, fs)).get();
    UKV.put(k, new Frame(new String[] {"0"}, new Vec[] {nfs}), fs);
  } else {
    k = PersistNFS.decodeFile(f);
    long size = f.length();
    Value val = (size < 2 * ValueArray.CHUNK_SZ)
        ? new Value(k, (int) size, Value.NFS)
        : new Value(k, new ValueArray(k, size), Value.NFS);
    val.setdsk();
    UKV.put(k, val, fs);
  }
  return k;
}
@Override public void compute() {
  String path = null; // getPathFromValue(val);
  ValueArray ary = ValueArray.value(_arykey);
  Key self = selfKey();
  while (_indexFrom < ary.chunks()) {
    Key ckey = ary.getChunkKey(_indexFrom++);
    if (!ckey.home()) {            // Next chunk not At Home?
      RPC.call(chunkHome(), this); // Hand the baton off to the next node/chunk
      return;
    }
    Value val = DKV.get(ckey);     // It IS home, so get the data
    _err = PersistHdfs.appendChunk(_arykey, val);
    if (_err != null) return;
    UKV.put(self, this);           // Update the progress/self key
  }
  // We did the last chunk. Removing the selfKey is the signal to the web
  // thread that All Done.
  UKV.remove(self);
}
protected String[] getVectorDomain(final Vec v) {
  assert v == null || v.isInt() || v.isEnum() : "Cannot get vector domain!";
  if (v == null) return null;
  String[] r = null;
  if (v.isEnum()) {
    r = v.domain();
  } else {
    Vec tmp = v.toEnum();
    r = tmp.domain();
    UKV.remove(tmp._key);
  }
  return r;
}
@Test public void testBigLargeExpression() {
  Key poker = loadAndParseKey("p.hex", "smalldata/poker/poker-hand-testing.data");
  testVectorExpression(
      "p.hex[1] / p.hex[2] + p.hex[3] * p.hex[1] - p.hex[5] + (2* p.hex[1] - (p.hex[2]+3))",
      8, 35, 63.25, 85.75, 116.0, 43.5);
  UKV.remove(poker);
}
@Override protected void execImpl() {
  Vec va = null, vp;
  try {
    va = vactual.toEnum(); // always returns TransfVec
    vp = vpredict;
    // The vectors are from different groups => align them, but properly delete the
    // aligned copy after the computation
    if (!va.group().equals(vp.group())) {
      vp = va.align(vp);
    }
    // Compute thresholds, if not user-given
    if (thresholds != null) {
      sort(thresholds);
      if (Utils.minValue(thresholds) < 0)
        throw new IllegalArgumentException("Minimum threshold cannot be negative.");
      if (Utils.maxValue(thresholds) > 1)
        throw new IllegalArgumentException("Maximum threshold cannot be greater than 1.");
    } else {
      HashSet<Float> hs = new HashSet<Float>();
      final int bins = (int) Math.min(vpredict.length(), 200L);
      final long stride = Math.max(vpredict.length() / bins, 1);
      // data-driven thresholds TODO: use percentiles (from Summary2?)
      for (int i = 0; i < bins; ++i) hs.add(new Float(vpredict.at(i * stride)));
      // always add 0.02-spaced thresholds from 0 to 1
      for (int i = 0; i < 51; ++i) hs.add(new Float(i / 50.));
      // Create a sorted vector of unique thresholds
      thresholds = new float[hs.size()];
      int i = 0;
      for (Float h : hs) thresholds[i++] = h;
      sort(thresholds);
    }
    // Compute CMs
    aucdata = new AUCData()
        .compute(
            new AUCTask(thresholds, va.mean()).doAll(va, vp).getCMs(),
            thresholds,
            va._domain,
            threshold_criterion);
  } finally {
    // Delete adaptation vectors
    if (va != null) UKV.remove(va._key);
  }
}