/**
 * Loads the file at {@code path} into the K/V store, parses it into a ValueArray
 * stored under {@code okey}, and returns the parsed result.  The intermediate raw
 * file key is removed once parsing completes.
 *
 * @param okey destination key for the parsed ValueArray
 * @param path filesystem path of the file to load
 * @return the parsed ValueArray fetched back from DKV under {@code okey}
 */
public static ValueArray loadAndParseKey(Key okey, String path) {
  FileIntegrityChecker checker = FileIntegrityChecker.check(new File(path), false);
  Key rawKey = checker.syncDirectory(null, null, null, null);
  // Blocking parse: forkParseDataset returns a job we immediately wait on.
  ParseDataset.forkParseDataset(okey, new Key[] {rawKey}, null).get();
  UKV.remove(rawKey); // raw file bytes are no longer needed after the parse
  return DKV.get(okey).get();
}
/**
 * Finds and parses the named test file into a Frame stored under {@code outputKey}.
 *
 * @param outputKey destination key for the parsed Frame
 * @param fname test-file path, resolved via find_test_file; asserts if missing
 * @param guessSetup NOTE(review): currently ignored — the call below always uses
 *     {@code ParseSetup.guessSetup(keys, false, 1)} regardless of this flag;
 *     confirm whether the flag was meant to feed into the setup call
 * @return the parsed Frame
 */
protected Frame parse_test_file(Key outputKey, String fname, boolean guessSetup) {
  File f = find_test_file(fname);
  assert f != null && f.exists() : " file not found: " + fname;
  NFSFileVec nfs = NFSFileVec.make(f);
  return ParseDataset.parse(
      outputKey,
      new Key[] {nfs._key},
      true,
      ParseSetup.guessSetup(new Key[] {nfs._key}, false, 1));
}
/**
 * Find &amp; parse a folder of CSV files.  Regular files in the folder are parsed
 * together, in sorted filename order, into a single Frame under a fresh Key.
 * NPE if the folder is not found (via the null deref below) — this is the
 * documented contract.
 *
 * @param fname Test folder name
 * @return Frame or NPE
 */
protected Frame parse_test_folder(String fname) {
  File dir = find_test_file(fname);
  assert dir.isDirectory();
  File[] entries = dir.listFiles();
  Arrays.sort(entries); // deterministic parse order across platforms
  ArrayList<Key> fileKeys = new ArrayList<Key>();
  for (File entry : entries) {
    if (entry.isFile()) fileKeys.add(NFSFileVec.make(entry)._key);
  }
  return ParseDataset.parse(Key.make(), fileKeys.toArray(new Key[fileKeys.size()]));
}
/**
 * Replays a captured prefix of the Chicago-crime demo as raw Rapids expressions:
 * parses the weather/crimes/census CSVs, renames columns, derives date/time
 * features from the crime timestamp, merges the three frames, and splits the
 * merged frame on a runif column.  Correctness is checked only indirectly, via
 * checkSaneFrame() and each exec_str completing without error.  The tmp names
 * (nary_op_5, unary_op_6, ...) mirror the keys the demo capture produced;
 * Keyed.remove calls replay the capture's cleanup — NOTE(review): several
 * removed names (e.g. nary_op_30, unary_op_29) are never created in this test,
 * presumably leftovers from the fuller captured session.
 */
@Test
public void testChicago() {
  Frame weather = null, crimes = null, census = null;
  // Save the cluster time zone: it is global state, mutated below, restored in finally.
  String oldtz = Exec.exec("(getTimeZone)").getStr();
  try {
    weather = parse_test_file(Key.make("weather.hex"), "smalldata/chicago/chicagoAllWeather.csv");
    crimes = parse_test_file(Key.make("crimes.hex"), "smalldata/chicago/chicagoCrimes10k.csv.zip");
    String fname = "smalldata/chicago/chicagoCensus.csv";
    File f = find_test_file(fname);
    assert f != null && f.exists() : " file not found: " + fname;
    NFSFileVec nfs = NFSFileVec.make(f);
    ParseSetup ps = ParseSetup.guessSetup(new Key[] {nfs._key}, false, 1);
    // Force column 1 (the community-area name column) to parse as categorical.
    ps.getColumnTypes()[1] = Vec.T_ENUM;
    census = ParseDataset.parse(Key.make("census.hex"), new Key[] {nfs._key}, true, ps);
    census =
        exec_str(
            "(colnames= census.hex [0 1 2 3 4 5 6 7 8] [\"Community.Area.Number\" \"COMMUNITY.AREA.NAME\" \"PERCENT.OF.HOUSING.CROWDED\" \"PERCENT.HOUSEHOLDS.BELOW.POVERTY\" \"PERCENT.AGED.16..UNEMPLOYED\" \"PERCENT.AGED.25..WITHOUT.HIGH.SCHOOL.DIPLOMA\" \"PERCENT.AGED.UNDER.18.OR.OVER.64\" \"PER.CAPITA.INCOME.\" \"HARDSHIP.INDEX\"])",
            "census.hex");
    crimes =
        exec_str(
            "(colnames= crimes.hex [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21] [\"ID\" \"Case.Number\" \"Date\" \"Block\" \"IUCR\" \"Primary.Type\" \"Description\" \"Location.Description\" \"Arrest\" \"Domestic\" \"Beat\" \"District\" \"Ward\" \"Community.Area\" \"FBI.Code\" \"X.Coordinate\" \"Y.Coordinate\" \"Year\" \"Updated.On\" \"Latitude\" \"Longitude\" \"Location\"])",
            "crimes.hex");
    exec_str("(setTimeZone \"Etc/UTC\")", null);
    // Parse the Date column once into nary_op_5; later expressions reuse it by name.
    crimes =
        exec_str(
            "(colnames= (= crimes.hex (tmp= unary_op_6 (day (tmp= nary_op_5 (as.Date (cols crimes.hex [2]) \"%m/%d/%Y %I:%M:%S %p\")))) [22] [0:9999]) 22 \"Day\")",
            "crimes.hex");
    // month is 0-based in the AST, hence the "+ 1".
    crimes =
        exec_str(
            "(colnames= (= crimes.hex (tmp= binary_op_31 (+ (tmp= unary_op_7 (month nary_op_5)) #1)) [23] [0:9999]) 23 \"Month\")",
            "crimes.hex");
    // NOTE(review): nary_op_30 is never created here; removal replays the captured session.
    Keyed.remove(Key.make("nary_op_30"));
    // year comes back offset from 1900; subtract then add 1900 to normalize in-place.
    crimes =
        exec_str(
            "(colnames= (= crimes.hex (tmp= binary_op_32 (+ (tmp= binary_op_9 (- (tmp= unary_op_8 (year nary_op_5)) #1900)) #1900)) [17] [0:9999]) 17 \"Year\")",
            "crimes.hex");
    crimes =
        exec_str(
            "(colnames= (= crimes.hex (tmp= unary_op_10 (week nary_op_5)) [24] [0:9999]) 24 \"WeekNum\")",
            "crimes.hex");
    Keyed.remove(Key.make("binary_op_32"));
    Keyed.remove(Key.make("binary_op_31"));
    Keyed.remove(Key.make("unary_op_8"));
    checkSaneFrame();
    crimes =
        exec_str(
            "(colnames= (= crimes.hex (tmp= unary_op_11 (dayOfWeek nary_op_5)) [25] [0:9999]) 25 \"WeekDay\")",
            "crimes.hex");
    // NOTE(review): hard-coded Windows NFS key from the original demo capture; harmless
    // elsewhere (remove of a missing key), but verify it is still wanted.
    Keyed.remove(
        Key.make(
            "nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-3\\smalldata\\chicago\\chicagoCrimes10k.csv.zip"));
    crimes =
        exec_str(
            "(colnames= (= crimes.hex (tmp= unary_op_12 (hour nary_op_5)) [26] [0:9999]) 26 \"HourOfDay\")",
            "crimes.hex");
    // Weekend = 1 when WeekDay is "Sun" or "Sat", else 0.
    crimes =
        exec_str(
            "(colnames= (= crimes.hex (tmp= nary_op_16 (ifelse (tmp= binary_op_15 (| (tmp= binary_op_13 (== unary_op_11 \"Sun\")) (tmp= binary_op_14 (== unary_op_11 \"Sat\")))) 1 0)) [27] [0:9999]) 27 \"Weekend\")",
            "crimes.hex");
    // Season is incorrectly assigned in the original chicago demo; picks up the Weekend flag
    crimes = exec_str("(colnames= (= crimes.hex nary_op_16 [28] [0:9999]) 28 \"Season\")", "crimes.hex");
    // Standard "head of 10 rows" pattern for printing
    Frame subset_33 = exec_str("(rows crimes.hex [0:10])", "subset_33");
    Keyed.remove(Key.make("subset_33"));
    // NOTE(review): duplicate remove of subset_33 below replays the capture verbatim.
    Keyed.remove(Key.make("subset_33"));
    Keyed.remove(Key.make("unary_op_29"));
    Keyed.remove(Key.make("nary_op_28"));
    Keyed.remove(Key.make("nary_op_27"));
    Keyed.remove(Key.make("nary_op_26"));
    Keyed.remove(Key.make("binary_op_25"));
    Keyed.remove(Key.make("binary_op_24"));
    Keyed.remove(Key.make("binary_op_23"));
    Keyed.remove(Key.make("binary_op_22"));
    Keyed.remove(Key.make("binary_op_21"));
    Keyed.remove(Key.make("binary_op_20"));
    Keyed.remove(Key.make("binary_op_19"));
    Keyed.remove(Key.make("binary_op_18"));
    Keyed.remove(Key.make("binary_op_17"));
    Keyed.remove(Key.make("nary_op_16"));
    Keyed.remove(Key.make("binary_op_15"));
    Keyed.remove(Key.make("binary_op_14"));
    Keyed.remove(Key.make("binary_op_13"));
    Keyed.remove(Key.make("unary_op_12"));
    Keyed.remove(Key.make("unary_op_11"));
    Keyed.remove(Key.make("unary_op_10"));
    Keyed.remove(Key.make("binary_op_9"));
    Keyed.remove(Key.make("unary_op_8"));
    Keyed.remove(Key.make("unary_op_7"));
    Keyed.remove(Key.make("unary_op_6"));
    Keyed.remove(Key.make("nary_op_5"));
    checkSaneFrame();
    // Standard "head of 10 rows" pattern for printing
    Frame subset_34 = exec_str("(rows crimes.hex [0:10])", "subset_34");
    Keyed.remove(Key.make("subset_34"));
    // Rename census column 0 to "Community.Area" so the merge below joins on it.
    census =
        exec_str(
            "(colnames= census.hex [0 1 2 3 4 5 6 7 8] [\"Community.Area\" \"COMMUNITY.AREA.NAME\" \"PERCENT.OF.HOUSING.CROWDED\" \"PERCENT.HOUSEHOLDS.BELOW.POVERTY\" \"PERCENT.AGED.16..UNEMPLOYED\" \"PERCENT.AGED.25..WITHOUT.HIGH.SCHOOL.DIPLOMA\" \"PERCENT.AGED.UNDER.18.OR.OVER.64\" \"PER.CAPITA.INCOME.\" \"HARDSHIP.INDEX\"])",
            "census.hex");
    Keyed.remove(Key.make("subset_34"));
    Frame subset_35 = exec_str("(cols crimes.hex [-3])", "subset_35");
    Frame subset_36 = exec_str("(cols weather.hex [-1])", "subset_36");
    subset_36 =
        exec_str(
            "(colnames= subset_36 [0 1 2 3 4 5] [\"Month\" \"Day\" \"Year\" \"maxTemp\" \"meanTemp\" \"minTemp\"])",
            "subset_36");
    crimes.remove();
    weather.remove();
    // nary_op_37 = merge( X Y ); Vecs in X & nary_op_37 shared
    Frame nary_op_37 = exec_str("(merge subset_35 census.hex FALSE FALSE)", "nary_op_37");
    // nary_op_38 = merge( nary_op_37 subset_36); Vecs in nary_op_38 and nary_pop_37 and X shared
    // subset_41 keeps rows whose runif draw (fixed seed, deterministic) is <= 0.8.
    Frame subset_41 =
        exec_str(
            "(rows (tmp= nary_op_38 (merge nary_op_37 subset_36 TRUE FALSE)) (tmp= binary_op_40 (<= (tmp= nary_op_39 (h2o.runif nary_op_38 30792152736.5179)) #0.8)))",
            "subset_41");
    // Standard "head of 10 rows" pattern for printing
    Frame subset_44 = exec_str("(rows subset_41 [0:10])", "subset_44");
    Keyed.remove(Key.make("subset_44"));
    Keyed.remove(Key.make("subset_44"));
    Keyed.remove(Key.make("binary_op_40"));
    Keyed.remove(Key.make("nary_op_37"));
    // Complementary split: rows with runif draw > 0.8.
    Frame subset_43 = exec_str("(rows nary_op_38 (tmp= binary_op_42 (> nary_op_39 #0.8)))", "subset_43");
    // Chicago demo continues on past, but this is all I've captured for now
    checkSaneFrame();
  } finally {
    Exec.exec(
        "(setTimeZone \""
            + oldtz
            + "\")"); // Restore time zone (which is global, and will affect following tests)
    if (weather != null) weather.remove();
    if (crimes != null) crimes.remove();
    if (census != null) census.remove();
    // Belt-and-braces cleanup: remove every tmp key this test may have created,
    // whether or not the try-block got far enough to create it.
    for (String s :
        new String[] {
          "nary_op_5", "unary_op_6", "unary_op_7", "unary_op_8", "binary_op_9",
          "unary_op_10", "unary_op_11", "unary_op_12", "binary_op_13", "binary_op_14",
          "binary_op_15", "nary_op_16", "binary_op_17", "binary_op_18", "binary_op_19",
          "binary_op_20", "binary_op_21", "binary_op_22", "binary_op_23", "binary_op_24",
          "binary_op_25", "nary_op_26", "nary_op_27", "nary_op_28", "unary_op_29",
          "binary_op_30", "binary_op_31", "binary_op_32", "subset_33", "subset_34",
          "subset_35", "subset_36", "nary_op_37", "nary_op_38", "nary_op_39",
          "binary_op_40", "subset_41", "binary_op_42", "subset_43", "subset_44",
        })
      Keyed.remove(Key.make(s));
  }
}
/**
 * Static variant of parse_test_file: locates {@code fname} via
 * find_test_file_static and parses it into a Frame stored under {@code outputKey}.
 *
 * @param outputKey destination key for the parsed Frame
 * @param fname test-file path; asserts if the file cannot be found
 * @return the parsed Frame
 */
public static Frame parse_test_file(Key outputKey, String fname) {
  File testFile = find_test_file_static(fname);
  assert testFile != null && testFile.exists() : " file not found: " + fname;
  NFSFileVec fileVec = NFSFileVec.make(testFile);
  return ParseDataset.parse(outputKey, fileVec._key);
}
@Test @Ignore public void run() { Scope.enter(); try { File file = find_test_file("bigdata/laptop/mnist/train.csv.gz"); File valid = find_test_file("bigdata/laptop/mnist/test.csv.gz"); if (file != null) { NFSFileVec trainfv = NFSFileVec.make(file); Frame frame = ParseDataset.parse(Key.make(), trainfv._key); NFSFileVec validfv = NFSFileVec.make(valid); Frame vframe = ParseDataset.parse(Key.make(), validfv._key); DeepLearningParameters p = new DeepLearningParameters(); // populate model parameters p._model_id = Key.make("dl_mnist_model"); p._train = frame._key; // p._valid = vframe._key; p._response_column = "C785"; // last column is the response p._activation = DeepLearningParameters.Activation.RectifierWithDropout; // p._activation = DeepLearningParameters.Activation.MaxoutWithDropout; p._hidden = new int[] {800, 800}; p._input_dropout_ratio = 0.2; p._mini_batch_size = 1; p._train_samples_per_iteration = 50000; p._score_duty_cycle = 0; // p._shuffle_training_data = true; // p._l1= 1e-5; // p._max_w2= 10; p._epochs = 10 * 5. 
/ 6; // Convert response 'C785' to categorical (digits 1 to 10) int ci = frame.find("C785"); Scope.track(frame.replace(ci, frame.vecs()[ci].toEnum())._key); Scope.track(vframe.replace(ci, vframe.vecs()[ci].toEnum())._key); DKV.put(frame); DKV.put(vframe); // speed up training p._adaptive_rate = true; // disable adaptive per-weight learning rate -> default settings for learning rate // and momentum are probably not ideal (slow convergence) p._replicate_training_data = true; // avoid extra communication cost upfront, got enough data on each node for load // balancing p._overwrite_with_best_model = true; // no need to keep the best model around p._classification_stop = -1; p._score_interval = 60; // score and print progress report (only) every 20 seconds p._score_training_samples = 10000; // only score on a small sample of the training set -> don't want to spend too // much time scoring (note: there will be at least 1 row per chunk) DeepLearning dl = new DeepLearning(p); DeepLearningModel model = null; try { model = dl.trainModel().get(); } catch (Throwable t) { t.printStackTrace(); throw new RuntimeException(t); } finally { dl.remove(); if (model != null) { model.delete(); } } } else { Log.info("Please run ./gradlew syncBigDataLaptop in the top-level directory of h2o-3."); } } catch (Throwable t) { t.printStackTrace(); throw new RuntimeException(t); } finally { Scope.exit(); } }