@Test public void testDomains() { Frame frame = parse_test_file("smalldata/junit/weather.csv"); for (String s : new String[] {"MaxWindSpeed", "RelHumid9am", "Cloud9am"}) { Vec v = frame.vec(s); Vec newV = v.toCategoricalVec(); frame.remove(s); frame.add(s, newV); v.remove(); } DKV.put(frame); AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters(); parms._train = frame._key; parms._radius_scale = 10; AggregatorModel agg = new Aggregator(parms).trainModel().get(); Frame output = agg._output._output_frame.get(); Assert.assertTrue(output.numRows() < 0.5 * frame.numRows()); boolean same = true; for (int i = 0; i < frame.numCols(); ++i) { if (frame.vec(i).isCategorical()) { same = (frame.domains()[i].length == output.domains()[i].length); if (!same) break; } } frame.remove(); output.remove(); agg.remove(); Assert.assertFalse(same); }
@Test public void testAggregatorBinary() { CreateFrame cf = new CreateFrame(); cf.rows = 1000; cf.cols = 10; cf.categorical_fraction = 0.6; cf.integer_fraction = 0.0; cf.binary_fraction = 0.0; cf.real_range = 100; cf.integer_range = 100; cf.missing_fraction = 0.1; cf.factors = 5; cf.seed = 1234; Frame frame = cf.execImpl().get(); AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters(); parms._train = frame._key; parms._radius_scale = 1.0; parms._transform = DataInfo.TransformType.NORMALIZE; parms._categorical_encoding = Model.Parameters.CategoricalEncodingScheme.Binary; long start = System.currentTimeMillis(); AggregatorModel agg = new Aggregator(parms).trainModel().get(); // 0.905 System.out.println( "AggregatorModel finished in: " + (System.currentTimeMillis() - start) / 1000. + " seconds"); agg.checkConsistency(); Frame output = agg._output._output_frame.get(); System.out.println(output.toTwoDimTable(0, 10)); Log.info("Number of exemplars: " + agg._exemplars.length); // Assert.assertTrue(agg._exemplars.length==649); output.remove(); frame.remove(); agg.remove(); }
// @Ignore("PUBDEV-1643") @Test public void testDuplicatesCarsGrid() { Grid grid = null; Frame fr = null; Vec old = null; try { fr = parse_test_file("smalldata/junit/cars_20mpg.csv"); fr.remove("name").remove(); // Remove unique id old = fr.remove("economy"); fr.add("economy", old); // response to last column DKV.put(fr); // Setup random hyperparameter search space HashMap<String, Object[]> hyperParms = new HashMap<String, Object[]>() { { put("_ntrees", new Integer[] {5, 5}); put("_max_depth", new Integer[] {2, 2}); put("_mtries", new Integer[] {-1, -1}); put("_sample_rate", new Double[] {.1, .1}); } }; // Fire off a grid search DRFModel.DRFParameters params = new DRFModel.DRFParameters(); params._train = fr._key; params._response_column = "economy"; // Get the Grid for this modeling class and frame Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms); grid = gs.get(); // Check that duplicate model have not been constructed Model[] models = grid.getModels(); assertTrue("Number of returned models has to be > 0", models.length > 0); // But all off them should be same Key<Model> modelKey = models[0]._key; for (Model m : models) { assertTrue("Number of constructed models has to be equal to 1", modelKey == m._key); } } finally { if (old != null) { old.remove(); } if (fr != null) { fr.remove(); } if (grid != null) { grid.remove(); } } }
@Test public void testChunks() { Frame frame = parse_test_file("smalldata/covtype/covtype.20k.data"); AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters(); parms._train = frame._key; parms._radius_scale = 3.0; long start = System.currentTimeMillis(); AggregatorModel agg = new Aggregator(parms).trainModel().get(); // 0.418 System.out.println( "AggregatorModel finished in: " + (System.currentTimeMillis() - start) / 1000. + " seconds"); agg.checkConsistency(); Frame output = agg._output._output_frame.get(); Log.info("Number of exemplars: " + agg._exemplars.length); // Assert.assertTrue(agg._exemplars.length==1993); output.remove(); agg.remove(); for (int i : new int[] {1, 2, 5, 10, 50, 100}) { Key key = Key.make(); RebalanceDataSet rb = new RebalanceDataSet(frame, key, i); H2O.submitTask(rb); rb.join(); Frame rebalanced = DKV.get(key).get(); parms = new AggregatorModel.AggregatorParameters(); parms._train = rebalanced._key; // train on the rebalanced copy so the chunk count actually varies parms._radius_scale = 3.0; start = System.currentTimeMillis(); AggregatorModel agg2 = new Aggregator(parms).trainModel().get(); // 0.373 0.504 0.357 0.454 0.368 0.355 System.out.println( "AggregatorModel finished in: " + (System.currentTimeMillis() - start) / 1000. + " seconds"); agg2.checkConsistency(); Log.info("Number of exemplars for " + i + " chunks: " + agg2._exemplars.length); rebalanced.delete(); Assert.assertTrue( Math.abs(agg._exemplars.length - agg2._exemplars.length) == 0); // < agg._exemplars.length*0); output = agg2._output._output_frame.get(); output.remove(); agg2.remove(); } frame.delete(); }
@Override Val apply(Env env, Env.StackHelp stk, AST asts[]) { Val v = stk.track(asts[1].exec(env)); if (v instanceof ValRow) { ValRow vv = (ValRow) v; return vv.slice(asts[2].columns(vv._names)); } Frame fr = v.getFrame(); int[] cols = asts[2].columns(fr.names()); Frame fr2 = new Frame(); if (cols.length == 0) { // Empty inclusion list? } else if (cols[0] >= 0) { // Positive (inclusion) list if (cols[cols.length - 1] >= fr.numCols()) throw new IllegalArgumentException( "Column must be an integer from 0 to " + (fr.numCols() - 1)); for (int col : cols) fr2.add(fr.names()[col], fr.vecs()[col]); } else { // Negative (exclusion) list fr2 = new Frame(fr); // All of them at first Arrays.sort(cols); // This loop depends on the values in sorted order for (int col : cols) if (0 <= -col - 1 && -col - 1 < fr.numCols()) fr2.remove(-col - 1); // Remove indexed column } return new ValFrame(fr2); }
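// Illustrative usage sketch (not part of the original sources): exercises the inclusion/exclusion
// semantics implemented by the column-slice apply() above through the Rapids "(cols ...)" expression,
// reusing only helpers that appear elsewhere in this file (parse_test_file, Exec.exec, Val, ValFrame).
// The frame key name and the exact Rapids spelling are assumptions.
@Test public void testColsSliceSketch() {
  Frame fr = parse_test_file(Key.make("a.hex"), "smalldata/iris/iris_wheader.csv");
  try {
    // Positive (inclusion) list: keep only the first two columns
    Val inc = Exec.exec("(cols a.hex [0 1])");
    Assert.assertEquals(2, ((ValFrame) inc)._fr.numCols());
    ((ValFrame) inc)._fr.remove();
    // Negative (exclusion) list: -1 encodes "drop column 0" (see fr2.remove(-col - 1) above)
    Val exc = Exec.exec("(cols a.hex [-1])");
    Assert.assertEquals(fr.numCols() - 1, ((ValFrame) exc)._fr.numCols());
    ((ValFrame) exc)._fr.remove();
  } finally {
    fr.delete();
  }
}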
public void dropInteractions() { // only called to cleanup the InteractionWrappedVecs! if (_interactions != null) { Vec[] vecs = _adaptedFrame.remove(_interactionVecs); for (Vec v : vecs) v.remove(); _interactions = null; } }
// Adapt a trained model to a test dataset with different enums /*@Test*/ public void testModelAdapt() { File file1 = TestUtil.find_test_file("./smalldata/kaggle/KDDTrain.arff.gz"); Key fkey1 = NFSFileVec.make(file1); Key dest1 = Key.make("KDDTrain.hex"); File file2 = TestUtil.find_test_file("./smalldata/kaggle/KDDTest.arff.gz"); Key fkey2 = NFSFileVec.make(file2); Key dest2 = Key.make("KDDTest.hex"); GBM gbm = null; Frame fr = null; try { gbm = new GBM(); gbm.source = ParseDataset2.parse(dest1, new Key[] {fkey1}); UKV.remove(fkey1); gbm.response = gbm.source.remove(41); // Response is col 41 gbm.ntrees = 2; gbm.max_depth = 8; gbm.learn_rate = 0.2f; gbm.min_rows = 10; gbm.nbins = 50; gbm.invoke(); // The test data set has a few more enums than the train Frame ftest = ParseDataset2.parse(dest2, new Key[] {fkey2}); Frame preds = gbm.score(ftest); } finally { UKV.remove(dest1); // Remove original hex frame key if (gbm != null) { UKV.remove(gbm.dest()); // Remove the model UKV.remove(gbm.response._key); gbm.remove(); // Remove GBM Job if (fr != null) fr.remove(); } } }
// ========================================================================== public void basicGBM(String fname, String hexname, PrepData prep) { File file = TestUtil.find_test_file(fname); if (file == null) return; // Silently abort test if the file is missing Key fkey = NFSFileVec.make(file); Key dest = Key.make(hexname); GBM gbm = null; Frame fr = null; try { gbm = new GBM(); gbm.source = fr = ParseDataset2.parse(dest, new Key[] {fkey}); UKV.remove(fkey); int idx = prep.prep(fr); if (idx < 0) { gbm.classification = false; idx = ~idx; } String rname = fr._names[idx]; gbm.response = fr.vecs()[idx]; fr.remove(idx); // Move response to the end fr.add(rname, gbm.response); gbm.ntrees = 4; gbm.max_depth = 4; gbm.min_rows = 1; gbm.nbins = 50; gbm.cols = new int[fr.numCols()]; for (int i = 0; i < gbm.cols.length; i++) gbm.cols[i] = i; gbm.learn_rate = .2f; gbm.invoke(); fr = gbm.score(gbm.source); GBM.GBMModel gbmmodel = UKV.get(gbm.dest()); // System.out.println(gbmmodel.toJava()); } finally { UKV.remove(dest); // Remove original hex frame key if (gbm != null) { UKV.remove(gbm.dest()); // Remove the model UKV.remove(gbm.response._key); gbm.remove(); // Remove GBM Job if (fr != null) fr.remove(); } } }
@Test public void testCategoricalProstate() throws InterruptedException, ExecutionException { GLRM job = null; GLRMModel model = null; Frame train = null; final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS try { Scope.enter(); train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv"); for (int i = 0; i < cats.length; i++) Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key); train.remove("ID").remove(); DKV.put(train._key, train); GLRMParameters parms = new GLRMParameters(); parms._train = train._key; parms._k = 8; parms._gamma_x = parms._gamma_y = 0.1; parms._regularization_x = GLRMModel.GLRMParameters.Regularizer.Quadratic; parms._regularization_y = GLRMModel.GLRMParameters.Regularizer.Quadratic; parms._init = GLRM.Initialization.PlusPlus; parms._transform = DataInfo.TransformType.STANDARDIZE; parms._recover_svd = false; parms._max_iterations = 200; try { job = new GLRM(parms); model = job.trainModel().get(); Log.info( "Iteration " + model._output._iterations + ": Objective value = " + model._output._objective); model.score(train).delete(); ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train); Log.info( "Numeric Sum of Squared Error = " + mm._numerr + "\tCategorical Misclassification Error = " + mm._caterr); } catch (Throwable t) { t.printStackTrace(); throw new RuntimeException(t); } finally { job.remove(); } } catch (Throwable t) { t.printStackTrace(); throw new RuntimeException(t); } finally { if (train != null) train.delete(); if (model != null) model.delete(); Scope.exit(); } }
@Test public void testCollisionOfDRFParamsChecksum() { Frame fr = null; try { fr = parse_test_file("smalldata/junit/cars.csv"); fr.remove("name").remove(); Vec old = fr.remove("economy (mpg)"); fr.add("economy (mpg)", old); // response to last column DKV.put(fr); // {"_model_id":null,"_train":{"name":"_83da9e0754c5eb9f6b812fe17e7945e5","type":"Key"},"_valid":null,"_nfolds":0,"_keep_cross_validation_predictions":false,"_fold_assignment":"AUTO","_distribution":"AUTO","_tweedie_power":1.5,"_ignored_columns":null,"_ignore_const_cols":true,"_weights_column":null,"_offset_column":null,"_fold_column":null,"_score_each_iteration":false,"_response_column":"economy (mpg)","_balance_classes":false,"_max_after_balance_size":5.0,"_class_sampling_factors":null,"_max_hit_ratio_k":10,"_max_confusion_matrix_size":20,"_checkpoint":null,"_ntrees":9,"_max_depth":15,"_min_rows":1.0,"_nbins":20,"_nbins_cats":1024,"_r2_stopping":0.999999,"_seed":-4522296119273841674,"_nbins_top_level":1024,"_build_tree_one_node":false,"_initial_score_interval":4000,"_score_interval":4000,"_mtries":3,"_sample_rate":0.6499997,"_binomial_double_trees":false} DRFModel.DRFParameters params1 = new DRFModel.DRFParameters(); params1._train = fr._key; params1._response_column = "economy (mpg)"; params1._seed = -4522296119273841674L; params1._mtries = 3; params1._max_depth = 15; params1._ntrees = 9; params1._sample_rate = 0.6499997f; // {"_model_id":null,"_train":{"name":"_83da9e0754c5eb9f6b812fe17e7945e5","type":"Key"},"_valid":null,"_nfolds":0,"_keep_cross_validation_predictions":false,"_fold_assignment":"AUTO","_distribution":"AUTO","_tweedie_power":1.5,"_ignored_columns":null,"_ignore_const_cols":true,"_weights_column":null,"_offset_column":null,"_fold_column":null,"_score_each_iteration":false,"_response_column":"economy (mpg)","_balance_classes":false,"_max_after_balance_size":5.0,"_class_sampling_factors":null,"_max_hit_ratio_k":10,"_max_confusion_matrix_size":20,"_checkpoint":null,"_ntrees":13,"_max_depth":1,"_min_rows":1.0,"_nbins":20,"_nbins_cats":1024,"_r2_stopping":0.999999,"_seed":-4522296119273841674,"_nbins_top_level":1024,"_build_tree_one_node":false,"_initial_score_interval":4000,"_score_interval":4000,"_mtries":1,"_sample_rate":0.6499997,"_binomial_double_trees":false} DRFModel.DRFParameters params2 = new DRFModel.DRFParameters(); params2._train = fr._key; params2._response_column = "economy (mpg)"; params2._seed = -4522296119273841674L; params2._mtries = 1; params2._max_depth = 1; params2._ntrees = 13; params2._sample_rate = 0.6499997f; long csum1 = params1.checksum(); long csum2 = params2.checksum(); Assert.assertNotEquals("Checksums should be different", csum1, csum2); } finally { if (fr != null) { fr.remove(); } } }
private void checkTree(String tree, boolean expectThrow) { // Frame r = frame(new double[][]{{-1},{1},{2},{3},{4},{5},{6},{254}}); // Key ahex = Key.make("a.hex"); // Frame fr = new Frame(ahex, null, new Vec[]{r.remove(0)}); // r.delete(); // DKV.put(ahex, fr); Frame fr = parse_test_file(Key.make("a.hex"), "smalldata/iris/iris_wheader.csv"); fr.remove(4).remove(); try { Val val = Exec.exec(tree); Assert.assertFalse(expectThrow); System.out.println(val.toString()); if (val instanceof ValFrame) { Frame fr2 = ((ValFrame) val)._fr; System.out.println(fr2.vec(0)); fr2.remove(); } } catch (IllegalArgumentException iae) { if (!expectThrow) throw iae; } finally { fr.delete(); } }
private void applyTrainingFrameSideEffects() { int numCols = _modelBuilderTrain.numCols(); String responseVecName = _modelBuilderTrain.names()[numCols - 1]; Vec responseVec = _modelBuilderTrain.remove(numCols - 1); final boolean use_weights_column = (_parms.weights_column != null); final boolean use_start_column = (_parms.start_column != null); if (use_weights_column) { Vec weightsVec = _parms.weights_column; int idxInRawFrame = _train.find(weightsVec); if (idxInRawFrame < 0) { throw new RuntimeException("CoxPHDriver failed to find weightVec"); } String weightsVecName = _parms.train().names()[idxInRawFrame]; _modelBuilderTrain.add(weightsVecName, weightsVec); } if (use_start_column) { Vec startVec = _parms.start_column; int idxInRawFrame = _train.find(startVec); if (idxInRawFrame < 0) { throw new RuntimeException("CoxPHDriver failed to find startVec"); } String startVecName = _parms.train().names()[idxInRawFrame]; _modelBuilderTrain.add(startVecName, startVec); } { Vec stopVec = _parms.stop_column; int idxInRawFrame = _train.find(stopVec); if (idxInRawFrame < 0) { throw new RuntimeException("CoxPHDriver failed to find stopVec"); } String stopVecName = _parms.train().names()[idxInRawFrame]; _modelBuilderTrain.add(stopVecName, stopVec); } _modelBuilderTrain.add(responseVecName, responseVec); }
@Test public void testCovtype() { Frame frame = parse_test_file("smalldata/covtype/covtype.20k.data"); AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters(); parms._train = frame._key; parms._radius_scale = 5.0; long start = System.currentTimeMillis(); AggregatorModel agg = new Aggregator(parms).trainModel().get(); // 0.179 System.out.println( "AggregatorModel finished in: " + (System.currentTimeMillis() - start) / 1000. + " seconds"); agg.checkConsistency(); frame.delete(); Frame output = agg._output._output_frame.get(); Log.info("Exemplars: " + output.toString()); output.remove(); Log.info("Number of exemplars: " + agg._exemplars.length); // Assert.assertTrue(agg._exemplars.length==615); agg.remove(); }
@Ignore @Test public void testMNIST() { Frame frame = parse_test_file("bigdata/laptop/mnist/train.csv.gz"); AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters(); parms._train = frame._key; parms._radius_scale = 100.0; long start = System.currentTimeMillis(); AggregatorModel agg = new Aggregator(parms).trainModel().get(); System.out.println( "AggregatorModel finished in: " + (System.currentTimeMillis() - start) / 1000. + " seconds"); agg.checkConsistency(); frame.delete(); Frame output = agg._output._output_frame.get(); // Log.info("Exemplars: " + output); output.remove(); Log.info("Number of exemplars: " + agg._exemplars.length); agg.remove(); }
/** * Compute the L2 norm for each row of the frame * * @param fr Input frame * @param scale Per-column scale factors applied before computing the norm * @return Vec containing the L2 norm of each row; the result lives in the K-V store */ public static Vec getL2(final Frame fr, final double[] scale) { // add workspace vec at end final int idx = fr.numCols(); assert (scale.length == idx) : "Mismatch for number of columns"; fr.add("L2", fr.anyVec().makeZero()); Vec res; try { new MRTask2() { @Override public void map(Chunk[] cs) { for (int r = 0; r < cs[0]._len; r++) { double norm2 = 0; for (int i = 0; i < idx; i++) norm2 += Math.pow(cs[i].at0(r) * scale[i], 2); cs[idx].set0(r, Math.sqrt(norm2)); } } }.doAll(fr); } finally { res = fr.remove(idx); } res.rollupStats(); return res; }
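// Minimal usage sketch for getL2 (added for illustration, not from the original sources). It assumes
// the h2o-2 style test helpers used elsewhere in this file (TestUtil.find_test_file, NFSFileVec,
// ParseDataset2, UKV) plus JUnit; the dataset path and key names are arbitrary. With unit scale
// factors the result is the plain Euclidean norm of each row.
@Test public void testGetL2Sketch() {
  File file = TestUtil.find_test_file("./smalldata/iris/iris_wheader.csv");
  Key fkey = NFSFileVec.make(file);
  Key dest = Key.make("iris.hex");
  Frame fr = ParseDataset2.parse(dest, new Key[] {fkey});
  UKV.remove(fkey);
  Vec l2 = null;
  try {
    Vec resp = fr.remove(4); // drop the categorical class column, keep the 4 numeric ones
    UKV.remove(resp._key);
    double[] scale = new double[fr.numCols()];
    Arrays.fill(scale, 1.0); // unit scaling: plain per-row Euclidean norm
    l2 = getL2(fr, scale);
    Assert.assertEquals(fr.numRows(), l2.length()); // one L2 value per row
  } finally {
    if (l2 != null) UKV.remove(l2._key); // result Vec lives in the K-V store
    UKV.remove(dest);
  }
}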
private void applyScoringFrameSideEffects() { final int offset_ncol = _parms.offset_columns == null ? 0 : _parms.offset_columns.length; if (offset_ncol == 0) { return; } int numCols = _modelBuilderTrain.numCols(); String responseVecName = _modelBuilderTrain.names()[numCols - 1]; Vec responseVec = _modelBuilderTrain.remove(numCols - 1); for (int i = 0; i < offset_ncol; i++) { Vec offsetVec = _parms.offset_columns[i]; int idxInRawFrame = _train.find(offsetVec); if (idxInRawFrame < 0) { throw new RuntimeException("CoxPHDriver failed to find offsetVec"); } String offsetVecName = _parms.train().names()[idxInRawFrame]; _modelBuilderTrain.add(offsetVecName, offsetVec); } _modelBuilderTrain.add(responseVecName, responseVec); }
@Ignore @Test public void testCovtypeMemberIndices() { Frame frame = parse_test_file("smalldata/covtype/covtype.20k.data"); AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters(); parms._train = frame._key; parms._radius_scale = 5.0; long start = System.currentTimeMillis(); AggregatorModel agg = new Aggregator(parms).trainModel().get(); // 1.489 System.out.println( "AggregatorModel finished in: " + (System.currentTimeMillis() - start) / 1000. + " seconds"); agg.checkConsistency(); // Frame assignment = new Frame(new Vec[]{(Vec)agg._exemplar_assignment_vec_key.get()}); // Frame.export(assignment, "/tmp/assignment", "yada", true); // Log.info("Exemplars: " + new Frame(new // Vec[]{(Vec)agg._exemplar_assignment_vec_key.get()}).toString(0,20000)); Log.info("Number of exemplars: " + agg._exemplars.length); Key<Frame> memberKey = Key.make(); for (int i = 0; i < agg._exemplars.length; ++i) { Frame members = agg.scoreExemplarMembers(memberKey, i); assert (members.numRows() == agg._counts[i]); // Log.info(members); members.delete(); } Frame output = agg._output._output_frame.get(); output.remove(); Log.info("Number of exemplars: " + agg._exemplars.length); // Assert.assertTrue(agg._exemplars.length==615); frame.delete(); agg.remove(); }
public Frame extractFrame(int startIdx, int endIdx) { Frame f = subframe(startIdx, endIdx); remove(startIdx, endIdx); return f; }
// @Ignore("PUBDEV-1648") @Test public void testRandomCarsGrid() { Grid grid = null; DRFModel drfRebuilt = null; Frame fr = null; try { fr = parse_test_file("smalldata/junit/cars.csv"); fr.remove("name").remove(); Vec old = fr.remove("economy (mpg)"); fr.add("economy (mpg)", old); // response to last column DKV.put(fr); // Setup random hyperparameter search space HashMap<String, Object[]> hyperParms = new HashMap<>(); // Construct random grid search space long seed = System.nanoTime(); Random rng = new Random(seed); // Limit to 1-3 randomly, 4 times. Average total number of models is // 2^4, or 16. Max is 81 models. Integer ntreesDim = rng.nextInt(3) + 1; Integer maxDepthDim = rng.nextInt(3) + 1; Integer mtriesDim = rng.nextInt(3) + 1; Integer sampleRateDim = rng.nextInt(3) + 1; Integer[] ntreesArr = interval(1, 15); ArrayList<Integer> ntreesList = new ArrayList<>(Arrays.asList(ntreesArr)); Collections.shuffle(ntreesList); Integer[] ntreesSpace = new Integer[ntreesDim]; for (int i = 0; i < ntreesDim; i++) { ntreesSpace[i] = ntreesList.get(i); } Integer[] maxDepthArr = interval(1, 10); ArrayList<Integer> maxDepthList = new ArrayList<>(Arrays.asList(maxDepthArr)); Collections.shuffle(maxDepthList); Integer[] maxDepthSpace = new Integer[maxDepthDim]; for (int i = 0; i < maxDepthDim; i++) { maxDepthSpace[i] = maxDepthList.get(i); } Integer[] mtriesArr = interval(1, 5); ArrayList<Integer> mtriesList = new ArrayList<>(Arrays.asList(mtriesArr)); Collections.shuffle(mtriesList); Integer[] mtriesSpace = new Integer[mtriesDim]; for (int i = 0; i < mtriesDim; i++) { mtriesSpace[i] = mtriesList.get(i); } Double[] sampleRateArr = interval(0.01, 0.99, 0.01); ArrayList<Double> sampleRateList = new ArrayList<>(Arrays.asList(sampleRateArr)); Collections.shuffle(sampleRateList); Double[] sampleRateSpace = new Double[sampleRateDim]; for (int i = 0; i < sampleRateDim; i++) { sampleRateSpace[i] = sampleRateList.get(i); } hyperParms.put("_ntrees", ntreesSpace); hyperParms.put("_max_depth", maxDepthSpace); hyperParms.put("_mtries", mtriesSpace); hyperParms.put("_sample_rate", sampleRateSpace); // Fire off a grid search DRFModel.DRFParameters params = new DRFModel.DRFParameters(); params._train = fr._key; params._response_column = "economy (mpg)"; // Get the Grid for this modeling class and frame Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms); grid = gs.get(); System.out.println("Test seed: " + seed); System.out.println("ntrees search space: " + Arrays.toString(ntreesSpace)); System.out.println("max_depth search space: " + Arrays.toString(maxDepthSpace)); System.out.println("mtries search space: " + Arrays.toString(mtriesSpace)); System.out.println("sample_rate search space: " + Arrays.toString(sampleRateSpace)); // Check that cardinality of grid Model[] ms = grid.getModels(); int numModels = ms.length; System.out.println("Grid consists of " + numModels + " models"); assertEquals( "Number of models should match hyper space size", numModels, ntreesDim * maxDepthDim * sampleRateDim * mtriesDim + grid.getFailureCount()); // Pick a random model from the grid HashMap<String, Object[]> randomHyperParms = new HashMap<>(); Integer ntreeVal = ntreesSpace[rng.nextInt(ntreesSpace.length)]; randomHyperParms.put("_ntrees", new Integer[] {ntreeVal}); Integer maxDepthVal = maxDepthSpace[rng.nextInt(maxDepthSpace.length)]; randomHyperParms.put("_max_depth", maxDepthSpace); Integer mtriesVal = mtriesSpace[rng.nextInt(mtriesSpace.length)]; randomHyperParms.put("_max_depth", mtriesSpace); Double sampleRateVal = 
sampleRateSpace[rng.nextInt(sampleRateSpace.length)]; randomHyperParms.put("_sample_rate", sampleRateSpace); // TODO: DRFModel drfFromGrid = (DRFModel) g2.model(randomHyperParms).get(); // Rebuild it with its parameters params._ntrees = ntreeVal; params._max_depth = maxDepthVal; params._mtries = mtriesVal; drfRebuilt = new DRF(params).trainModel().get(); // Make sure the MSE metrics match // double fromGridMSE = drfFromGrid._output._scored_train[drfFromGrid._output._ntrees]._mse; double rebuiltMSE = drfRebuilt._output._scored_train[drfRebuilt._output._ntrees]._mse; // System.out.println("The random grid model's MSE: " + fromGridMSE); System.out.println("The rebuilt model's MSE: " + rebuiltMSE); // assertEquals(fromGridMSE, rebuiltMSE); } finally { if (fr != null) { fr.remove(); } if (grid != null) { grid.remove(); } if (drfRebuilt != null) { drfRebuilt.remove(); } } }
public DataInfo filterExpandedColumns(int[] cols) { assert _predictor_transform != null; assert _response_transform != null; if (cols == null) return deep_clone(); int hasIcpt = (cols.length > 0 && cols[cols.length - 1] == fullN()) ? 1 : 0; int i = 0, j = 0, ignoredCnt = 0; // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub, // double [] normMul, double [] normRespSub, double [] normRespMul){ int[][] catLvls = new int[_cats][]; int[] ignoredCols = MemoryManager.malloc4(_nums + _cats); // first do categoricals... if (_catOffsets != null) { int coff = _useAllFactorLevels ? 0 : 1; while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) { int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]); int k = 0; while (i < cols.length && cols[i] < _catOffsets[j + 1]) levels[k++] = (cols[i++] - _catOffsets[j]) + coff; if (k > 0) catLvls[j] = Arrays.copyOf(levels, k); ++j; } } int[] catModes = _catModes; for (int k = 0; k < catLvls.length; ++k) if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k; if (ignoredCnt > 0) { int[][] cs = new int[_cats - ignoredCnt][]; catModes = new int[_cats - ignoredCnt]; int y = 0; for (int c = 0; c < catLvls.length; ++c) if (catLvls[c] != null) { catModes[y] = _catModes[c]; cs[y++] = catLvls[c]; } assert y == cs.length; catLvls = cs; } // now numerics int prev = j = 0; for (; i < cols.length; ++i) { for (int k = prev; k < (cols[i] - numStart()); ++k) { ignoredCols[ignoredCnt++] = k + _cats; ++j; } prev = ++j; } for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats; Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone()); if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt)); assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols(); double[] normSub = null; double[] normMul = null; int id = Arrays.binarySearch(cols, numStart()); if (id < 0) id = -id - 1; int nnums = cols.length - id - hasIcpt; int off = numStart(); if (_normSub != null) { normSub = new double[nnums]; for (int k = id; k < (id + nnums); ++k) normSub[k - id] = _normSub[cols[k] - off]; } if (_normMul != null) { normMul = new double[nnums]; for (int k = id; k < (id + nnums); ++k) normMul[k - id] = _normMul[cols[k] - off]; } // public DataInfo(Frame train, Frame valid, int nResponses, boolean useAllFactorLevels, // TransformType predictor_transform, TransformType response_transform, boolean skipMissing, // boolean imputeMissing, boolean missingBucket, boolean weight, boolean offset, boolean fold) { DataInfo dinfo = new DataInfo(this, f, normMul, normSub, catLvls, catModes); dinfo._activeCols = cols; return dinfo; }
@Test public void testLosses() throws InterruptedException, ExecutionException { long seed = 0xDECAF; Random rng = new Random(seed); Frame train = null; final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS final GLRMParameters.Regularizer[] regs = new GLRMParameters.Regularizer[] { GLRMParameters.Regularizer.Quadratic, GLRMParameters.Regularizer.L1, GLRMParameters.Regularizer.NonNegative, GLRMParameters.Regularizer.OneSparse, GLRMParameters.Regularizer.UnitOneSparse, GLRMParameters.Regularizer.Simplex }; Scope.enter(); try { train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv"); for (int i = 0; i < cats.length; i++) Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key); train.remove("ID").remove(); DKV.put(train._key, train); for (GLRMParameters.Loss loss : new GLRMParameters.Loss[] { GLRMParameters.Loss.Quadratic, GLRMParameters.Loss.Absolute, GLRMParameters.Loss.Huber, GLRMParameters.Loss.Poisson, GLRMParameters.Loss.Hinge, GLRMParameters.Loss.Logistic }) { for (GLRMParameters.Loss multiloss : new GLRMParameters.Loss[] { GLRMParameters.Loss.Categorical, GLRMParameters.Loss.Ordinal }) { GLRMModel model = null; try { Scope.enter(); long myseed = rng.nextLong(); Log.info("GLRM using seed = " + myseed); GLRMParameters parms = new GLRMParameters(); parms._train = train._key; parms._transform = DataInfo.TransformType.NONE; parms._k = 5; parms._loss = loss; parms._multi_loss = multiloss; parms._init = GLRM.Initialization.SVD; parms._regularization_x = regs[rng.nextInt(regs.length)]; parms._regularization_y = regs[rng.nextInt(regs.length)]; parms._gamma_x = Math.abs(rng.nextDouble()); parms._gamma_y = Math.abs(rng.nextDouble()); parms._recover_svd = false; parms._seed = myseed; parms._verbose = false; parms._max_iterations = 500; GLRM job = new GLRM(parms); try { model = job.trainModel().get(); Log.info( "Iteration " + model._output._iterations + ": Objective value = " + model._output._objective); model.score(train).delete(); ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train); Log.info( "Numeric Sum of Squared Error = " + mm._numerr + "\tCategorical Misclassification Error = " + mm._caterr); } catch (Throwable t) { throw t; } finally { job.remove(); } } catch (Throwable t) { t.printStackTrace(); throw new RuntimeException(t); } finally { if (model != null) model.delete(); Scope.exit(); } } } } finally { if (train != null) train.delete(); Scope.exit(); } }
@Test public void testChicago() { Frame weather = null, crimes = null, census = null; String oldtz = Exec.exec("(getTimeZone)").getStr(); try { weather = parse_test_file(Key.make("weather.hex"), "smalldata/chicago/chicagoAllWeather.csv"); crimes = parse_test_file(Key.make("crimes.hex"), "smalldata/chicago/chicagoCrimes10k.csv.zip"); String fname = "smalldata/chicago/chicagoCensus.csv"; File f = find_test_file(fname); assert f != null && f.exists() : " file not found: " + fname; NFSFileVec nfs = NFSFileVec.make(f); ParseSetup ps = ParseSetup.guessSetup(new Key[] {nfs._key}, false, 1); ps.getColumnTypes()[1] = Vec.T_ENUM; census = ParseDataset.parse(Key.make("census.hex"), new Key[] {nfs._key}, true, ps); census = exec_str( "(colnames= census.hex [0 1 2 3 4 5 6 7 8] [\"Community.Area.Number\" \"COMMUNITY.AREA.NAME\" \"PERCENT.OF.HOUSING.CROWDED\" \"PERCENT.HOUSEHOLDS.BELOW.POVERTY\" \"PERCENT.AGED.16..UNEMPLOYED\" \"PERCENT.AGED.25..WITHOUT.HIGH.SCHOOL.DIPLOMA\" \"PERCENT.AGED.UNDER.18.OR.OVER.64\" \"PER.CAPITA.INCOME.\" \"HARDSHIP.INDEX\"])", "census.hex"); crimes = exec_str( "(colnames= crimes.hex [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21] [\"ID\" \"Case.Number\" \"Date\" \"Block\" \"IUCR\" \"Primary.Type\" \"Description\" \"Location.Description\" \"Arrest\" \"Domestic\" \"Beat\" \"District\" \"Ward\" \"Community.Area\" \"FBI.Code\" \"X.Coordinate\" \"Y.Coordinate\" \"Year\" \"Updated.On\" \"Latitude\" \"Longitude\" \"Location\"])", "crimes.hex"); exec_str("(setTimeZone \"Etc/UTC\")", null); crimes = exec_str( "(colnames= (= crimes.hex (tmp= unary_op_6 (day (tmp= nary_op_5 (as.Date (cols crimes.hex [2]) \"%m/%d/%Y %I:%M:%S %p\")))) [22] [0:9999]) 22 \"Day\")", "crimes.hex"); crimes = exec_str( "(colnames= (= crimes.hex (tmp= binary_op_31 (+ (tmp= unary_op_7 (month nary_op_5)) #1)) [23] [0:9999]) 23 \"Month\")", "crimes.hex"); Keyed.remove(Key.make("nary_op_30")); crimes = exec_str( "(colnames= (= crimes.hex (tmp= binary_op_32 (+ (tmp= binary_op_9 (- (tmp= unary_op_8 (year nary_op_5)) #1900)) #1900)) [17] [0:9999]) 17 \"Year\")", "crimes.hex"); crimes = exec_str( "(colnames= (= crimes.hex (tmp= unary_op_10 (week nary_op_5)) [24] [0:9999]) 24 \"WeekNum\")", "crimes.hex"); Keyed.remove(Key.make("binary_op_32")); Keyed.remove(Key.make("binary_op_31")); Keyed.remove(Key.make("unary_op_8")); checkSaneFrame(); crimes = exec_str( "(colnames= (= crimes.hex (tmp= unary_op_11 (dayOfWeek nary_op_5)) [25] [0:9999]) 25 \"WeekDay\")", "crimes.hex"); Keyed.remove( Key.make( "nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-3\\smalldata\\chicago\\chicagoCrimes10k.csv.zip")); crimes = exec_str( "(colnames= (= crimes.hex (tmp= unary_op_12 (hour nary_op_5)) [26] [0:9999]) 26 \"HourOfDay\")", "crimes.hex"); crimes = exec_str( "(colnames= (= crimes.hex (tmp= nary_op_16 (ifelse (tmp= binary_op_15 (| (tmp= binary_op_13 (== unary_op_11 \"Sun\")) (tmp= binary_op_14 (== unary_op_11 \"Sat\")))) 1 0)) [27] [0:9999]) 27 \"Weekend\")", "crimes.hex"); // Season is incorrectly assigned in the original chicago demo; picks up the Weekend flag crimes = exec_str( "(colnames= (= crimes.hex nary_op_16 [28] [0:9999]) 28 \"Season\")", "crimes.hex"); // Standard "head of 10 rows" pattern for printing Frame subset_33 = exec_str("(rows crimes.hex [0:10])", "subset_33"); Keyed.remove(Key.make("subset_33")); Keyed.remove(Key.make("subset_33")); Keyed.remove(Key.make("unary_op_29")); Keyed.remove(Key.make("nary_op_28")); Keyed.remove(Key.make("nary_op_27")); Keyed.remove(Key.make("nary_op_26")); 
Keyed.remove(Key.make("binary_op_25")); Keyed.remove(Key.make("binary_op_24")); Keyed.remove(Key.make("binary_op_23")); Keyed.remove(Key.make("binary_op_22")); Keyed.remove(Key.make("binary_op_21")); Keyed.remove(Key.make("binary_op_20")); Keyed.remove(Key.make("binary_op_19")); Keyed.remove(Key.make("binary_op_18")); Keyed.remove(Key.make("binary_op_17")); Keyed.remove(Key.make("nary_op_16")); Keyed.remove(Key.make("binary_op_15")); Keyed.remove(Key.make("binary_op_14")); Keyed.remove(Key.make("binary_op_13")); Keyed.remove(Key.make("unary_op_12")); Keyed.remove(Key.make("unary_op_11")); Keyed.remove(Key.make("unary_op_10")); Keyed.remove(Key.make("binary_op_9")); Keyed.remove(Key.make("unary_op_8")); Keyed.remove(Key.make("unary_op_7")); Keyed.remove(Key.make("unary_op_6")); Keyed.remove(Key.make("nary_op_5")); checkSaneFrame(); // Standard "head of 10 rows" pattern for printing Frame subset_34 = exec_str("(rows crimes.hex [0:10])", "subset_34"); Keyed.remove(Key.make("subset_34")); census = exec_str( "(colnames= census.hex [0 1 2 3 4 5 6 7 8] [\"Community.Area\" \"COMMUNITY.AREA.NAME\" \"PERCENT.OF.HOUSING.CROWDED\" \"PERCENT.HOUSEHOLDS.BELOW.POVERTY\" \"PERCENT.AGED.16..UNEMPLOYED\" \"PERCENT.AGED.25..WITHOUT.HIGH.SCHOOL.DIPLOMA\" \"PERCENT.AGED.UNDER.18.OR.OVER.64\" \"PER.CAPITA.INCOME.\" \"HARDSHIP.INDEX\"])", "census.hex"); Keyed.remove(Key.make("subset_34")); Frame subset_35 = exec_str("(cols crimes.hex [-3])", "subset_35"); Frame subset_36 = exec_str("(cols weather.hex [-1])", "subset_36"); subset_36 = exec_str( "(colnames= subset_36 [0 1 2 3 4 5] [\"Month\" \"Day\" \"Year\" \"maxTemp\" \"meanTemp\" \"minTemp\"])", "subset_36"); crimes.remove(); weather.remove(); // nary_op_37 = merge( X Y ); Vecs in X & nary_op_37 shared Frame nary_op_37 = exec_str("(merge subset_35 census.hex FALSE FALSE)", "nary_op_37"); // nary_op_38 = merge( nary_op_37 subset_36); Vecs in nary_op_38 and nary_pop_37 and X shared Frame subset_41 = exec_str( "(rows (tmp= nary_op_38 (merge nary_op_37 subset_36 TRUE FALSE)) (tmp= binary_op_40 (<= (tmp= nary_op_39 (h2o.runif nary_op_38 30792152736.5179)) #0.8)))", "subset_41"); // Standard "head of 10 rows" pattern for printing Frame subset_44 = exec_str("(rows subset_41 [0:10])", "subset_44"); Keyed.remove(Key.make("subset_44")); Keyed.remove(Key.make("subset_44")); Keyed.remove(Key.make("binary_op_40")); Keyed.remove(Key.make("nary_op_37")); Frame subset_43 = exec_str("(rows nary_op_38 (tmp= binary_op_42 (> nary_op_39 #0.8)))", "subset_43"); // Chicago demo continues on past, but this is all I've captured for now checkSaneFrame(); } finally { Exec.exec( "(setTimeZone \"" + oldtz + "\")"); // Restore time zone (which is global, and will affect following tests) if (weather != null) weather.remove(); if (crimes != null) crimes.remove(); if (census != null) census.remove(); for (String s : new String[] { "nary_op_5", "unary_op_6", "unary_op_7", "unary_op_8", "binary_op_9", "unary_op_10", "unary_op_11", "unary_op_12", "binary_op_13", "binary_op_14", "binary_op_15", "nary_op_16", "binary_op_17", "binary_op_18", "binary_op_19", "binary_op_20", "binary_op_21", "binary_op_22", "binary_op_23", "binary_op_24", "binary_op_25", "nary_op_26", "nary_op_27", "nary_op_28", "unary_op_29", "binary_op_30", "binary_op_31", "binary_op_32", "subset_33", "subset_34", "subset_35", "subset_36", "nary_op_37", "nary_op_38", "nary_op_39", "binary_op_40", "subset_41", "binary_op_42", "subset_43", "subset_44", }) Keyed.remove(Key.make(s)); } }
@Test public void testSetColumnLossCats() throws InterruptedException, ExecutionException { GLRM job = null; GLRMModel model = null; Frame train = null; final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS Scope.enter(); try { train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv"); for (int i = 0; i < cats.length; i++) Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key); train.remove("ID").remove(); DKV.put(train._key, train); GLRMParameters parms = new GLRMParameters(); parms._train = train._key; parms._k = 12; parms._loss = GLRMParameters.Loss.Quadratic; parms._multi_loss = GLRMParameters.Loss.Categorical; parms._loss_by_col = new GLRMParameters.Loss[] { GLRMParameters.Loss.Ordinal, GLRMParameters.Loss.Poisson, GLRMParameters.Loss.Absolute }; parms._loss_by_col_idx = new int[] {3 /* DPROS */, 1 /* AGE */, 6 /* VOL */}; parms._init = GLRM.Initialization.PlusPlus; parms._min_step_size = 1e-5; parms._recover_svd = false; parms._max_iterations = 2000; try { job = new GLRM(parms); model = job.trainModel().get(); Log.info( "Iteration " + model._output._iterations + ": Objective value = " + model._output._objective); GLRMTest.checkLossbyCol(parms, model); model.score(train).delete(); ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train); Log.info( "Numeric Sum of Squared Error = " + mm._numerr + "\tCategorical Misclassification Error = " + mm._caterr); } catch (Throwable t) { t.printStackTrace(); throw new RuntimeException(t); } finally { job.remove(); } } catch (Throwable t) { t.printStackTrace(); throw new RuntimeException(t); } finally { if (train != null) train.delete(); if (model != null) model.delete(); Scope.exit(); } }
@Test public void testExpandCatsProstate() throws InterruptedException, ExecutionException { double[][] prostate = ard( ard(0, 71, 1, 0, 0, 4.8, 14.0, 7), ard(1, 70, 1, 1, 0, 8.4, 21.8, 5), ard(0, 73, 1, 3, 0, 10.0, 27.4, 6), ard(1, 68, 1, 0, 0, 6.7, 16.7, 6)); double[][] pros_expandR = ard( ard(1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 71, 4.8, 14.0, 7), ard(0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 70, 8.4, 21.8, 5), ard(0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 73, 10.0, 27.4, 6), ard(1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 68, 6.7, 16.7, 6)); String[] pros_cols = new String[] {"Capsule", "Age", "Race", "Dpros", "Dcaps", "PSA", "Vol", "Gleason"}; String[][] pros_domains = new String[][] { new String[] {"No", "Yes"}, null, new String[] {"Other", "White", "Black"}, new String[] {"None", "UniLeft", "UniRight", "Bilobar"}, new String[] {"No", "Yes"}, null, null, null }; final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS Frame fr = null; try { Scope.enter(); fr = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv"); for (int i = 0; i < cats.length; i++) Scope.track(fr.replace(cats[i], fr.vec(cats[i]).toCategoricalVec())._key); fr.remove("ID").remove(); DKV.put(fr._key, fr); DataInfo dinfo = new DataInfo( Key.make(), fr, null, 0, true, DataInfo.TransformType.NONE, DataInfo.TransformType.NONE, false, false, false, /* weights */ false, /* offset */ false, /* fold */ false); Log.info("Original matrix:\n" + colFormat(pros_cols, "%8.7s") + ArrayUtils.pprint(prostate)); double[][] pros_perm = ArrayUtils.permuteCols(prostate, dinfo._permutation); Log.info( "Permuted matrix:\n" + colFormat(pros_cols, "%8.7s", dinfo._permutation) + ArrayUtils.pprint(pros_perm)); double[][] pros_exp = GLRM.expandCats(pros_perm, dinfo); Log.info( "Expanded matrix:\n" + colExpFormat(pros_cols, pros_domains, "%8.7s", dinfo._permutation) + ArrayUtils.pprint(pros_exp)); Assert.assertArrayEquals(pros_expandR, pros_exp); } catch (Throwable t) { t.printStackTrace(); throw new RuntimeException(t); } finally { if (fr != null) fr.delete(); Scope.exit(); } }
void testModelAdaptation(String train, String test, PrepData dprep, boolean exactAdaptation) { DRFModel model = null; Frame frTest = null; Frame frTrain = null; Key trainKey = Key.make("train.hex"); Key testKey = Key.make("test.hex"); Frame[] frAdapted = null; try { // Prepare a simple model frTrain = parseFrame(trainKey, train); model = runDRF(frTrain, dprep); // Load test dataset - the test data contains input columns matching the train data, // BUT each input requires adaptation. Moreover, the test data contains additional columns // holding the correct value mapping. frTest = parseFrame(testKey, test); Assert.assertEquals( "TEST CONF ERROR: The test dataset should contain 2*<number of input columns>+1!", 2 * (frTrain.numCols() - 1) + 1, frTest.numCols()); // Adapt test dataset frAdapted = model.adapt(frTest, exactAdaptation); // do/do not perform translation to enums Assert.assertEquals("Adapt method should return two frames", 2, frAdapted.length); Assert.assertEquals( "Test expects that all columns in the test dataset have to be adapted", dprep.needAdaptation(frTrain), frAdapted[1].numCols()); // Compare vectors Frame adaptedFrame = frAdapted[0]; // System.err.println(frTest.toStringAll()); // System.err.println(adaptedFrame.toStringAll()); for (int av = 0; av < frTrain.numCols() - 1; av++) { int ev = av + frTrain.numCols(); Vec actV = adaptedFrame.vecs()[av]; Vec expV = frTest.vecs()[ev]; Assert.assertEquals( "Different number of rows in test vectors", expV.length(), actV.length()); for (long r = 0; r < expV.length(); r++) { if (expV.isNA(r)) Assert.assertTrue( "Badly adapted vector - expected NA! Col: " + av + ", row: " + r, actV.isNA(r)); else { Assert.assertTrue( "Badly adapted vector - expected a value but got NA! Col: " + av + ", row: " + r, !actV.isNA(r)); Assert.assertEquals( "Badly adapted vector - wrong values! Col: " + av + ", row: " + r, expV.at8(r), actV.at8(r)); } } } } finally { // Test cleanup if (model != null) UKV.remove(model._selfKey); if (frTrain != null) frTrain.remove(); UKV.remove(trainKey); if (frTest != null) frTest.remove(); UKV.remove(testKey); // Remove adapted vectors which were saved into the KV-store; the rest of the vectors are removed by // frTest.remove() if (frAdapted != null) frAdapted[1].remove(); } }
public void dropWeights() { if (!_weights) return; _adaptedFrame.remove(weightChunkId()); _weights = false; }
public DataInfo filterExpandedColumns(int[] cols) { assert _predictor_transform != null; assert _response_transform != null; if (cols == null) return this; int i = 0, j = 0, ignoredCnt = 0; // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub, // double [] normMul, double [] normRespSub, double [] normRespMul){ int[][] catLvls = new int[_cats][]; int[] ignoredCols = MemoryManager.malloc4(_nums + _cats); // first do categoricals... if (_catOffsets != null) { int coff = _useAllFactorLevels ? 0 : 1; while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) { int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]); int k = 0; while (i < cols.length && cols[i] < _catOffsets[j + 1]) levels[k++] = cols[i++] - _catOffsets[j] + coff; if (k > 0) catLvls[j] = Arrays.copyOf(levels, k); ++j; } } for (int k = 0; k < catLvls.length; ++k) if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k; if (ignoredCnt > 0) { int[][] c = new int[_cats - ignoredCnt][]; int y = 0; for (int[] catLvl : catLvls) if (catLvl != null) c[y++] = catLvl; assert y == c.length; catLvls = c; } // now numerics int prev = j = 0; for (; i < cols.length; ++i) { for (int k = prev; k < (cols[i] - numStart()); ++k) { ignoredCols[ignoredCnt++] = k + _cats; ++j; } prev = ++j; } for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats; Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone()); if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt)); assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols(); double[] normSub = null; double[] normMul = null; int id = Arrays.binarySearch(cols, numStart()); if (id < 0) id = -id - 1; int nnums = cols.length - id; int off = numStart(); if (_normSub != null) { normSub = new double[nnums]; for (int k = id; k < cols.length; ++k) normSub[k - id] = _normSub[cols[k] - off]; } if (_normMul != null) { normMul = new double[nnums]; for (int k = id; k < cols.length; ++k) normMul[k - id] = _normMul[cols[k] - off]; } DataInfo dinfo = new DataInfo( _key, f, normMul, normSub, catLvls, _responses, _predictor_transform, _response_transform, _skipMissing, _imputeMissing, _weights, _offset, _fold); // do not put activeData into K/V - active data is recreated on each node based on active // columns dinfo._activeCols = cols; return dinfo; }
// @Ignore("PUBDEV-1648") @Test public void testRandomCarsGrid() { Grid grid = null; GBMModel gbmRebuilt = null; Frame fr = null; Vec old = null; try { fr = parse_test_file("smalldata/junit/cars.csv"); fr.remove("name").remove(); old = fr.remove("economy (mpg)"); fr.add("economy (mpg)", old); // response to last column DKV.put(fr); // Setup random hyperparameter search space HashMap<String, Object[]> hyperParms = new HashMap<>(); hyperParms.put("_distribution", new DistributionFamily[] {DistributionFamily.gaussian}); // Construct random grid search space Random rng = new Random(); Integer ntreesDim = rng.nextInt(4) + 1; Integer maxDepthDim = rng.nextInt(4) + 1; Integer learnRateDim = rng.nextInt(4) + 1; Integer[] ntreesArr = interval(1, 25); ArrayList<Integer> ntreesList = new ArrayList<>(Arrays.asList(ntreesArr)); Collections.shuffle(ntreesList); Integer[] ntreesSpace = new Integer[ntreesDim]; for (int i = 0; i < ntreesDim; i++) { ntreesSpace[i] = ntreesList.get(i); } Integer[] maxDepthArr = interval(1, 10); ArrayList<Integer> maxDepthList = new ArrayList<>(Arrays.asList(maxDepthArr)); Collections.shuffle(maxDepthList); Integer[] maxDepthSpace = new Integer[maxDepthDim]; for (int i = 0; i < maxDepthDim; i++) { maxDepthSpace[i] = maxDepthList.get(i); } Double[] learnRateArr = interval(0.01, 1.0, 0.01); ArrayList<Double> learnRateList = new ArrayList<>(Arrays.asList(learnRateArr)); Collections.shuffle(learnRateList); Double[] learnRateSpace = new Double[learnRateDim]; for (int i = 0; i < learnRateDim; i++) { learnRateSpace[i] = learnRateList.get(i); } hyperParms.put("_ntrees", ntreesSpace); hyperParms.put("_max_depth", maxDepthSpace); hyperParms.put("_learn_rate", learnRateSpace); // Fire off a grid search GBMModel.GBMParameters params = new GBMModel.GBMParameters(); params._train = fr._key; params._response_column = "economy (mpg)"; // Get the Grid for this modeling class and frame Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms); grid = gs.get(); System.out.println("ntrees search space: " + Arrays.toString(ntreesSpace)); System.out.println("max_depth search space: " + Arrays.toString(maxDepthSpace)); System.out.println("learn_rate search space: " + Arrays.toString(learnRateSpace)); // Check that cardinality of grid Model[] ms = grid.getModels(); Integer numModels = ms.length; System.out.println("Grid consists of " + numModels + " models"); assertTrue(numModels == ntreesDim * maxDepthDim * learnRateDim); // Pick a random model from the grid HashMap<String, Object[]> randomHyperParms = new HashMap<>(); randomHyperParms.put("_distribution", new DistributionFamily[] {DistributionFamily.gaussian}); Integer ntreeVal = ntreesSpace[rng.nextInt(ntreesSpace.length)]; randomHyperParms.put("_ntrees", new Integer[] {ntreeVal}); Integer maxDepthVal = maxDepthSpace[rng.nextInt(maxDepthSpace.length)]; randomHyperParms.put("_max_depth", maxDepthSpace); Double learnRateVal = learnRateSpace[rng.nextInt(learnRateSpace.length)]; randomHyperParms.put("_learn_rate", learnRateSpace); // TODO: GBMModel gbmFromGrid = (GBMModel) g2.model(randomHyperParms).get(); // Rebuild it with it's parameters params._distribution = DistributionFamily.gaussian; params._ntrees = ntreeVal; params._max_depth = maxDepthVal; params._learn_rate = learnRateVal; GBM gbm = new GBM(params); gbmRebuilt = gbm.trainModel().get(); assertTrue(gbm.isStopped()); // Make sure the MSE metrics match // double fromGridMSE = gbmFromGrid._output._scored_train[gbmFromGrid._output._ntrees]._mse; double rebuiltMSE = 
gbmRebuilt._output._scored_train[gbmRebuilt._output._ntrees]._mse; // System.out.println("The random grid model's MSE: " + fromGridMSE); System.out.println("The rebuilt model's MSE: " + rebuiltMSE); // assertEquals(fromGridMSE, rebuiltMSE); } finally { if (old != null) old.remove(); if (fr != null) fr.remove(); if (grid != null) grid.remove(); if (gbmRebuilt != null) gbmRebuilt.remove(); } }
@Test public void testCarsGrid() { Grid<GBMModel.GBMParameters> grid = null; Frame fr = null; Vec old = null; try { fr = parse_test_file("smalldata/junit/cars.csv"); fr.remove("name").remove(); // Remove unique id old = fr.remove("cylinders"); fr.add("cylinders", old.toCategoricalVec()); // response to last column DKV.put(fr); // Setup hyperparameter search space final Double[] legalLearnRateOpts = new Double[] {0.01, 0.1, 0.3}; final Double[] illegalLearnRateOpts = new Double[] {-1.0}; HashMap<String, Object[]> hyperParms = new HashMap<String, Object[]>() { { put("_ntrees", new Integer[] {1, 2}); put("_distribution", new DistributionFamily[] {DistributionFamily.multinomial}); put("_max_depth", new Integer[] {1, 2, 5}); put("_learn_rate", ArrayUtils.join(legalLearnRateOpts, illegalLearnRateOpts)); } }; // Name of used hyper parameters String[] hyperParamNames = hyperParms.keySet().toArray(new String[hyperParms.size()]); Arrays.sort(hyperParamNames); int hyperSpaceSize = ArrayUtils.crossProductSize(hyperParms); // Fire off a grid search GBMModel.GBMParameters params = new GBMModel.GBMParameters(); params._train = fr._key; params._response_column = "cylinders"; // Get the Grid for this modeling class and frame Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms); grid = (Grid<GBMModel.GBMParameters>) gs.get(); // Make sure number of produced models match size of specified hyper space Assert.assertEquals( "Size of grid (models+failures) should match to size of hyper space", hyperSpaceSize, grid.getModelCount() + grid.getFailureCount()); // // Make sure that names of used parameters match // String[] gridHyperNames = grid.getHyperNames(); Arrays.sort(gridHyperNames); Assert.assertArrayEquals( "Hyper parameters names should match!", hyperParamNames, gridHyperNames); // // Make sure that values of used parameters match as well to the specified values // Key<Model>[] mKeys = grid.getModelKeys(); Map<String, Set<Object>> usedHyperParams = GridTestUtils.initMap(hyperParamNames); for (Key<Model> mKey : mKeys) { GBMModel gbm = (GBMModel) mKey.get(); System.out.println( gbm._output._scored_train[gbm._output._ntrees]._mse + " " + Arrays.deepToString( ArrayUtils.zip(grid.getHyperNames(), grid.getHyperValues(gbm._parms)))); GridTestUtils.extractParams(usedHyperParams, gbm._parms, hyperParamNames); } // Remove illegal options hyperParms.put("_learn_rate", legalLearnRateOpts); GridTestUtils.assertParamsEqual( "Grid models parameters have to cover specified hyper space", hyperParms, usedHyperParams); // Verify model failure Map<String, Set<Object>> failedHyperParams = GridTestUtils.initMap(hyperParamNames); ; for (Model.Parameters failedParams : grid.getFailedParameters()) { GridTestUtils.extractParams(failedHyperParams, failedParams, hyperParamNames); } hyperParms.put("_learn_rate", illegalLearnRateOpts); GridTestUtils.assertParamsEqual( "Failed model parameters have to correspond to specified hyper space", hyperParms, failedHyperParams); } finally { if (old != null) { old.remove(); } if (fr != null) { fr.remove(); } if (grid != null) { grid.remove(); } } }
public void remove() { remove(new Futures()); }