/**
 * Fits GLRM (k = 8, quadratic regularization, standardized data) on the prostate dataset with
 * CAPSULE, RACE, DPROS and DCAPS treated as categoricals, then scores the training frame and
 * logs the numeric and categorical reconstruction errors.
 */
@Test public void testCategoricalProstate() throws InterruptedException, ExecutionException {
  GLRM job = null;
  GLRMModel model = null;
  Frame train = null;
  final int[] cats = new int[] {1, 3, 4, 5};  // Categoricals: CAPSULE, RACE, DPROS, DCAPS

  try {
    Scope.enter();
    train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
    for (int i = 0; i < cats.length; i++)
      Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
    train.remove("ID").remove();
    DKV.put(train._key, train);

    GLRMParameters parms = new GLRMParameters();
    parms._train = train._key;
    parms._k = 8;
    parms._gamma_x = parms._gamma_y = 0.1;
    parms._regularization_x = GLRMModel.GLRMParameters.Regularizer.Quadratic;
    parms._regularization_y = GLRMModel.GLRMParameters.Regularizer.Quadratic;
    parms._init = GLRM.Initialization.PlusPlus;
    parms._transform = DataInfo.TransformType.STANDARDIZE;
    parms._recover_svd = false;
    parms._max_iterations = 200;

    try {
      job = new GLRM(parms);
      model = job.trainModel().get();
      Log.info("Iteration " + model._output._iterations + ": Objective value = " + model._output._objective);

      model.score(train).delete();
      ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
      Log.info("Numeric Sum of Squared Error = " + mm._numerr
               + "\tCategorical Misclassification Error = " + mm._caterr);
    } catch (Throwable t) {
      t.printStackTrace();
      throw new RuntimeException(t);
    } finally {
      job.remove();
    }
  } catch (Throwable t) {
    t.printStackTrace();
    throw new RuntimeException(t);
  } finally {
    if (train != null) train.delete();
    if (model != null) model.delete();
    Scope.exit();
  }
}
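/**
 * Rough timing comparison between the Rapids group-by expression (GB ... nrow) and the table
 * expression, both run over the same categorical id column of the id_cols dataset.
 */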
@Test public void testGroupbyTableSpeed() {
  Frame ids = parse_test_file(Key.make("cov"), "smalldata/junit/id_cols.csv");
  ids.replace(0, ids.anyVec().toCategoricalVec()).remove();
  System.out.println(ids.toString(0, 10));

  long start = System.currentTimeMillis();
  Val v_gb = Exec.exec("(GB cov [0] nrow 0 \"all\")");
  System.out.println("GB Time= " + (System.currentTimeMillis() - start) + "msec");
  System.out.println(v_gb.toString());
  ((ValFrame) v_gb)._fr.delete();

  long start2 = System.currentTimeMillis();
  Val v_tb = Exec.exec("(table cov FALSE)");
  System.out.println("Table Time= " + (System.currentTimeMillis() - start2) + "msec");
  System.out.println(v_tb.toString());
  ((ValFrame) v_tb)._fr.delete();

  ids.delete();
}
/**
 * This method trains a stacked autoencoder.
 *
 * @param trainData      Training dataset as a JavaRDD
 * @param batchSize      Size of a training mini-batch
 * @param layerSizes     Number of neurons for each layer
 * @param activationType Name of the activation function to use
 * @param epochs         Number of epochs to train
 * @param responseColumn Name of the response column
 * @param modelName      Name of the model
 * @param mlModel        MLModel carrying the feature and response variable metadata
 * @param modelID        Identifier of the model being built
 * @return DeepLearningModel
 */
public DeepLearningModel train(JavaRDD<LabeledPoint> trainData, int batchSize, int[] layerSizes,
                               String activationType, int epochs, String responseColumn,
                               String modelName, MLModel mlModel, long modelID) {
  // build stacked autoencoder by training the model with training data
  double trainingFraction = 1;
  try {
    Scope.enter();
    if (trainData != null) {
      int numberOfFeatures = mlModel.getFeatures().size();
      List<Feature> features = mlModel.getFeatures();
      String[] names = new String[numberOfFeatures + 1];
      for (int i = 0; i < numberOfFeatures; i++) {
        names[i] = features.get(i).getName();
      }
      names[numberOfFeatures] = mlModel.getResponseVariable();

      Frame frame = DeeplearningModelUtils.javaRDDToFrame(names, trainData);
      // H2O uses default C<x> for column header
      // String classifColName = "C" + frame.numCols();
      String classifColName = mlModel.getResponseVariable();

      // Convert the response column to categorical
      int ci = frame.find(classifColName);
      Scope.track(frame.replace(ci, frame.vecs()[ci].toEnum())._key);

      // Splitting train file into train, validation and test
      // Using FrameSplitter (instead of ShuffleSplitFrame) gives a weird exception:
      // barrier onExCompletion for hex.deeplearning.DeepLearning$DeepLearningDriver@78ec854
      double[] ratios = new double[] {trainingFraction, 1 - trainingFraction};
      @SuppressWarnings("unchecked")
      Frame[] splits = ShuffleSplitFrame.shuffleSplitFrame(
          frame, generateNumKeys(frame._key, ratios.length), ratios, 123456789);
      Frame trainFrame = splits[0];
      Frame vframe = splits[1];

      if (log.isDebugEnabled()) {
        log.debug("Creating Deeplearning parameters");
      }
      DeepLearningParameters deeplearningParameters = new DeepLearningParameters();

      // convert model name
      String dlModelName = modelName.replace('.', '_').replace('-', '_');

      // populate model parameters
      deeplearningParameters._model_id = Key.make(dlModelName + "_dl");
      deeplearningParameters._train = trainFrame._key;
      deeplearningParameters._valid = vframe._key;
      deeplearningParameters._response_column = classifColName;  // last column is the response
      // This is causing all the predictions to be 0.0
      // p._autoencoder = true;
      deeplearningParameters._activation = getActivationType(activationType);
      deeplearningParameters._hidden = layerSizes;
      deeplearningParameters._train_samples_per_iteration = batchSize;
      deeplearningParameters._input_dropout_ratio = 0.2;
      deeplearningParameters._l1 = 1e-5;
      deeplearningParameters._max_w2 = 10;
      deeplearningParameters._epochs = epochs;

      // speed up training
      deeplearningParameters._adaptive_rate = true;              // adaptive per-weight learning rate
      // avoid extra communication cost upfront, got enough data on each node for load balancing
      deeplearningParameters._replicate_training_data = true;
      deeplearningParameters._overwrite_with_best_model = true;  // no need to keep the best model around
      deeplearningParameters._diagnostics = false;               // no need to compute statistics during training
      deeplearningParameters._classification_stop = -1;
      deeplearningParameters._score_interval = 60;               // score and print progress report (only) every 60 seconds
      // only score on a small sample of the training set -> don't want to spend
      // too much time scoring (note: there will be at least 1 row per chunk)
      deeplearningParameters._score_training_samples = batchSize / 10;

      DKV.put(trainFrame);
      DKV.put(vframe);

      deeplearning = new DeepLearning(deeplearningParameters);
      if (log.isDebugEnabled()) {
        log.debug("Start training deeplearning model ....");
      }
      try {
        dlModel = deeplearning.trainModel().get();
        if (log.isDebugEnabled()) {
          log.debug("Successfully finished Training deeplearning model.");
        }
      } catch (RuntimeException ex) {
        log.error("Error in training Stacked Autoencoder classifier model", ex);
      }
    } else {
      log.error("Train file not found!");
    }
  } catch (RuntimeException ex) {
    log.error("Failed to train the deeplearning model [id] " + modelID + ". " + ex.getMessage());
  } finally {
    Scope.exit();
  }
  return dlModel;
}
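/**
 * Sets the observation-weights column on the adapted frame: if a weights column is already
 * present it is swapped for {@code vec} and the result of the replace is returned; otherwise
 * {@code vec} is inserted under {@code name}, the weights flag is set, and null is returned.
 */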
public Vec setWeights(String name, Vec vec) {
  if (_weights)
    return _adaptedFrame.replace(weightChunkId(), vec);
  _adaptedFrame.insertVec(weightChunkId(), name, vec);
  _weights = true;
  return null;
}
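/**
 * Checks GLRM.expandCats on a small prostate sample: after permuting columns the way DataInfo
 * does, each categorical column is expanded into a block of 0/1 indicator columns and the result
 * is compared against the precomputed reference matrix pros_expandR.
 */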
@Test public void testExpandCatsProstate() throws InterruptedException, ExecutionException {
  double[][] prostate = ard(ard(0, 71, 1, 0, 0, 4.8, 14.0, 7),
                            ard(1, 70, 1, 1, 0, 8.4, 21.8, 5),
                            ard(0, 73, 1, 3, 0, 10.0, 27.4, 6),
                            ard(1, 68, 1, 0, 0, 6.7, 16.7, 6));
  double[][] pros_expandR = ard(ard(1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 71, 4.8, 14.0, 7),
                                ard(0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 70, 8.4, 21.8, 5),
                                ard(0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 73, 10.0, 27.4, 6),
                                ard(1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 68, 6.7, 16.7, 6));
  String[] pros_cols = new String[] {"Capsule", "Age", "Race", "Dpros", "Dcaps", "PSA", "Vol", "Gleason"};
  String[][] pros_domains = new String[][] {
      new String[] {"No", "Yes"},
      null,
      new String[] {"Other", "White", "Black"},
      new String[] {"None", "UniLeft", "UniRight", "Bilobar"},
      new String[] {"No", "Yes"},
      null, null, null
  };
  final int[] cats = new int[] {1, 3, 4, 5};  // Categoricals: CAPSULE, RACE, DPROS, DCAPS

  Frame fr = null;
  try {
    Scope.enter();
    fr = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
    for (int i = 0; i < cats.length; i++)
      Scope.track(fr.replace(cats[i], fr.vec(cats[i]).toCategoricalVec())._key);
    fr.remove("ID").remove();
    DKV.put(fr._key, fr);

    DataInfo dinfo = new DataInfo(Key.make(), fr, null, 0, true,
                                  DataInfo.TransformType.NONE, DataInfo.TransformType.NONE,
                                  false, false, false,
                                  /* weights */ false, /* offset */ false, /* fold */ false);

    Log.info("Original matrix:\n" + colFormat(pros_cols, "%8.7s") + ArrayUtils.pprint(prostate));
    double[][] pros_perm = ArrayUtils.permuteCols(prostate, dinfo._permutation);
    Log.info("Permuted matrix:\n" + colFormat(pros_cols, "%8.7s", dinfo._permutation)
             + ArrayUtils.pprint(pros_perm));

    double[][] pros_exp = GLRM.expandCats(pros_perm, dinfo);
    Log.info("Expanded matrix:\n" + colExpFormat(pros_cols, pros_domains, "%8.7s", dinfo._permutation)
             + ArrayUtils.pprint(pros_exp));
    Assert.assertArrayEquals(pros_expandR, pros_exp);
  } catch (Throwable t) {
    t.printStackTrace();
    throw new RuntimeException(t);
  } finally {
    if (fr != null) fr.delete();
    Scope.exit();
  }
}
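/**
 * Fits GLRM on the prostate data with per-column loss overrides (Ordinal for DPROS, Poisson for
 * AGE, Absolute for VOL) and verifies the resulting loss assignment with GLRMTest.checkLossbyCol
 * before scoring the training frame.
 */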
@Test public void testSetColumnLossCats() throws InterruptedException, ExecutionException {
  GLRM job = null;
  GLRMModel model = null;
  Frame train = null;
  final int[] cats = new int[] {1, 3, 4, 5};  // Categoricals: CAPSULE, RACE, DPROS, DCAPS

  Scope.enter();
  try {
    train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
    for (int i = 0; i < cats.length; i++)
      Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
    train.remove("ID").remove();
    DKV.put(train._key, train);

    GLRMParameters parms = new GLRMParameters();
    parms._train = train._key;
    parms._k = 12;
    parms._loss = GLRMParameters.Loss.Quadratic;
    parms._multi_loss = GLRMParameters.Loss.Categorical;
    parms._loss_by_col = new GLRMParameters.Loss[] {
        GLRMParameters.Loss.Ordinal, GLRMParameters.Loss.Poisson, GLRMParameters.Loss.Absolute};
    parms._loss_by_col_idx = new int[] {3 /* DPROS */, 1 /* AGE */, 6 /* VOL */};
    parms._init = GLRM.Initialization.PlusPlus;
    parms._min_step_size = 1e-5;
    parms._recover_svd = false;
    parms._max_iterations = 2000;

    try {
      job = new GLRM(parms);
      model = job.trainModel().get();
      Log.info("Iteration " + model._output._iterations + ": Objective value = " + model._output._objective);
      GLRMTest.checkLossbyCol(parms, model);

      model.score(train).delete();
      ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
      Log.info("Numeric Sum of Squared Error = " + mm._numerr
               + "\tCategorical Misclassification Error = " + mm._caterr);
    } catch (Throwable t) {
      t.printStackTrace();
      throw new RuntimeException(t);
    } finally {
      job.remove();
    }
  } catch (Throwable t) {
    t.printStackTrace();
    throw new RuntimeException(t);
  } finally {
    if (train != null) train.delete();
    if (model != null) model.delete();
    Scope.exit();
  }
}
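/**
 * Sweeps every combination of numeric loss (Quadratic, Absolute, Huber, Poisson, Hinge,
 * Logistic) and multi-loss (Categorical, Ordinal), each with randomly drawn regularizers and
 * gamma values, training a small GLRM on the prostate data and logging the reconstruction errors.
 */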
@Test public void testLosses() throws InterruptedException, ExecutionException {
  long seed = 0xDECAF;
  Random rng = new Random(seed);
  Frame train = null;
  final int[] cats = new int[] {1, 3, 4, 5};  // Categoricals: CAPSULE, RACE, DPROS, DCAPS
  final GLRMParameters.Regularizer[] regs = new GLRMParameters.Regularizer[] {
      GLRMParameters.Regularizer.Quadratic,
      GLRMParameters.Regularizer.L1,
      GLRMParameters.Regularizer.NonNegative,
      GLRMParameters.Regularizer.OneSparse,
      GLRMParameters.Regularizer.UnitOneSparse,
      GLRMParameters.Regularizer.Simplex
  };

  Scope.enter();
  try {
    train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
    for (int i = 0; i < cats.length; i++)
      Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
    train.remove("ID").remove();
    DKV.put(train._key, train);

    for (GLRMParameters.Loss loss : new GLRMParameters.Loss[] {
        GLRMParameters.Loss.Quadratic, GLRMParameters.Loss.Absolute, GLRMParameters.Loss.Huber,
        GLRMParameters.Loss.Poisson, GLRMParameters.Loss.Hinge, GLRMParameters.Loss.Logistic}) {
      for (GLRMParameters.Loss multiloss : new GLRMParameters.Loss[] {
          GLRMParameters.Loss.Categorical, GLRMParameters.Loss.Ordinal}) {
        GLRMModel model = null;
        try {
          Scope.enter();
          long myseed = rng.nextLong();
          Log.info("GLRM using seed = " + myseed);

          GLRMParameters parms = new GLRMParameters();
          parms._train = train._key;
          parms._transform = DataInfo.TransformType.NONE;
          parms._k = 5;
          parms._loss = loss;
          parms._multi_loss = multiloss;
          parms._init = GLRM.Initialization.SVD;
          parms._regularization_x = regs[rng.nextInt(regs.length)];
          parms._regularization_y = regs[rng.nextInt(regs.length)];
          parms._gamma_x = Math.abs(rng.nextDouble());
          parms._gamma_y = Math.abs(rng.nextDouble());
          parms._recover_svd = false;
          parms._seed = myseed;
          parms._verbose = false;
          parms._max_iterations = 500;

          GLRM job = new GLRM(parms);
          try {
            model = job.trainModel().get();
            Log.info("Iteration " + model._output._iterations + ": Objective value = " + model._output._objective);

            model.score(train).delete();
            ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
            Log.info("Numeric Sum of Squared Error = " + mm._numerr
                     + "\tCategorical Misclassification Error = " + mm._caterr);
          } catch (Throwable t) {
            throw t;
          } finally {
            job.remove();
          }
        } catch (Throwable t) {
          t.printStackTrace();
          throw new RuntimeException(t);
        } finally {
          if (model != null) model.delete();
          Scope.exit();
        }
      }
    }
  } finally {
    if (train != null) train.delete();
    Scope.exit();
  }
}
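/**
 * Trains an 800x800 RectifierWithDropout deep learning classifier on the MNIST digits.
 * Ignored by default; the data must first be synced with ./gradlew syncBigDataLaptop
 * (see the message logged when the bigdata files are missing).
 */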
@Test @Ignore public void run() {
  Scope.enter();
  try {
    File file = find_test_file("bigdata/laptop/mnist/train.csv.gz");
    File valid = find_test_file("bigdata/laptop/mnist/test.csv.gz");
    if (file != null) {
      NFSFileVec trainfv = NFSFileVec.make(file);
      Frame frame = ParseDataset.parse(Key.make(), trainfv._key);
      NFSFileVec validfv = NFSFileVec.make(valid);
      Frame vframe = ParseDataset.parse(Key.make(), validfv._key);
      DeepLearningParameters p = new DeepLearningParameters();

      // populate model parameters
      p._model_id = Key.make("dl_mnist_model");
      p._train = frame._key;
      // p._valid = vframe._key;
      p._response_column = "C785";  // last column is the response
      p._activation = DeepLearningParameters.Activation.RectifierWithDropout;
      // p._activation = DeepLearningParameters.Activation.MaxoutWithDropout;
      p._hidden = new int[] {800, 800};
      p._input_dropout_ratio = 0.2;
      p._mini_batch_size = 1;
      p._train_samples_per_iteration = 50000;
      p._score_duty_cycle = 0;
      // p._shuffle_training_data = true;
      // p._l1 = 1e-5;
      // p._max_w2 = 10;
      p._epochs = 10 * 5. / 6;

      // Convert response 'C785' to categorical (digits 0 to 9)
      int ci = frame.find("C785");
      Scope.track(frame.replace(ci, frame.vecs()[ci].toEnum())._key);
      Scope.track(vframe.replace(ci, vframe.vecs()[ci].toEnum())._key);
      DKV.put(frame);
      DKV.put(vframe);

      // speed up training
      p._adaptive_rate = true;              // adaptive per-weight learning rate
      // avoid extra communication cost upfront, got enough data on each node for load balancing
      p._replicate_training_data = true;
      p._overwrite_with_best_model = true;  // no need to keep the best model around
      p._classification_stop = -1;
      p._score_interval = 60;               // score and print progress report (only) every 60 seconds
      // only score on a small sample of the training set -> don't want to spend
      // too much time scoring (note: there will be at least 1 row per chunk)
      p._score_training_samples = 10000;

      DeepLearning dl = new DeepLearning(p);
      DeepLearningModel model = null;
      try {
        model = dl.trainModel().get();
      } catch (Throwable t) {
        t.printStackTrace();
        throw new RuntimeException(t);
      } finally {
        dl.remove();
        if (model != null) {
          model.delete();
        }
      }
    } else {
      Log.info("Please run ./gradlew syncBigDataLaptop in the top-level directory of h2o-3.");
    }
  } catch (Throwable t) {
    t.printStackTrace();
    throw new RuntimeException(t);
  } finally {
    Scope.exit();
  }
}