Пример #1
0
  @Test
  public void testCategoricalProstate() throws InterruptedException, ExecutionException {
    GLRM job = null;
    GLRMModel model = null;
    Frame train = null;
    final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS

    try {
      Scope.enter();
      train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
      for (int i = 0; i < cats.length; i++)
        Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
      train.remove("ID").remove();
      DKV.put(train._key, train);

      GLRMParameters parms = new GLRMParameters();
      parms._train = train._key;
      parms._k = 8;
      parms._gamma_x = parms._gamma_y = 0.1;
      parms._regularization_x = GLRMModel.GLRMParameters.Regularizer.Quadratic;
      parms._regularization_y = GLRMModel.GLRMParameters.Regularizer.Quadratic;
      parms._init = GLRM.Initialization.PlusPlus;
      parms._transform = DataInfo.TransformType.STANDARDIZE;
      parms._recover_svd = false;
      parms._max_iterations = 200;

      try {
        job = new GLRM(parms);
        model = job.trainModel().get();
        Log.info(
            "Iteration "
                + model._output._iterations
                + ": Objective value = "
                + model._output._objective);
        model.score(train).delete();
        ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
        Log.info(
            "Numeric Sum of Squared Error = "
                + mm._numerr
                + "\tCategorical Misclassification Error = "
                + mm._caterr);
      } catch (Throwable t) {
        t.printStackTrace();
        throw new RuntimeException(t);
      } finally {
        job.remove();
      }
    } catch (Throwable t) {
      t.printStackTrace();
      throw new RuntimeException(t);
    } finally {
      if (train != null) train.delete();
      if (model != null) model.delete();
      Scope.exit();
    }
  }
Пример #2
0
  @Test
  public void testGroupbyTableSpeed() {
    Frame ids = parse_test_file(Key.make("cov"), "smalldata/junit/id_cols.csv");
    ids.replace(0, ids.anyVec().toCategoricalVec()).remove();
    System.out.println(ids.toString(0, 10));

    long start = System.currentTimeMillis();
    Val v_gb = Exec.exec("(GB cov [0] nrow 0 \"all\")");
    System.out.println("GB Time= " + (System.currentTimeMillis() - start) + "msec");
    System.out.println(v_gb.toString());
    ((ValFrame) v_gb)._fr.delete();

    long start2 = System.currentTimeMillis();
    Val v_tb = Exec.exec("(table cov FALSE)");
    System.out.println("Table Time= " + (System.currentTimeMillis() - start2) + "msec");
    System.out.println(v_tb.toString());
    ((ValFrame) v_tb)._fr.delete();

    ids.delete();
  }
  /**
   * This method trains a stacked autoencoder
   *
   * @param trainData Training dataset as a JavaRDD
   * @param batchSize Size of a training mini-batch
   * @param layerSizes Number of neurons for each layer
   * @param epochs Number of epochs to train
   * @param responseColumn Name of the response column
   * @param modelName Name of the model
   * @return DeepLearningModel
   */
  public DeepLearningModel train(
      JavaRDD<LabeledPoint> trainData,
      int batchSize,
      int[] layerSizes,
      String activationType,
      int epochs,
      String responseColumn,
      String modelName,
      MLModel mlModel,
      long modelID) {
    // build stacked autoencoder by training the model with training data

    double trainingFraction = 1;
    try {
      Scope.enter();
      if (trainData != null) {

        int numberOfFeatures = mlModel.getFeatures().size();
        List<Feature> features = mlModel.getFeatures();
        String[] names = new String[numberOfFeatures + 1];
        for (int i = 0; i < numberOfFeatures; i++) {
          names[i] = features.get(i).getName();
        }
        names[numberOfFeatures] = mlModel.getResponseVariable();

        Frame frame = DeeplearningModelUtils.javaRDDToFrame(names, trainData);

        // H2O uses default C<x> for column header
        // String classifColName = "C" + frame.numCols();
        String classifColName = mlModel.getResponseVariable();

        // Convert response to categorical (digits 1 to <num of columns>)
        int ci = frame.find(classifColName);
        Scope.track(frame.replace(ci, frame.vecs()[ci].toEnum())._key);

        // Splitting train file to train, validation and test
        // Using FrameSplitter (instead of SuffleSplitFrame) gives a weird exception
        // barrier onExCompletion for hex.deeplearning.DeepLearning$DeepLearningDriver@78ec854
        double[] ratios = new double[] {trainingFraction, 1 - trainingFraction};
        @SuppressWarnings("unchecked")
        Frame[] splits =
            ShuffleSplitFrame.shuffleSplitFrame(
                frame, generateNumKeys(frame._key, ratios.length), ratios, 123456789);

        Frame trainFrame = splits[0];
        Frame vframe = splits[1];

        if (log.isDebugEnabled()) {
          log.debug("Creating Deeplearning parameters");
        }

        DeepLearningParameters deeplearningParameters = new DeepLearningParameters();

        // convert model name
        String dlModelName = modelName.replace('.', '_').replace('-', '_');

        // populate model parameters
        deeplearningParameters._model_id = Key.make(dlModelName + "_dl");
        deeplearningParameters._train = trainFrame._key;
        deeplearningParameters._valid = vframe._key;
        deeplearningParameters._response_column = classifColName; // last column is the response
        // This is causin all the predictions to be 0.0
        // p._autoencoder = true;
        deeplearningParameters._activation = getActivationType(activationType);
        deeplearningParameters._hidden = layerSizes;
        deeplearningParameters._train_samples_per_iteration = batchSize;
        deeplearningParameters._input_dropout_ratio = 0.2;
        deeplearningParameters._l1 = 1e-5;
        deeplearningParameters._max_w2 = 10;
        deeplearningParameters._epochs = epochs;

        // speed up training
        deeplearningParameters._adaptive_rate =
            true; // disable adaptive per-weight learning rate -> default
        // settings for learning rate and momentum are probably
        // not ideal (slow convergence)
        deeplearningParameters._replicate_training_data =
            true; // avoid extra communication cost upfront, got
        // enough data on each node for load balancing
        deeplearningParameters._overwrite_with_best_model =
            true; // no need to keep the best model around
        deeplearningParameters._diagnostics =
            false; // no need to compute statistics during training
        deeplearningParameters._classification_stop = -1;
        deeplearningParameters._score_interval =
            60; // score and print progress report (only) every 20 seconds
        deeplearningParameters._score_training_samples =
            batchSize / 10; // only score on a small sample of the
        // training set -> don't want to spend
        // too much time scoring (note: there
        // will be at least 1 row per chunk)

        DKV.put(trainFrame);
        DKV.put(vframe);

        deeplearning = new DeepLearning(deeplearningParameters);

        if (log.isDebugEnabled()) {
          log.debug("Start training deeplearning model ....");
        }

        try {
          dlModel = deeplearning.trainModel().get();
          if (log.isDebugEnabled()) {
            log.debug("Successfully finished Training deeplearning model.");
          }

        } catch (RuntimeException ex) {
          log.error("Error in training Stacked Autoencoder classifier model", ex);
        }
      } else {
        log.error("Train file not found!");
      }
    } catch (RuntimeException ex) {
      log.error("Failed to train the deeplearning model [id] " + modelID + ". " + ex.getMessage());
    } finally {
      Scope.exit();
    }

    return dlModel;
  }
Пример #4
0
 public Vec setWeights(String name, Vec vec) {
   if (_weights) return _adaptedFrame.replace(weightChunkId(), vec);
   _adaptedFrame.insertVec(weightChunkId(), name, vec);
   _weights = true;
   return null;
 }
Пример #5
0
  @Test
  public void testExpandCatsProstate() throws InterruptedException, ExecutionException {
    double[][] prostate =
        ard(
            ard(0, 71, 1, 0, 0, 4.8, 14.0, 7),
            ard(1, 70, 1, 1, 0, 8.4, 21.8, 5),
            ard(0, 73, 1, 3, 0, 10.0, 27.4, 6),
            ard(1, 68, 1, 0, 0, 6.7, 16.7, 6));
    double[][] pros_expandR =
        ard(
            ard(1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 71, 4.8, 14.0, 7),
            ard(0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 70, 8.4, 21.8, 5),
            ard(0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 73, 10.0, 27.4, 6),
            ard(1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 68, 6.7, 16.7, 6));
    String[] pros_cols =
        new String[] {"Capsule", "Age", "Race", "Dpros", "Dcaps", "PSA", "Vol", "Gleason"};
    String[][] pros_domains =
        new String[][] {
          new String[] {"No", "Yes"},
          null,
          new String[] {"Other", "White", "Black"},
          new String[] {"None", "UniLeft", "UniRight", "Bilobar"},
          new String[] {"No", "Yes"},
          null,
          null,
          null
        };
    final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS

    Frame fr = null;
    try {
      Scope.enter();
      fr = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
      for (int i = 0; i < cats.length; i++)
        Scope.track(fr.replace(cats[i], fr.vec(cats[i]).toCategoricalVec())._key);
      fr.remove("ID").remove();
      DKV.put(fr._key, fr);
      DataInfo dinfo =
          new DataInfo(
              Key.make(),
              fr,
              null,
              0,
              true,
              DataInfo.TransformType.NONE,
              DataInfo.TransformType.NONE,
              false,
              false,
              false, /* weights */
              false, /* offset */
              false, /* fold */
              false);

      Log.info("Original matrix:\n" + colFormat(pros_cols, "%8.7s") + ArrayUtils.pprint(prostate));
      double[][] pros_perm = ArrayUtils.permuteCols(prostate, dinfo._permutation);
      Log.info(
          "Permuted matrix:\n"
              + colFormat(pros_cols, "%8.7s", dinfo._permutation)
              + ArrayUtils.pprint(pros_perm));

      double[][] pros_exp = GLRM.expandCats(pros_perm, dinfo);
      Log.info(
          "Expanded matrix:\n"
              + colExpFormat(pros_cols, pros_domains, "%8.7s", dinfo._permutation)
              + ArrayUtils.pprint(pros_exp));
      Assert.assertArrayEquals(pros_expandR, pros_exp);
    } catch (Throwable t) {
      t.printStackTrace();
      throw new RuntimeException(t);
    } finally {
      if (fr != null) fr.delete();
      Scope.exit();
    }
  }
Пример #6
0
  @Test
  public void testSetColumnLossCats() throws InterruptedException, ExecutionException {
    GLRM job = null;
    GLRMModel model = null;
    Frame train = null;
    final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS

    Scope.enter();
    try {
      train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
      for (int i = 0; i < cats.length; i++)
        Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
      train.remove("ID").remove();
      DKV.put(train._key, train);

      GLRMParameters parms = new GLRMParameters();
      parms._train = train._key;
      parms._k = 12;
      parms._loss = GLRMParameters.Loss.Quadratic;
      parms._multi_loss = GLRMParameters.Loss.Categorical;
      parms._loss_by_col =
          new GLRMParameters.Loss[] {
            GLRMParameters.Loss.Ordinal, GLRMParameters.Loss.Poisson, GLRMParameters.Loss.Absolute
          };
      parms._loss_by_col_idx = new int[] {3 /* DPROS */, 1 /* AGE */, 6 /* VOL */};
      parms._init = GLRM.Initialization.PlusPlus;
      parms._min_step_size = 1e-5;
      parms._recover_svd = false;
      parms._max_iterations = 2000;

      try {
        job = new GLRM(parms);
        model = job.trainModel().get();
        Log.info(
            "Iteration "
                + model._output._iterations
                + ": Objective value = "
                + model._output._objective);
        GLRMTest.checkLossbyCol(parms, model);

        model.score(train).delete();
        ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
        Log.info(
            "Numeric Sum of Squared Error = "
                + mm._numerr
                + "\tCategorical Misclassification Error = "
                + mm._caterr);
      } catch (Throwable t) {
        t.printStackTrace();
        throw new RuntimeException(t);
      } finally {
        job.remove();
      }

    } catch (Throwable t) {
      t.printStackTrace();
      throw new RuntimeException(t);
    } finally {
      if (train != null) train.delete();
      if (model != null) model.delete();
      Scope.exit();
    }
  }
Пример #7
0
  @Test
  public void testLosses() throws InterruptedException, ExecutionException {
    long seed = 0xDECAF;
    Random rng = new Random(seed);
    Frame train = null;
    final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS
    final GLRMParameters.Regularizer[] regs =
        new GLRMParameters.Regularizer[] {
          GLRMParameters.Regularizer.Quadratic,
          GLRMParameters.Regularizer.L1,
          GLRMParameters.Regularizer.NonNegative,
          GLRMParameters.Regularizer.OneSparse,
          GLRMParameters.Regularizer.UnitOneSparse,
          GLRMParameters.Regularizer.Simplex
        };

    Scope.enter();
    try {
      train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
      for (int i = 0; i < cats.length; i++)
        Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
      train.remove("ID").remove();
      DKV.put(train._key, train);

      for (GLRMParameters.Loss loss :
          new GLRMParameters.Loss[] {
            GLRMParameters.Loss.Quadratic,
            GLRMParameters.Loss.Absolute,
            GLRMParameters.Loss.Huber,
            GLRMParameters.Loss.Poisson,
            GLRMParameters.Loss.Hinge,
            GLRMParameters.Loss.Logistic
          }) {
        for (GLRMParameters.Loss multiloss :
            new GLRMParameters.Loss[] {
              GLRMParameters.Loss.Categorical, GLRMParameters.Loss.Ordinal
            }) {
          GLRMModel model = null;
          try {
            Scope.enter();
            long myseed = rng.nextLong();
            Log.info("GLRM using seed = " + myseed);

            GLRMParameters parms = new GLRMParameters();
            parms._train = train._key;
            parms._transform = DataInfo.TransformType.NONE;
            parms._k = 5;
            parms._loss = loss;
            parms._multi_loss = multiloss;
            parms._init = GLRM.Initialization.SVD;
            parms._regularization_x = regs[rng.nextInt(regs.length)];
            parms._regularization_y = regs[rng.nextInt(regs.length)];
            parms._gamma_x = Math.abs(rng.nextDouble());
            parms._gamma_y = Math.abs(rng.nextDouble());
            parms._recover_svd = false;
            parms._seed = myseed;
            parms._verbose = false;
            parms._max_iterations = 500;

            GLRM job = new GLRM(parms);
            try {
              model = job.trainModel().get();
              Log.info(
                  "Iteration "
                      + model._output._iterations
                      + ": Objective value = "
                      + model._output._objective);
              model.score(train).delete();
              ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
              Log.info(
                  "Numeric Sum of Squared Error = "
                      + mm._numerr
                      + "\tCategorical Misclassification Error = "
                      + mm._caterr);
            } catch (Throwable t) {
              throw t;
            } finally {
              job.remove();
            }
          } catch (Throwable t) {
            t.printStackTrace();
            throw new RuntimeException(t);
          } finally {
            if (model != null) model.delete();
            Scope.exit();
          }
        }
      }
    } finally {
      if (train != null) train.delete();
      Scope.exit();
    }
  }
Пример #8
0
  @Test
  @Ignore
  public void run() {
    Scope.enter();
    try {
      File file = find_test_file("bigdata/laptop/mnist/train.csv.gz");
      File valid = find_test_file("bigdata/laptop/mnist/test.csv.gz");
      if (file != null) {
        NFSFileVec trainfv = NFSFileVec.make(file);
        Frame frame = ParseDataset.parse(Key.make(), trainfv._key);
        NFSFileVec validfv = NFSFileVec.make(valid);
        Frame vframe = ParseDataset.parse(Key.make(), validfv._key);
        DeepLearningParameters p = new DeepLearningParameters();

        // populate model parameters
        p._model_id = Key.make("dl_mnist_model");
        p._train = frame._key;
        //        p._valid = vframe._key;
        p._response_column = "C785"; // last column is the response
        p._activation = DeepLearningParameters.Activation.RectifierWithDropout;
        //        p._activation = DeepLearningParameters.Activation.MaxoutWithDropout;
        p._hidden = new int[] {800, 800};
        p._input_dropout_ratio = 0.2;
        p._mini_batch_size = 1;
        p._train_samples_per_iteration = 50000;
        p._score_duty_cycle = 0;
        //        p._shuffle_training_data = true;
        //        p._l1= 1e-5;
        //        p._max_w2= 10;
        p._epochs = 10 * 5. / 6;

        // Convert response 'C785' to categorical (digits 1 to 10)
        int ci = frame.find("C785");
        Scope.track(frame.replace(ci, frame.vecs()[ci].toEnum())._key);
        Scope.track(vframe.replace(ci, vframe.vecs()[ci].toEnum())._key);
        DKV.put(frame);
        DKV.put(vframe);

        // speed up training
        p._adaptive_rate =
            true; // disable adaptive per-weight learning rate -> default settings for learning rate
                  // and momentum are probably not ideal (slow convergence)
        p._replicate_training_data =
            true; // avoid extra communication cost upfront, got enough data on each node for load
                  // balancing
        p._overwrite_with_best_model = true; // no need to keep the best model around
        p._classification_stop = -1;
        p._score_interval = 60; // score and print progress report (only) every 20 seconds
        p._score_training_samples =
            10000; // only score on a small sample of the training set -> don't want to spend too
                   // much time scoring (note: there will be at least 1 row per chunk)

        DeepLearning dl = new DeepLearning(p);
        DeepLearningModel model = null;
        try {
          model = dl.trainModel().get();
        } catch (Throwable t) {
          t.printStackTrace();
          throw new RuntimeException(t);
        } finally {
          dl.remove();
          if (model != null) {
            model.delete();
          }
        }
      } else {
        Log.info("Please run ./gradlew syncBigDataLaptop in the top-level directory of h2o-3.");
      }
    } catch (Throwable t) {
      t.printStackTrace();
      throw new RuntimeException(t);
    } finally {
      Scope.exit();
    }
  }