Пример #1
0
 @Test
 public void testDomains() {
   Frame frame = parse_test_file("smalldata/junit/weather.csv");
   for (String s : new String[] {"MaxWindSpeed", "RelHumid9am", "Cloud9am"}) {
     Vec v = frame.vec(s);
     Vec newV = v.toCategoricalVec();
     frame.remove(s);
     frame.add(s, newV);
     v.remove();
   }
   DKV.put(frame);
   AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
   parms._train = frame._key;
   parms._radius_scale = 10;
   AggregatorModel agg = new Aggregator(parms).trainModel().get();
   Frame output = agg._output._output_frame.get();
   Assert.assertTrue(output.numRows() < 0.5 * frame.numRows());
   boolean same = true;
   for (int i = 0; i < frame.numCols(); ++i) {
     if (frame.vec(i).isCategorical()) {
       same = (frame.domains()[i].length == output.domains()[i].length);
       if (!same) break;
     }
   }
   frame.remove();
   output.remove();
   agg.remove();
   Assert.assertFalse(same);
 }
Пример #2
0
  @Test
  public void testAggregatorBinary() {
    CreateFrame cf = new CreateFrame();
    cf.rows = 1000;
    cf.cols = 10;
    cf.categorical_fraction = 0.6;
    cf.integer_fraction = 0.0;
    cf.binary_fraction = 0.0;
    cf.real_range = 100;
    cf.integer_range = 100;
    cf.missing_fraction = 0.1;
    cf.factors = 5;
    cf.seed = 1234;
    Frame frame = cf.execImpl().get();

    AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
    parms._train = frame._key;
    parms._radius_scale = 1.0;
    parms._transform = DataInfo.TransformType.NORMALIZE;
    parms._categorical_encoding = Model.Parameters.CategoricalEncodingScheme.Binary;
    long start = System.currentTimeMillis();
    AggregatorModel agg = new Aggregator(parms).trainModel().get(); // 0.905
    System.out.println(
        "AggregatorModel finished in: "
            + (System.currentTimeMillis() - start) / 1000.
            + " seconds");
    agg.checkConsistency();
    Frame output = agg._output._output_frame.get();
    System.out.println(output.toTwoDimTable(0, 10));
    Log.info("Number of exemplars: " + agg._exemplars.length);
    //    Assert.assertTrue(agg._exemplars.length==649);
    output.remove();
    frame.remove();
    agg.remove();
  }
Пример #3
0
  // @Ignore("PUBDEV-1643")
  @Test
  public void testDuplicatesCarsGrid() {
    Grid grid = null;
    Frame fr = null;
    Vec old = null;
    try {
      fr = parse_test_file("smalldata/junit/cars_20mpg.csv");
      fr.remove("name").remove(); // Remove unique id
      old = fr.remove("economy");
      fr.add("economy", old); // response to last column
      DKV.put(fr);

      // Setup random hyperparameter search space
      HashMap<String, Object[]> hyperParms =
          new HashMap<String, Object[]>() {
            {
              put("_ntrees", new Integer[] {5, 5});
              put("_max_depth", new Integer[] {2, 2});
              put("_mtries", new Integer[] {-1, -1});
              put("_sample_rate", new Double[] {.1, .1});
            }
          };

      // Fire off a grid search
      DRFModel.DRFParameters params = new DRFModel.DRFParameters();
      params._train = fr._key;
      params._response_column = "economy";

      // Get the Grid for this modeling class and frame
      Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms);
      grid = gs.get();

      // Check that duplicate model have not been constructed
      Model[] models = grid.getModels();
      assertTrue("Number of returned models has to be > 0", models.length > 0);
      // But all off them should be same
      Key<Model> modelKey = models[0]._key;
      for (Model m : models) {
        assertTrue("Number of constructed models has to be equal to 1", modelKey == m._key);
      }
    } finally {
      if (old != null) {
        old.remove();
      }
      if (fr != null) {
        fr.remove();
      }
      if (grid != null) {
        grid.remove();
      }
    }
  }
Пример #4
0
  @Test
  public void testChunks() {
    Frame frame = parse_test_file("smalldata/covtype/covtype.20k.data");

    AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
    parms._train = frame._key;
    parms._radius_scale = 3.0;
    long start = System.currentTimeMillis();
    AggregatorModel agg = new Aggregator(parms).trainModel().get(); // 0.418
    System.out.println(
        "AggregatorModel finished in: "
            + (System.currentTimeMillis() - start) / 1000.
            + " seconds");
    agg.checkConsistency();
    Frame output = agg._output._output_frame.get();
    Log.info("Number of exemplars: " + agg._exemplars.length);
    //    Assert.assertTrue(agg._exemplars.length==1993);
    output.remove();
    agg.remove();

    for (int i : new int[] {1, 2, 5, 10, 50, 100}) {
      Key key = Key.make();
      RebalanceDataSet rb = new RebalanceDataSet(frame, key, i);
      H2O.submitTask(rb);
      rb.join();
      Frame rebalanced = DKV.get(key).get();

      parms = new AggregatorModel.AggregatorParameters();
      parms._train = frame._key;
      parms._radius_scale = 3.0;
      start = System.currentTimeMillis();
      AggregatorModel agg2 =
          new Aggregator(parms).trainModel().get(); // 0.373 0.504 0.357 0.454 0.368 0.355
      System.out.println(
          "AggregatorModel finished in: "
              + (System.currentTimeMillis() - start) / 1000.
              + " seconds");
      agg2.checkConsistency();
      Log.info("Number of exemplars for " + i + " chunks: " + agg2._exemplars.length);
      rebalanced.delete();
      Assert.assertTrue(
          Math.abs(agg._exemplars.length - agg2._exemplars.length)
              == 0); // < agg._exemplars.length*0);
      output = agg2._output._output_frame.get();
      output.remove();
      agg2.remove();
    }
    frame.delete();
  }
Пример #5
0
  @Override
  Val apply(Env env, Env.StackHelp stk, AST asts[]) {
    Val v = stk.track(asts[1].exec(env));
    if (v instanceof ValRow) {
      ValRow vv = (ValRow) v;
      return vv.slice(asts[2].columns(vv._names));
    }
    Frame fr = v.getFrame();
    int[] cols = asts[2].columns(fr.names());

    Frame fr2 = new Frame();
    if (cols.length == 0) { // Empty inclusion list?
    } else if (cols[0] >= 0) { // Positive (inclusion) list
      if (cols[cols.length - 1] > fr.numCols())
        throw new IllegalArgumentException(
            "Column must be an integer from 0 to " + (fr.numCols() - 1));
      for (int col : cols) fr2.add(fr.names()[col], fr.vecs()[col]);
    } else { // Negative (exclusion) list
      fr2 = new Frame(fr); // All of them at first
      Arrays.sort(cols); // This loop depends on the values in sorted order
      for (int col : cols)
        if (0 <= -col - 1 && -col - 1 < fr.numCols()) fr2.remove(-col - 1); // Remove named column
    }

    return new ValFrame(fr2);
  }
Пример #6
0
 public void dropInteractions() { // only called to cleanup the InteractionWrappedVecs!
   if (_interactions != null) {
     Vec[] vecs = _adaptedFrame.remove(_interactionVecs);
     for (Vec v : vecs) v.remove();
     _interactions = null;
   }
 }
Пример #7
0
  // Adapt a trained model to a test dataset with different enums
  /*@Test*/ public void testModelAdapt() {
    File file1 = TestUtil.find_test_file("./smalldata/kaggle/KDDTrain.arff.gz");
    Key fkey1 = NFSFileVec.make(file1);
    Key dest1 = Key.make("KDDTrain.hex");
    File file2 = TestUtil.find_test_file("./smalldata/kaggle/KDDTest.arff.gz");
    Key fkey2 = NFSFileVec.make(file2);
    Key dest2 = Key.make("KDDTest.hex");
    GBM gbm = null;
    Frame fr = null;
    try {
      gbm = new GBM();
      gbm.source = ParseDataset2.parse(dest1, new Key[] {fkey1});
      UKV.remove(fkey1);
      gbm.response = gbm.source.remove(41); // Response is col 41
      gbm.ntrees = 2;
      gbm.max_depth = 8;
      gbm.learn_rate = 0.2f;
      gbm.min_rows = 10;
      gbm.nbins = 50;
      gbm.invoke();

      // The test data set has a few more enums than the train
      Frame ftest = ParseDataset2.parse(dest2, new Key[] {fkey2});
      Frame preds = gbm.score(ftest);

    } finally {
      UKV.remove(dest1); // Remove original hex frame key
      if (gbm != null) {
        UKV.remove(gbm.dest()); // Remove the model
        UKV.remove(gbm.response._key);
        gbm.remove(); // Remove GBM Job
        if (fr != null) fr.remove();
      }
    }
  }
Пример #8
0
  // ==========================================================================
  public void basicGBM(String fname, String hexname, PrepData prep) {
    File file = TestUtil.find_test_file(fname);
    if (file == null) return; // Silently abort test if the file is missing
    Key fkey = NFSFileVec.make(file);
    Key dest = Key.make(hexname);
    GBM gbm = null;
    Frame fr = null;
    try {
      gbm = new GBM();
      gbm.source = fr = ParseDataset2.parse(dest, new Key[] {fkey});
      UKV.remove(fkey);
      int idx = prep.prep(fr);
      if (idx < 0) {
        gbm.classification = false;
        idx = ~idx;
      }
      String rname = fr._names[idx];
      gbm.response = fr.vecs()[idx];
      fr.remove(idx); // Move response to the end
      fr.add(rname, gbm.response);
      gbm.ntrees = 4;
      gbm.max_depth = 4;
      gbm.min_rows = 1;
      gbm.nbins = 50;
      gbm.cols = new int[fr.numCols()];
      for (int i = 0; i < gbm.cols.length; i++) gbm.cols[i] = i;
      gbm.learn_rate = .2f;
      gbm.invoke();

      fr = gbm.score(gbm.source);

      GBM.GBMModel gbmmodel = UKV.get(gbm.dest());
      // System.out.println(gbmmodel.toJava());

    } finally {
      UKV.remove(dest); // Remove original hex frame key
      if (gbm != null) {
        UKV.remove(gbm.dest()); // Remove the model
        UKV.remove(gbm.response._key);
        gbm.remove(); // Remove GBM Job
        if (fr != null) fr.remove();
      }
    }
  }
Пример #9
0
  @Test
  public void testCategoricalProstate() throws InterruptedException, ExecutionException {
    GLRM job = null;
    GLRMModel model = null;
    Frame train = null;
    final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS

    try {
      Scope.enter();
      train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
      for (int i = 0; i < cats.length; i++)
        Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
      train.remove("ID").remove();
      DKV.put(train._key, train);

      GLRMParameters parms = new GLRMParameters();
      parms._train = train._key;
      parms._k = 8;
      parms._gamma_x = parms._gamma_y = 0.1;
      parms._regularization_x = GLRMModel.GLRMParameters.Regularizer.Quadratic;
      parms._regularization_y = GLRMModel.GLRMParameters.Regularizer.Quadratic;
      parms._init = GLRM.Initialization.PlusPlus;
      parms._transform = DataInfo.TransformType.STANDARDIZE;
      parms._recover_svd = false;
      parms._max_iterations = 200;

      try {
        job = new GLRM(parms);
        model = job.trainModel().get();
        Log.info(
            "Iteration "
                + model._output._iterations
                + ": Objective value = "
                + model._output._objective);
        model.score(train).delete();
        ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
        Log.info(
            "Numeric Sum of Squared Error = "
                + mm._numerr
                + "\tCategorical Misclassification Error = "
                + mm._caterr);
      } catch (Throwable t) {
        t.printStackTrace();
        throw new RuntimeException(t);
      } finally {
        job.remove();
      }
    } catch (Throwable t) {
      t.printStackTrace();
      throw new RuntimeException(t);
    } finally {
      if (train != null) train.delete();
      if (model != null) model.delete();
      Scope.exit();
    }
  }
Пример #10
0
  @Test
  public void testCollisionOfDRFParamsChecksum() {
    Frame fr = null;
    try {
      fr = parse_test_file("smalldata/junit/cars.csv");
      fr.remove("name").remove();
      Vec old = fr.remove("economy (mpg)");
      fr.add("economy (mpg)", old); // response to last column
      DKV.put(fr);

      // {"_model_id":null,"_train":{"name":"_83da9e0754c5eb9f6b812fe17e7945e5","type":"Key"},"_valid":null,"_nfolds":0,"_keep_cross_validation_predictions":false,"_fold_assignment":"AUTO","_distribution":"AUTO","_tweedie_power":1.5,"_ignored_columns":null,"_ignore_const_cols":true,"_weights_column":null,"_offset_column":null,"_fold_column":null,"_score_each_iteration":false,"_response_column":"economy (mpg)","_balance_classes":false,"_max_after_balance_size":5.0,"_class_sampling_factors":null,"_max_hit_ratio_k":10,"_max_confusion_matrix_size":20,"_checkpoint":null,"_ntrees":9,"_max_depth":15,"_min_rows":1.0,"_nbins":20,"_nbins_cats":1024,"_r2_stopping":0.999999,"_seed":-4522296119273841674,"_nbins_top_level":1024,"_build_tree_one_node":false,"_initial_score_interval":4000,"_score_interval":4000,"_mtries":3,"_sample_rate":0.6499997,"_binomial_double_trees":false}
      DRFModel.DRFParameters params1 = new DRFModel.DRFParameters();
      params1._train = fr._key;
      params1._response_column = "economy (mpg)";
      params1._seed = -4522296119273841674L;
      params1._mtries = 3;
      params1._max_depth = 15;
      params1._ntrees = 9;
      params1._sample_rate = 0.6499997f;

      // {"_model_id":null,"_train":{"name":"_83da9e0754c5eb9f6b812fe17e7945e5","type":"Key"},"_valid":null,"_nfolds":0,"_keep_cross_validation_predictions":false,"_fold_assignment":"AUTO","_distribution":"AUTO","_tweedie_power":1.5,"_ignored_columns":null,"_ignore_const_cols":true,"_weights_column":null,"_offset_column":null,"_fold_column":null,"_score_each_iteration":false,"_response_column":"economy (mpg)","_balance_classes":false,"_max_after_balance_size":5.0,"_class_sampling_factors":null,"_max_hit_ratio_k":10,"_max_confusion_matrix_size":20,"_checkpoint":null,"_ntrees":13,"_max_depth":1,"_min_rows":1.0,"_nbins":20,"_nbins_cats":1024,"_r2_stopping":0.999999,"_seed":-4522296119273841674,"_nbins_top_level":1024,"_build_tree_one_node":false,"_initial_score_interval":4000,"_score_interval":4000,"_mtries":1,"_sample_rate":0.6499997,"_binomial_double_trees":false}
      DRFModel.DRFParameters params2 = new DRFModel.DRFParameters();
      params2._train = fr._key;
      params2._response_column = "economy (mpg)";
      params2._seed = -4522296119273841674L;
      params2._mtries = 1;
      params2._max_depth = 1;
      params2._ntrees = 13;
      params2._sample_rate = 0.6499997f;
      long csum1 = params1.checksum();
      long csum2 = params2.checksum();
      Assert.assertNotEquals("Checksums shoudl be different", csum1, csum2);
    } finally {
      if (fr != null) {
        fr.remove();
      }
    }
  }
Пример #11
0
 private void checkTree(String tree, boolean expectThrow) {
   // Frame r = frame(new double[][]{{-1},{1},{2},{3},{4},{5},{6},{254}});
   // Key ahex = Key.make("a.hex");
   // Frame fr = new Frame(ahex, null, new Vec[]{r.remove(0)});
   // r.delete();
   // DKV.put(ahex, fr);
   Frame fr = parse_test_file(Key.make("a.hex"), "smalldata/iris/iris_wheader.csv");
   fr.remove(4).remove();
   try {
     Val val = Exec.exec(tree);
     Assert.assertFalse(expectThrow);
     System.out.println(val.toString());
     if (val instanceof ValFrame) {
       Frame fr2 = ((ValFrame) val)._fr;
       System.out.println(fr2.vec(0));
       fr2.remove();
     }
   } catch (IllegalArgumentException iae) {
     if (!expectThrow) throw iae;
   } finally {
     fr.delete();
   }
 }
Пример #12
0
    private void applyTrainingFrameSideEffects() {
      int numCols = _modelBuilderTrain.numCols();
      String responseVecName = _modelBuilderTrain.names()[numCols - 1];
      Vec responseVec = _modelBuilderTrain.remove(numCols - 1);

      final boolean use_weights_column = (_parms.weights_column != null);
      final boolean use_start_column = (_parms.start_column != null);

      if (use_weights_column) {
        Vec weightsVec = _parms.weights_column;
        int idxInRawFrame = _train.find(weightsVec);
        if (idxInRawFrame < 0) {
          throw new RuntimeException("CoxPHDriver failed to find weightVec");
        }

        String weightsVecName = _parms.train().names()[idxInRawFrame];
        _modelBuilderTrain.add(weightsVecName, weightsVec);
      }

      if (use_start_column) {
        Vec startVec = _parms.start_column;
        int idxInRawFrame = _train.find(startVec);
        if (idxInRawFrame < 0) {
          throw new RuntimeException("CoxPHDriver failed to find startVec");
        }

        String startVecName = _parms.train().names()[idxInRawFrame];
        _modelBuilderTrain.add(startVecName, startVec);
      }

      {
        Vec stopVec = _parms.stop_column;
        int idxInRawFrame = _train.find(stopVec);
        if (idxInRawFrame < 0) {
          throw new RuntimeException("CoxPHDriver failed to find stopVec");
        }

        String stopVecName = _parms.train().names()[idxInRawFrame];
        _modelBuilderTrain.add(stopVecName, stopVec);
      }

      _modelBuilderTrain.add(responseVecName, responseVec);
    }
Пример #13
0
  @Test
  public void testCovtype() {
    Frame frame = parse_test_file("smalldata/covtype/covtype.20k.data");

    AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
    parms._train = frame._key;
    parms._radius_scale = 5.0;
    long start = System.currentTimeMillis();
    AggregatorModel agg = new Aggregator(parms).trainModel().get(); // 0.179
    System.out.println(
        "AggregatorModel finished in: "
            + (System.currentTimeMillis() - start) / 1000.
            + " seconds");
    agg.checkConsistency();
    frame.delete();
    Frame output = agg._output._output_frame.get();
    Log.info("Exemplars: " + output.toString());
    output.remove();
    Log.info("Number of exemplars: " + agg._exemplars.length);
    //    Assert.assertTrue(agg._exemplars.length==615);
    agg.remove();
  }
Пример #14
0
  @Ignore
  @Test
  public void testMNIST() {
    Frame frame = parse_test_file("bigdata/laptop/mnist/train.csv.gz");

    AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
    parms._train = frame._key;
    parms._radius_scale = 100.0;
    long start = System.currentTimeMillis();
    AggregatorModel agg = new Aggregator(parms).trainModel().get();
    System.out.println(
        "AggregatorModel finished in: "
            + (System.currentTimeMillis() - start) / 1000.
            + " seconds");
    agg.checkConsistency();
    frame.delete();
    Frame output = agg._output._output_frame.get();
    //    Log.info("Exemplars: " + output);
    output.remove();
    Log.info("Number of exemplars: " + agg._exemplars.length);
    agg.remove();
  }
Пример #15
0
 /**
  * Compute the L2 norm for each row of the frame
  *
  * @param fr Input frame
  * @return Vec containing L2 values for each row, is in K-V store
  */
 public static Vec getL2(final Frame fr, final double[] scale) {
   // add workspace vec at end
   final int idx = fr.numCols();
   assert (scale.length == idx) : "Mismatch for number of columns";
   fr.add("L2", fr.anyVec().makeZero());
   Vec res;
   try {
     new MRTask2() {
       @Override
       public void map(Chunk[] cs) {
         for (int r = 0; r < cs[0]._len; r++) {
           double norm2 = 0;
           for (int i = 0; i < idx; i++) norm2 += Math.pow(cs[i].at0(r) * scale[i], 2);
           cs[idx].set0(r, Math.sqrt(norm2));
         }
       }
     }.doAll(fr);
   } finally {
     res = fr.remove(idx);
   }
   res.rollupStats();
   return res;
 }
Пример #16
0
    private void applyScoringFrameSideEffects() {
      final int offset_ncol = _parms.offset_columns == null ? 0 : _parms.offset_columns.length;
      if (offset_ncol == 0) {
        return;
      }

      int numCols = _modelBuilderTrain.numCols();
      String responseVecName = _modelBuilderTrain.names()[numCols - 1];
      Vec responseVec = _modelBuilderTrain.remove(numCols - 1);

      for (int i = 0; i < offset_ncol; i++) {
        Vec offsetVec = _parms.offset_columns[i];
        int idxInRawFrame = _train.find(offsetVec);
        if (idxInRawFrame < 0) {
          throw new RuntimeException("CoxPHDriver failed to find offsetVec");
        }

        String offsetVecName = _parms.train().names()[idxInRawFrame];
        _modelBuilderTrain.add(offsetVecName, offsetVec);
      }

      _modelBuilderTrain.add(responseVecName, responseVec);
    }
Пример #17
0
  @Ignore
  @Test
  public void testCovtypeMemberIndices() {
    Frame frame = parse_test_file("smalldata/covtype/covtype.20k.data");

    AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
    parms._train = frame._key;
    parms._radius_scale = 5.0;
    long start = System.currentTimeMillis();
    AggregatorModel agg = new Aggregator(parms).trainModel().get(); // 1.489
    System.out.println(
        "AggregatorModel finished in: "
            + (System.currentTimeMillis() - start) / 1000.
            + " seconds");
    agg.checkConsistency();

    //    Frame assignment = new Frame(new Vec[]{(Vec)agg._exemplar_assignment_vec_key.get()});
    //    Frame.export(assignment, "/tmp/assignment", "yada", true);
    //    Log.info("Exemplars: " + new Frame(new
    // Vec[]{(Vec)agg._exemplar_assignment_vec_key.get()}).toString(0,20000));
    Log.info("Number of exemplars: " + agg._exemplars.length);

    Key<Frame> memberKey = Key.make();
    for (int i = 0; i < agg._exemplars.length; ++i) {
      Frame members = agg.scoreExemplarMembers(memberKey, i);
      assert (members.numRows() == agg._counts[i]);
      //    Log.info(members);
      members.delete();
    }

    Frame output = agg._output._output_frame.get();
    output.remove();
    Log.info("Number of exemplars: " + agg._exemplars.length);
    //    Assert.assertTrue(agg._exemplars.length==615);
    frame.delete();
    agg.remove();
  }
Пример #18
0
 public Frame extractFrame(int startIdx, int endIdx) {
   Frame f = subframe(startIdx, endIdx);
   remove(startIdx, endIdx);
   return f;
 }
Пример #19
0
  // @Ignore("PUBDEV-1648")
  @Test
  public void testRandomCarsGrid() {
    Grid grid = null;
    DRFModel drfRebuilt = null;
    Frame fr = null;
    try {
      fr = parse_test_file("smalldata/junit/cars.csv");
      fr.remove("name").remove();
      Vec old = fr.remove("economy (mpg)");
      fr.add("economy (mpg)", old); // response to last column
      DKV.put(fr);

      // Setup random hyperparameter search space
      HashMap<String, Object[]> hyperParms = new HashMap<>();

      // Construct random grid search space
      long seed = System.nanoTime();
      Random rng = new Random(seed);

      // Limit to 1-3 randomly, 4 times.  Average total number of models is
      // 2^4, or 16.  Max is 81 models.
      Integer ntreesDim = rng.nextInt(3) + 1;
      Integer maxDepthDim = rng.nextInt(3) + 1;
      Integer mtriesDim = rng.nextInt(3) + 1;
      Integer sampleRateDim = rng.nextInt(3) + 1;

      Integer[] ntreesArr = interval(1, 15);
      ArrayList<Integer> ntreesList = new ArrayList<>(Arrays.asList(ntreesArr));
      Collections.shuffle(ntreesList);
      Integer[] ntreesSpace = new Integer[ntreesDim];
      for (int i = 0; i < ntreesDim; i++) {
        ntreesSpace[i] = ntreesList.get(i);
      }

      Integer[] maxDepthArr = interval(1, 10);
      ArrayList<Integer> maxDepthList = new ArrayList<>(Arrays.asList(maxDepthArr));
      Collections.shuffle(maxDepthList);
      Integer[] maxDepthSpace = new Integer[maxDepthDim];
      for (int i = 0; i < maxDepthDim; i++) {
        maxDepthSpace[i] = maxDepthList.get(i);
      }

      Integer[] mtriesArr = interval(1, 5);
      ArrayList<Integer> mtriesList = new ArrayList<>(Arrays.asList(mtriesArr));
      Collections.shuffle(mtriesList);
      Integer[] mtriesSpace = new Integer[mtriesDim];
      for (int i = 0; i < mtriesDim; i++) {
        mtriesSpace[i] = mtriesList.get(i);
      }

      Double[] sampleRateArr = interval(0.01, 0.99, 0.01);
      ArrayList<Double> sampleRateList = new ArrayList<>(Arrays.asList(sampleRateArr));
      Collections.shuffle(sampleRateList);
      Double[] sampleRateSpace = new Double[sampleRateDim];
      for (int i = 0; i < sampleRateDim; i++) {
        sampleRateSpace[i] = sampleRateList.get(i);
      }

      hyperParms.put("_ntrees", ntreesSpace);
      hyperParms.put("_max_depth", maxDepthSpace);
      hyperParms.put("_mtries", mtriesSpace);
      hyperParms.put("_sample_rate", sampleRateSpace);

      // Fire off a grid search
      DRFModel.DRFParameters params = new DRFModel.DRFParameters();
      params._train = fr._key;
      params._response_column = "economy (mpg)";
      // Get the Grid for this modeling class and frame
      Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms);
      grid = gs.get();

      System.out.println("Test seed: " + seed);
      System.out.println("ntrees search space: " + Arrays.toString(ntreesSpace));
      System.out.println("max_depth search space: " + Arrays.toString(maxDepthSpace));
      System.out.println("mtries search space: " + Arrays.toString(mtriesSpace));
      System.out.println("sample_rate search space: " + Arrays.toString(sampleRateSpace));

      // Check that cardinality of grid
      Model[] ms = grid.getModels();
      int numModels = ms.length;
      System.out.println("Grid consists of " + numModels + " models");
      assertEquals(
          "Number of models should match hyper space size",
          numModels,
          ntreesDim * maxDepthDim * sampleRateDim * mtriesDim + grid.getFailureCount());

      // Pick a random model from the grid
      HashMap<String, Object[]> randomHyperParms = new HashMap<>();

      Integer ntreeVal = ntreesSpace[rng.nextInt(ntreesSpace.length)];
      randomHyperParms.put("_ntrees", new Integer[] {ntreeVal});

      Integer maxDepthVal = maxDepthSpace[rng.nextInt(maxDepthSpace.length)];
      randomHyperParms.put("_max_depth", maxDepthSpace);

      Integer mtriesVal = mtriesSpace[rng.nextInt(mtriesSpace.length)];
      randomHyperParms.put("_max_depth", mtriesSpace);

      Double sampleRateVal = sampleRateSpace[rng.nextInt(sampleRateSpace.length)];
      randomHyperParms.put("_sample_rate", sampleRateSpace);

      // TODO: DRFModel drfFromGrid = (DRFModel) g2.model(randomHyperParms).get();

      // Rebuild it with it's parameters
      params._ntrees = ntreeVal;
      params._max_depth = maxDepthVal;
      params._mtries = mtriesVal;
      drfRebuilt = new DRF(params).trainModel().get();

      // Make sure the MSE metrics match
      // double fromGridMSE = drfFromGrid._output._scored_train[drfFromGrid._output._ntrees]._mse;
      double rebuiltMSE = drfRebuilt._output._scored_train[drfRebuilt._output._ntrees]._mse;
      // System.out.println("The random grid model's MSE: " + fromGridMSE);
      System.out.println("The rebuilt model's MSE: " + rebuiltMSE);
      // assertEquals(fromGridMSE, rebuiltMSE);

    } finally {
      if (fr != null) {
        fr.remove();
      }
      if (grid != null) {
        grid.remove();
      }
      if (drfRebuilt != null) {
        drfRebuilt.remove();
      }
    }
  }
Пример #20
0
 public DataInfo filterExpandedColumns(int[] cols) {
   assert _predictor_transform != null;
   assert _response_transform != null;
   if (cols == null) return deep_clone();
   int hasIcpt = (cols.length > 0 && cols[cols.length - 1] == fullN()) ? 1 : 0;
   int i = 0, j = 0, ignoredCnt = 0;
   // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub,
   // double [] normMul, double [] normRespSub, double [] normRespMul){
   int[][] catLvls = new int[_cats][];
   int[] ignoredCols = MemoryManager.malloc4(_nums + _cats);
   // first do categoricals...
   if (_catOffsets != null) {
     int coff = _useAllFactorLevels ? 0 : 1;
     while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) {
       int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]);
       int k = 0;
       while (i < cols.length && cols[i] < _catOffsets[j + 1])
         levels[k++] = (cols[i++] - _catOffsets[j]) + coff;
       if (k > 0) catLvls[j] = Arrays.copyOf(levels, k);
       ++j;
     }
   }
   int[] catModes = _catModes;
   for (int k = 0; k < catLvls.length; ++k) if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
   if (ignoredCnt > 0) {
     int[][] cs = new int[_cats - ignoredCnt][];
     catModes = new int[_cats - ignoredCnt];
     int y = 0;
     for (int c = 0; c < catLvls.length; ++c)
       if (catLvls[c] != null) {
         catModes[y] = _catModes[c];
         cs[y++] = catLvls[c];
       }
     assert y == cs.length;
     catLvls = cs;
   }
   // now numerics
   int prev = j = 0;
   for (; i < cols.length; ++i) {
     for (int k = prev; k < (cols[i] - numStart()); ++k) {
       ignoredCols[ignoredCnt++] = k + _cats;
       ++j;
     }
     prev = ++j;
   }
   for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats;
   Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone());
   if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt));
   assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();
   double[] normSub = null;
   double[] normMul = null;
   int id = Arrays.binarySearch(cols, numStart());
   if (id < 0) id = -id - 1;
   int nnums = cols.length - id - hasIcpt;
   int off = numStart();
   if (_normSub != null) {
     normSub = new double[nnums];
     for (int k = id; k < (id + nnums); ++k) normSub[k - id] = _normSub[cols[k] - off];
   }
   if (_normMul != null) {
     normMul = new double[nnums];
     for (int k = id; k < (id + nnums); ++k) normMul[k - id] = _normMul[cols[k] - off];
   }
   // public DataInfo(Frame train, Frame valid, int nResponses, boolean useAllFactorLevels,
   // TransformType predictor_transform, TransformType response_transform, boolean skipMissing,
   // boolean imputeMissing, boolean missingBucket, boolean weight, boolean offset, boolean fold) {
   DataInfo dinfo = new DataInfo(this, f, normMul, normSub, catLvls, catModes);
   dinfo._activeCols = cols;
   return dinfo;
 }
Пример #21
0
  @Test
  public void testLosses() throws InterruptedException, ExecutionException {
    long seed = 0xDECAF;
    Random rng = new Random(seed);
    Frame train = null;
    final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS
    final GLRMParameters.Regularizer[] regs =
        new GLRMParameters.Regularizer[] {
          GLRMParameters.Regularizer.Quadratic,
          GLRMParameters.Regularizer.L1,
          GLRMParameters.Regularizer.NonNegative,
          GLRMParameters.Regularizer.OneSparse,
          GLRMParameters.Regularizer.UnitOneSparse,
          GLRMParameters.Regularizer.Simplex
        };

    Scope.enter();
    try {
      train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
      for (int i = 0; i < cats.length; i++)
        Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
      train.remove("ID").remove();
      DKV.put(train._key, train);

      for (GLRMParameters.Loss loss :
          new GLRMParameters.Loss[] {
            GLRMParameters.Loss.Quadratic,
            GLRMParameters.Loss.Absolute,
            GLRMParameters.Loss.Huber,
            GLRMParameters.Loss.Poisson,
            GLRMParameters.Loss.Hinge,
            GLRMParameters.Loss.Logistic
          }) {
        for (GLRMParameters.Loss multiloss :
            new GLRMParameters.Loss[] {
              GLRMParameters.Loss.Categorical, GLRMParameters.Loss.Ordinal
            }) {
          GLRMModel model = null;
          try {
            Scope.enter();
            long myseed = rng.nextLong();
            Log.info("GLRM using seed = " + myseed);

            GLRMParameters parms = new GLRMParameters();
            parms._train = train._key;
            parms._transform = DataInfo.TransformType.NONE;
            parms._k = 5;
            parms._loss = loss;
            parms._multi_loss = multiloss;
            parms._init = GLRM.Initialization.SVD;
            parms._regularization_x = regs[rng.nextInt(regs.length)];
            parms._regularization_y = regs[rng.nextInt(regs.length)];
            parms._gamma_x = Math.abs(rng.nextDouble());
            parms._gamma_y = Math.abs(rng.nextDouble());
            parms._recover_svd = false;
            parms._seed = myseed;
            parms._verbose = false;
            parms._max_iterations = 500;

            GLRM job = new GLRM(parms);
            try {
              model = job.trainModel().get();
              Log.info(
                  "Iteration "
                      + model._output._iterations
                      + ": Objective value = "
                      + model._output._objective);
              model.score(train).delete();
              ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
              Log.info(
                  "Numeric Sum of Squared Error = "
                      + mm._numerr
                      + "\tCategorical Misclassification Error = "
                      + mm._caterr);
            } catch (Throwable t) {
              throw t;
            } finally {
              job.remove();
            }
          } catch (Throwable t) {
            t.printStackTrace();
            throw new RuntimeException(t);
          } finally {
            if (model != null) model.delete();
            Scope.exit();
          }
        }
      }
    } finally {
      if (train != null) train.delete();
      Scope.exit();
    }
  }
Пример #22
0
  @Test
  public void testChicago() {
    Frame weather = null, crimes = null, census = null;
    String oldtz = Exec.exec("(getTimeZone)").getStr();
    try {
      weather = parse_test_file(Key.make("weather.hex"), "smalldata/chicago/chicagoAllWeather.csv");
      crimes =
          parse_test_file(Key.make("crimes.hex"), "smalldata/chicago/chicagoCrimes10k.csv.zip");
      String fname = "smalldata/chicago/chicagoCensus.csv";
      File f = find_test_file(fname);
      assert f != null && f.exists() : " file not found: " + fname;
      NFSFileVec nfs = NFSFileVec.make(f);
      ParseSetup ps = ParseSetup.guessSetup(new Key[] {nfs._key}, false, 1);
      ps.getColumnTypes()[1] = Vec.T_ENUM;
      census = ParseDataset.parse(Key.make("census.hex"), new Key[] {nfs._key}, true, ps);

      census =
          exec_str(
              "(colnames= census.hex [0 1 2 3 4 5 6 7 8] [\"Community.Area.Number\" \"COMMUNITY.AREA.NAME\" \"PERCENT.OF.HOUSING.CROWDED\" \"PERCENT.HOUSEHOLDS.BELOW.POVERTY\" \"PERCENT.AGED.16..UNEMPLOYED\" \"PERCENT.AGED.25..WITHOUT.HIGH.SCHOOL.DIPLOMA\" \"PERCENT.AGED.UNDER.18.OR.OVER.64\" \"PER.CAPITA.INCOME.\" \"HARDSHIP.INDEX\"])",
              "census.hex");

      crimes =
          exec_str(
              "(colnames= crimes.hex [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21] [\"ID\" \"Case.Number\" \"Date\" \"Block\" \"IUCR\" \"Primary.Type\" \"Description\" \"Location.Description\" \"Arrest\" \"Domestic\" \"Beat\" \"District\" \"Ward\" \"Community.Area\" \"FBI.Code\" \"X.Coordinate\" \"Y.Coordinate\" \"Year\" \"Updated.On\" \"Latitude\" \"Longitude\" \"Location\"])",
              "crimes.hex");

      exec_str("(setTimeZone \"Etc/UTC\")", null);

      crimes =
          exec_str(
              "(colnames= (= crimes.hex (tmp= unary_op_6 (day (tmp= nary_op_5 (as.Date (cols crimes.hex [2]) \"%m/%d/%Y %I:%M:%S %p\")))) [22] [0:9999]) 22 \"Day\")",
              "crimes.hex");

      crimes =
          exec_str(
              "(colnames= (= crimes.hex (tmp= binary_op_31 (+ (tmp= unary_op_7 (month nary_op_5)) #1)) [23] [0:9999]) 23 \"Month\")",
              "crimes.hex");

      Keyed.remove(Key.make("nary_op_30"));

      crimes =
          exec_str(
              "(colnames= (= crimes.hex (tmp= binary_op_32 (+ (tmp= binary_op_9 (- (tmp= unary_op_8 (year nary_op_5)) #1900)) #1900)) [17] [0:9999]) 17 \"Year\")",
              "crimes.hex");

      crimes =
          exec_str(
              "(colnames= (= crimes.hex (tmp= unary_op_10 (week nary_op_5)) [24] [0:9999]) 24 \"WeekNum\")",
              "crimes.hex");

      Keyed.remove(Key.make("binary_op_32"));
      Keyed.remove(Key.make("binary_op_31"));
      Keyed.remove(Key.make("unary_op_8"));
      checkSaneFrame();

      crimes =
          exec_str(
              "(colnames= (= crimes.hex (tmp= unary_op_11 (dayOfWeek nary_op_5)) [25] [0:9999]) 25 \"WeekDay\")",
              "crimes.hex");
      Keyed.remove(
          Key.make(
              "nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-3\\smalldata\\chicago\\chicagoCrimes10k.csv.zip"));

      crimes =
          exec_str(
              "(colnames= (= crimes.hex (tmp= unary_op_12 (hour nary_op_5)) [26] [0:9999]) 26 \"HourOfDay\")",
              "crimes.hex");

      crimes =
          exec_str(
              "(colnames= (= crimes.hex (tmp= nary_op_16 (ifelse (tmp= binary_op_15 (| (tmp= binary_op_13 (== unary_op_11 \"Sun\")) (tmp= binary_op_14 (== unary_op_11 \"Sat\")))) 1 0)) [27] [0:9999]) 27 \"Weekend\")",
              "crimes.hex");

      // Season is incorrectly assigned in the original chicago demo; picks up the Weekend flag
      crimes =
          exec_str(
              "(colnames= (= crimes.hex nary_op_16 [28] [0:9999]) 28 \"Season\")", "crimes.hex");

      // Standard "head of 10 rows" pattern for printing
      Frame subset_33 = exec_str("(rows crimes.hex [0:10])", "subset_33");
      Keyed.remove(Key.make("subset_33"));

      Keyed.remove(Key.make("subset_33"));
      Keyed.remove(Key.make("unary_op_29"));
      Keyed.remove(Key.make("nary_op_28"));
      Keyed.remove(Key.make("nary_op_27"));
      Keyed.remove(Key.make("nary_op_26"));
      Keyed.remove(Key.make("binary_op_25"));
      Keyed.remove(Key.make("binary_op_24"));
      Keyed.remove(Key.make("binary_op_23"));
      Keyed.remove(Key.make("binary_op_22"));
      Keyed.remove(Key.make("binary_op_21"));
      Keyed.remove(Key.make("binary_op_20"));
      Keyed.remove(Key.make("binary_op_19"));
      Keyed.remove(Key.make("binary_op_18"));
      Keyed.remove(Key.make("binary_op_17"));
      Keyed.remove(Key.make("nary_op_16"));
      Keyed.remove(Key.make("binary_op_15"));
      Keyed.remove(Key.make("binary_op_14"));
      Keyed.remove(Key.make("binary_op_13"));
      Keyed.remove(Key.make("unary_op_12"));
      Keyed.remove(Key.make("unary_op_11"));
      Keyed.remove(Key.make("unary_op_10"));
      Keyed.remove(Key.make("binary_op_9"));
      Keyed.remove(Key.make("unary_op_8"));
      Keyed.remove(Key.make("unary_op_7"));
      Keyed.remove(Key.make("unary_op_6"));
      Keyed.remove(Key.make("nary_op_5"));
      checkSaneFrame();

      // Standard "head of 10 rows" pattern for printing
      Frame subset_34 = exec_str("(rows crimes.hex [0:10])", "subset_34");
      Keyed.remove(Key.make("subset_34"));

      census =
          exec_str(
              "(colnames= census.hex [0 1 2 3 4 5 6 7 8] [\"Community.Area\" \"COMMUNITY.AREA.NAME\" \"PERCENT.OF.HOUSING.CROWDED\" \"PERCENT.HOUSEHOLDS.BELOW.POVERTY\" \"PERCENT.AGED.16..UNEMPLOYED\" \"PERCENT.AGED.25..WITHOUT.HIGH.SCHOOL.DIPLOMA\" \"PERCENT.AGED.UNDER.18.OR.OVER.64\" \"PER.CAPITA.INCOME.\" \"HARDSHIP.INDEX\"])",
              "census.hex");
      Keyed.remove(Key.make("subset_34"));

      Frame subset_35 = exec_str("(cols  crimes.hex [-3])", "subset_35");
      Frame subset_36 = exec_str("(cols weather.hex [-1])", "subset_36");

      subset_36 =
          exec_str(
              "(colnames= subset_36 [0 1 2 3 4 5] [\"Month\" \"Day\" \"Year\" \"maxTemp\" \"meanTemp\" \"minTemp\"])",
              "subset_36");

      crimes.remove();
      weather.remove();

      // nary_op_37 = merge( X Y ); Vecs in X & nary_op_37 shared
      Frame nary_op_37 = exec_str("(merge subset_35 census.hex FALSE FALSE)", "nary_op_37");

      // nary_op_38 = merge( nary_op_37 subset_36); Vecs in nary_op_38 and nary_pop_37 and X shared
      Frame subset_41 =
          exec_str(
              "(rows (tmp= nary_op_38 (merge nary_op_37 subset_36 TRUE FALSE)) (tmp= binary_op_40 (<= (tmp= nary_op_39 (h2o.runif nary_op_38 30792152736.5179)) #0.8)))",
              "subset_41");

      // Standard "head of 10 rows" pattern for printing
      Frame subset_44 = exec_str("(rows subset_41 [0:10])", "subset_44");
      Keyed.remove(Key.make("subset_44"));
      Keyed.remove(Key.make("subset_44"));
      Keyed.remove(Key.make("binary_op_40"));
      Keyed.remove(Key.make("nary_op_37"));

      Frame subset_43 =
          exec_str("(rows nary_op_38 (tmp= binary_op_42 (> nary_op_39 #0.8)))", "subset_43");

      // Chicago demo continues on past, but this is all I've captured for now

      checkSaneFrame();

    } finally {
      Exec.exec(
          "(setTimeZone \""
              + oldtz
              + "\")"); // Restore time zone (which is global, and will affect following tests)
      if (weather != null) weather.remove();
      if (crimes != null) crimes.remove();
      if (census != null) census.remove();

      for (String s :
          new String[] {
            "nary_op_5",
            "unary_op_6",
            "unary_op_7",
            "unary_op_8",
            "binary_op_9",
            "unary_op_10",
            "unary_op_11",
            "unary_op_12",
            "binary_op_13",
            "binary_op_14",
            "binary_op_15",
            "nary_op_16",
            "binary_op_17",
            "binary_op_18",
            "binary_op_19",
            "binary_op_20",
            "binary_op_21",
            "binary_op_22",
            "binary_op_23",
            "binary_op_24",
            "binary_op_25",
            "nary_op_26",
            "nary_op_27",
            "nary_op_28",
            "unary_op_29",
            "binary_op_30",
            "binary_op_31",
            "binary_op_32",
            "subset_33",
            "subset_34",
            "subset_35",
            "subset_36",
            "nary_op_37",
            "nary_op_38",
            "nary_op_39",
            "binary_op_40",
            "subset_41",
            "binary_op_42",
            "subset_43",
            "subset_44",
          }) Keyed.remove(Key.make(s));
    }
  }
Пример #23
0
  @Test
  public void testSetColumnLossCats() throws InterruptedException, ExecutionException {
    GLRM job = null;
    GLRMModel model = null;
    Frame train = null;
    final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS

    Scope.enter();
    try {
      train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
      for (int i = 0; i < cats.length; i++)
        Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
      train.remove("ID").remove();
      DKV.put(train._key, train);

      GLRMParameters parms = new GLRMParameters();
      parms._train = train._key;
      parms._k = 12;
      parms._loss = GLRMParameters.Loss.Quadratic;
      parms._multi_loss = GLRMParameters.Loss.Categorical;
      parms._loss_by_col =
          new GLRMParameters.Loss[] {
            GLRMParameters.Loss.Ordinal, GLRMParameters.Loss.Poisson, GLRMParameters.Loss.Absolute
          };
      parms._loss_by_col_idx = new int[] {3 /* DPROS */, 1 /* AGE */, 6 /* VOL */};
      parms._init = GLRM.Initialization.PlusPlus;
      parms._min_step_size = 1e-5;
      parms._recover_svd = false;
      parms._max_iterations = 2000;

      try {
        job = new GLRM(parms);
        model = job.trainModel().get();
        Log.info(
            "Iteration "
                + model._output._iterations
                + ": Objective value = "
                + model._output._objective);
        GLRMTest.checkLossbyCol(parms, model);

        model.score(train).delete();
        ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
        Log.info(
            "Numeric Sum of Squared Error = "
                + mm._numerr
                + "\tCategorical Misclassification Error = "
                + mm._caterr);
      } catch (Throwable t) {
        t.printStackTrace();
        throw new RuntimeException(t);
      } finally {
        job.remove();
      }

    } catch (Throwable t) {
      t.printStackTrace();
      throw new RuntimeException(t);
    } finally {
      if (train != null) train.delete();
      if (model != null) model.delete();
      Scope.exit();
    }
  }
Пример #24
0
  @Test
  public void testExpandCatsProstate() throws InterruptedException, ExecutionException {
    double[][] prostate =
        ard(
            ard(0, 71, 1, 0, 0, 4.8, 14.0, 7),
            ard(1, 70, 1, 1, 0, 8.4, 21.8, 5),
            ard(0, 73, 1, 3, 0, 10.0, 27.4, 6),
            ard(1, 68, 1, 0, 0, 6.7, 16.7, 6));
    double[][] pros_expandR =
        ard(
            ard(1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 71, 4.8, 14.0, 7),
            ard(0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 70, 8.4, 21.8, 5),
            ard(0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 73, 10.0, 27.4, 6),
            ard(1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 68, 6.7, 16.7, 6));
    String[] pros_cols =
        new String[] {"Capsule", "Age", "Race", "Dpros", "Dcaps", "PSA", "Vol", "Gleason"};
    String[][] pros_domains =
        new String[][] {
          new String[] {"No", "Yes"},
          null,
          new String[] {"Other", "White", "Black"},
          new String[] {"None", "UniLeft", "UniRight", "Bilobar"},
          new String[] {"No", "Yes"},
          null,
          null,
          null
        };
    final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS

    Frame fr = null;
    try {
      Scope.enter();
      fr = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
      for (int i = 0; i < cats.length; i++)
        Scope.track(fr.replace(cats[i], fr.vec(cats[i]).toCategoricalVec())._key);
      fr.remove("ID").remove();
      DKV.put(fr._key, fr);
      DataInfo dinfo =
          new DataInfo(
              Key.make(),
              fr,
              null,
              0,
              true,
              DataInfo.TransformType.NONE,
              DataInfo.TransformType.NONE,
              false,
              false,
              false, /* weights */
              false, /* offset */
              false, /* fold */
              false);

      Log.info("Original matrix:\n" + colFormat(pros_cols, "%8.7s") + ArrayUtils.pprint(prostate));
      double[][] pros_perm = ArrayUtils.permuteCols(prostate, dinfo._permutation);
      Log.info(
          "Permuted matrix:\n"
              + colFormat(pros_cols, "%8.7s", dinfo._permutation)
              + ArrayUtils.pprint(pros_perm));

      double[][] pros_exp = GLRM.expandCats(pros_perm, dinfo);
      Log.info(
          "Expanded matrix:\n"
              + colExpFormat(pros_cols, pros_domains, "%8.7s", dinfo._permutation)
              + ArrayUtils.pprint(pros_exp));
      Assert.assertArrayEquals(pros_expandR, pros_exp);
    } catch (Throwable t) {
      t.printStackTrace();
      throw new RuntimeException(t);
    } finally {
      if (fr != null) fr.delete();
      Scope.exit();
    }
  }
Пример #25
0
  void testModelAdaptation(String train, String test, PrepData dprep, boolean exactAdaptation) {
    DRFModel model = null;
    Frame frTest = null;
    Frame frTrain = null;
    Key trainKey = Key.make("train.hex");
    Key testKey = Key.make("test.hex");
    Frame[] frAdapted = null;
    try {
      // Prepare a simple model
      frTrain = parseFrame(trainKey, train);
      model = runDRF(frTrain, dprep);
      // Load test dataset - test data contains input columns matching train data,
      // BUT each input requires adaptation. Moreover, test data contains additional columns
      // containing correct value mapping.
      frTest = parseFrame(testKey, test);
      Assert.assertEquals(
          "TEST CONF ERROR: The test dataset should contain 2*<number of input columns>+1!",
          2 * (frTrain.numCols() - 1) + 1,
          frTest.numCols());
      // Adapt test dataset
      frAdapted = model.adapt(frTest, exactAdaptation); // do/do not perform translation to enums
      Assert.assertEquals("Adapt method should return two frames", 2, frAdapted.length);
      Assert.assertEquals(
          "Test expects that all columns in  test dataset has to be adapted",
          dprep.needAdaptation(frTrain),
          frAdapted[1].numCols());

      // Compare vectors
      Frame adaptedFrame = frAdapted[0];
      // System.err.println(frTest.toStringAll());
      // System.err.println(adaptedFrame.toStringAll());

      for (int av = 0; av < frTrain.numCols() - 1; av++) {
        int ev = av + frTrain.numCols();
        Vec actV = adaptedFrame.vecs()[av];
        Vec expV = frTest.vecs()[ev];
        Assert.assertEquals(
            "Different number of rows in test vectors", expV.length(), actV.length());
        for (long r = 0; r < expV.length(); r++) {
          if (expV.isNA(r))
            Assert.assertTrue(
                "Badly adapted vector - expected NA! Col: " + av + ", row: " + r, actV.isNA(r));
          else {
            Assert.assertTrue(
                "Badly adapted vector - expected value but get NA! Col: " + av + ", row: " + r,
                !actV.isNA(r));
            Assert.assertEquals(
                "Badly adapted vector - wrong values! Col: " + av + ", row: " + r,
                expV.at8(r),
                actV.at8(r));
          }
        }
      }

    } finally {
      // Test cleanup
      if (model != null) UKV.remove(model._selfKey);
      if (frTrain != null) frTrain.remove();
      UKV.remove(trainKey);
      if (frTest != null) frTest.remove();
      UKV.remove(testKey);
      // Remove adapted vectors which were saved into KV-store, rest of vectors are remove by
      // frTest.remove()
      if (frAdapted != null) frAdapted[1].remove();
    }
  }
Пример #26
0
 public void dropWeights() {
   if (!_weights) return;
   _adaptedFrame.remove(weightChunkId());
   _weights = false;
 }
Пример #27
0
 public DataInfo filterExpandedColumns(int[] cols) {
   assert _predictor_transform != null;
   assert _response_transform != null;
   if (cols == null) return this;
   int i = 0, j = 0, ignoredCnt = 0;
   // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub,
   // double [] normMul, double [] normRespSub, double [] normRespMul){
   int[][] catLvls = new int[_cats][];
   int[] ignoredCols = MemoryManager.malloc4(_nums + _cats);
   // first do categoricals...
   if (_catOffsets != null) {
     int coff = _useAllFactorLevels ? 0 : 1;
     while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) {
       int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]);
       int k = 0;
       while (i < cols.length && cols[i] < _catOffsets[j + 1])
         levels[k++] = cols[i++] - _catOffsets[j] + coff;
       if (k > 0) catLvls[j] = Arrays.copyOf(levels, k);
       ++j;
     }
   }
   for (int k = 0; k < catLvls.length; ++k) if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
   if (ignoredCnt > 0) {
     int[][] c = new int[_cats - ignoredCnt][];
     int y = 0;
     for (int[] catLvl : catLvls) if (catLvl != null) c[y++] = catLvl;
     assert y == c.length;
     catLvls = c;
   }
   // now numerics
   int prev = j = 0;
   for (; i < cols.length; ++i) {
     for (int k = prev; k < (cols[i] - numStart()); ++k) {
       ignoredCols[ignoredCnt++] = k + _cats;
       ++j;
     }
     prev = ++j;
   }
   for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats;
   Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone());
   if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt));
   assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();
   double[] normSub = null;
   double[] normMul = null;
   int id = Arrays.binarySearch(cols, numStart());
   if (id < 0) id = -id - 1;
   int nnums = cols.length - id;
   int off = numStart();
   if (_normSub != null) {
     normSub = new double[nnums];
     for (int k = id; k < cols.length; ++k) normSub[k - id] = _normSub[cols[k] - off];
   }
   if (_normMul != null) {
     normMul = new double[nnums];
     for (int k = id; k < cols.length; ++k) normMul[k - id] = _normMul[cols[k] - off];
   }
   DataInfo dinfo =
       new DataInfo(
           _key,
           f,
           normMul,
           normSub,
           catLvls,
           _responses,
           _predictor_transform,
           _response_transform,
           _skipMissing,
           _imputeMissing,
           _weights,
           _offset,
           _fold);
   // do not put activeData into K/V - active data is recreated on each node based on active
   // columns
   dinfo._activeCols = cols;
   return dinfo;
 }
Пример #28
0
  // @Ignore("PUBDEV-1648")
  @Test
  public void testRandomCarsGrid() {
    Grid grid = null;
    GBMModel gbmRebuilt = null;
    Frame fr = null;
    Vec old = null;
    try {
      fr = parse_test_file("smalldata/junit/cars.csv");
      fr.remove("name").remove();
      old = fr.remove("economy (mpg)");

      fr.add("economy (mpg)", old); // response to last column
      DKV.put(fr);

      // Setup random hyperparameter search space
      HashMap<String, Object[]> hyperParms = new HashMap<>();
      hyperParms.put("_distribution", new DistributionFamily[] {DistributionFamily.gaussian});

      // Construct random grid search space
      Random rng = new Random();

      Integer ntreesDim = rng.nextInt(4) + 1;
      Integer maxDepthDim = rng.nextInt(4) + 1;
      Integer learnRateDim = rng.nextInt(4) + 1;

      Integer[] ntreesArr = interval(1, 25);
      ArrayList<Integer> ntreesList = new ArrayList<>(Arrays.asList(ntreesArr));
      Collections.shuffle(ntreesList);
      Integer[] ntreesSpace = new Integer[ntreesDim];
      for (int i = 0; i < ntreesDim; i++) {
        ntreesSpace[i] = ntreesList.get(i);
      }

      Integer[] maxDepthArr = interval(1, 10);
      ArrayList<Integer> maxDepthList = new ArrayList<>(Arrays.asList(maxDepthArr));
      Collections.shuffle(maxDepthList);
      Integer[] maxDepthSpace = new Integer[maxDepthDim];
      for (int i = 0; i < maxDepthDim; i++) {
        maxDepthSpace[i] = maxDepthList.get(i);
      }

      Double[] learnRateArr = interval(0.01, 1.0, 0.01);
      ArrayList<Double> learnRateList = new ArrayList<>(Arrays.asList(learnRateArr));
      Collections.shuffle(learnRateList);
      Double[] learnRateSpace = new Double[learnRateDim];
      for (int i = 0; i < learnRateDim; i++) {
        learnRateSpace[i] = learnRateList.get(i);
      }

      hyperParms.put("_ntrees", ntreesSpace);
      hyperParms.put("_max_depth", maxDepthSpace);
      hyperParms.put("_learn_rate", learnRateSpace);

      // Fire off a grid search
      GBMModel.GBMParameters params = new GBMModel.GBMParameters();
      params._train = fr._key;
      params._response_column = "economy (mpg)";
      // Get the Grid for this modeling class and frame
      Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms);
      grid = gs.get();

      System.out.println("ntrees search space: " + Arrays.toString(ntreesSpace));
      System.out.println("max_depth search space: " + Arrays.toString(maxDepthSpace));
      System.out.println("learn_rate search space: " + Arrays.toString(learnRateSpace));

      // Check that cardinality of grid
      Model[] ms = grid.getModels();
      Integer numModels = ms.length;
      System.out.println("Grid consists of " + numModels + " models");
      assertTrue(numModels == ntreesDim * maxDepthDim * learnRateDim);

      // Pick a random model from the grid
      HashMap<String, Object[]> randomHyperParms = new HashMap<>();
      randomHyperParms.put("_distribution", new DistributionFamily[] {DistributionFamily.gaussian});

      Integer ntreeVal = ntreesSpace[rng.nextInt(ntreesSpace.length)];
      randomHyperParms.put("_ntrees", new Integer[] {ntreeVal});

      Integer maxDepthVal = maxDepthSpace[rng.nextInt(maxDepthSpace.length)];
      randomHyperParms.put("_max_depth", maxDepthSpace);

      Double learnRateVal = learnRateSpace[rng.nextInt(learnRateSpace.length)];
      randomHyperParms.put("_learn_rate", learnRateSpace);

      // TODO: GBMModel gbmFromGrid = (GBMModel) g2.model(randomHyperParms).get();

      // Rebuild it with it's parameters
      params._distribution = DistributionFamily.gaussian;
      params._ntrees = ntreeVal;
      params._max_depth = maxDepthVal;
      params._learn_rate = learnRateVal;
      GBM gbm = new GBM(params);
      gbmRebuilt = gbm.trainModel().get();
      assertTrue(gbm.isStopped());

      // Make sure the MSE metrics match
      // double fromGridMSE = gbmFromGrid._output._scored_train[gbmFromGrid._output._ntrees]._mse;
      double rebuiltMSE = gbmRebuilt._output._scored_train[gbmRebuilt._output._ntrees]._mse;
      // System.out.println("The random grid model's MSE: " + fromGridMSE);
      System.out.println("The rebuilt model's MSE: " + rebuiltMSE);
      // assertEquals(fromGridMSE, rebuiltMSE);

    } finally {
      if (old != null) old.remove();
      if (fr != null) fr.remove();
      if (grid != null) grid.remove();
      if (gbmRebuilt != null) gbmRebuilt.remove();
    }
  }
Пример #29
0
  @Test
  public void testCarsGrid() {
    Grid<GBMModel.GBMParameters> grid = null;
    Frame fr = null;
    Vec old = null;
    try {
      fr = parse_test_file("smalldata/junit/cars.csv");
      fr.remove("name").remove(); // Remove unique id
      old = fr.remove("cylinders");
      fr.add("cylinders", old.toCategoricalVec()); // response to last column
      DKV.put(fr);

      // Setup hyperparameter search space
      final Double[] legalLearnRateOpts = new Double[] {0.01, 0.1, 0.3};
      final Double[] illegalLearnRateOpts = new Double[] {-1.0};
      HashMap<String, Object[]> hyperParms =
          new HashMap<String, Object[]>() {
            {
              put("_ntrees", new Integer[] {1, 2});
              put("_distribution", new DistributionFamily[] {DistributionFamily.multinomial});
              put("_max_depth", new Integer[] {1, 2, 5});
              put("_learn_rate", ArrayUtils.join(legalLearnRateOpts, illegalLearnRateOpts));
            }
          };

      // Name of used hyper parameters
      String[] hyperParamNames = hyperParms.keySet().toArray(new String[hyperParms.size()]);
      Arrays.sort(hyperParamNames);
      int hyperSpaceSize = ArrayUtils.crossProductSize(hyperParms);

      // Fire off a grid search
      GBMModel.GBMParameters params = new GBMModel.GBMParameters();
      params._train = fr._key;
      params._response_column = "cylinders";
      // Get the Grid for this modeling class and frame
      Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms);
      grid = (Grid<GBMModel.GBMParameters>) gs.get();
      // Make sure number of produced models match size of specified hyper space
      Assert.assertEquals(
          "Size of grid (models+failures) should match to size of hyper space",
          hyperSpaceSize,
          grid.getModelCount() + grid.getFailureCount());
      //
      // Make sure that names of used parameters match
      //
      String[] gridHyperNames = grid.getHyperNames();
      Arrays.sort(gridHyperNames);
      Assert.assertArrayEquals(
          "Hyper parameters names should match!", hyperParamNames, gridHyperNames);

      //
      // Make sure that values of used parameters match as well to the specified values
      //
      Key<Model>[] mKeys = grid.getModelKeys();
      Map<String, Set<Object>> usedHyperParams = GridTestUtils.initMap(hyperParamNames);
      for (Key<Model> mKey : mKeys) {
        GBMModel gbm = (GBMModel) mKey.get();
        System.out.println(
            gbm._output._scored_train[gbm._output._ntrees]._mse
                + " "
                + Arrays.deepToString(
                    ArrayUtils.zip(grid.getHyperNames(), grid.getHyperValues(gbm._parms))));
        GridTestUtils.extractParams(usedHyperParams, gbm._parms, hyperParamNames);
      }
      // Remove illegal options
      hyperParms.put("_learn_rate", legalLearnRateOpts);
      GridTestUtils.assertParamsEqual(
          "Grid models parameters have to cover specified hyper space",
          hyperParms,
          usedHyperParams);

      // Verify model failure
      Map<String, Set<Object>> failedHyperParams = GridTestUtils.initMap(hyperParamNames);
      ;
      for (Model.Parameters failedParams : grid.getFailedParameters()) {
        GridTestUtils.extractParams(failedHyperParams, failedParams, hyperParamNames);
      }
      hyperParms.put("_learn_rate", illegalLearnRateOpts);
      GridTestUtils.assertParamsEqual(
          "Failed model parameters have to correspond to specified hyper space",
          hyperParms,
          failedHyperParams);

    } finally {
      if (old != null) {
        old.remove();
      }
      if (fr != null) {
        fr.remove();
      }
      if (grid != null) {
        grid.remove();
      }
    }
  }
Пример #30
0
 public void remove() {
   remove(new Futures());
 }