Example #1
  @SuppressWarnings("unused") // called through reflection by RequestServer
  public RemoveAllV3 remove(int version, RemoveAllV3 u) {
    Log.info("Removing all objects");
    Futures fs = new Futures();
    for (Job j : Job.jobs()) {
      j.cancel();
      j.remove(fs);
    }
    fs.blockForPending();
    // Bulk brainless key removal.  Completely wipes all Keys without regard.
    new MRTask() {
      @Override
      public byte priority() {
        return H2O.GUI_PRIORITY;
      }

      @Override
      public void setupLocal() {
        H2O.raw_clear();
        water.fvec.Vec.ESPC.clear();
      }
    }.doAllNodes();
    Log.info("Finished removing objects");
    return u;
  }
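The cluster-wide wipe above hinges on MRTask.doAllNodes(), which runs setupLocal() once on every node. A minimal standalone sketch of that pattern (assuming an initialized H2O-3 cloud; the class name is made up):

import water.H2O;
import water.MRTask;
import water.util.Log;

public class TouchAllNodes extends MRTask<TouchAllNodes> {
  @Override
  public void setupLocal() {
    // Runs once per node - the hook used above for node-local cleanup
    // such as H2O.raw_clear().
    Log.info("Running on node " + H2O.SELF);
  }

  public static void main(String[] args) {
    H2O.main(args);                   // join or start the cloud
    H2O.waitForCloudSize(1, 30000);   // wait until the cloud is up
    new TouchAllNodes().doAllNodes(); // blocks until setupLocal() ran everywhere
  }
}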
Example #2
 @Override
 protected Response serve() {
   init();
   link = family.defaultLink; // TODO
   tweedie_link_power = 1 - tweedie_variance_power; // TODO
   _glm = new GLMParams(family, tweedie_variance_power, link, tweedie_link_power);
   if (alpha.length > 1) { // grid search
     if (destination_key == null) destination_key = Key.make("GLMGridResults_" + Key.make());
     if (job_key == null) job_key = Key.make((byte) 0, Key.JOB, H2O.SELF);
     Job j =
         gridSearch(
             self(),
             destination_key,
             _dinfo,
             _glm,
             lambda,
             lambda_search,
             alpha,
             higher_accuracy,
             n_folds);
     return GLMGridView.redirect(this, j.dest());
   } else {
     if (destination_key == null) destination_key = Key.make("GLMModel_" + Key.make());
     if (job_key == null) job_key = Key.make("GLM2Job_" + Key.make());
     fork();
     return GLMProgress.redirect(this, job_key, dest());
   }
 }
Example #3
  /** Score a frame with the given model and return the metrics AND the prediction frame. */
  @SuppressWarnings("unused") // called through reflection by RequestServer
  public JobV3 predict2(int version, final ModelMetricsListSchemaV3 s) {
    // parameters checking:
    if (null == s.model) throw new H2OIllegalArgumentException("model", "predict", s.model);
    if (null == DKV.get(s.model.name))
      throw new H2OKeyNotFoundArgumentException("model", "predict", s.model.name);

    if (null == s.frame) throw new H2OIllegalArgumentException("frame", "predict", s.frame);
    if (null == DKV.get(s.frame.name))
      throw new H2OKeyNotFoundArgumentException("frame", "predict", s.frame.name);

    final ModelMetricsList parms = s.createAndFillImpl();

    // predict2 does not return model metrics, so it cannot handle deeplearning:
    // reconstruction_error (anomaly) or GLRM: reconstruct and archetypes.
    // predict2 can handle deeplearning: deepfeatures and predict.

    if (s.deep_features_hidden_layer > 0) {
      if (null == parms._predictions_name)
        parms._predictions_name =
            "deep_features"
                + Key.make().toString().substring(0, 5)
                + "_"
                + parms._model._key.toString()
                + "_on_"
                + parms._frame._key.toString();
    } else if (null == parms._predictions_name)
      parms._predictions_name =
          "predictions"
              + Key.make().toString().substring(0, 5)
              + "_"
              + parms._model._key.toString()
              + "_on_"
              + parms._frame._key.toString();

    final Job<Frame> j =
        new Job(Key.make(parms._predictions_name), Frame.class.getName(), "prediction");

    H2O.H2OCountedCompleter work =
        new H2O.H2OCountedCompleter() {
          @Override
          public void compute2() {
            if (s.deep_features_hidden_layer < 0) {
              parms._model.score(parms._frame, parms._predictions_name, j);
            } else {
              Frame predictions =
                  ((Model.DeepFeatures) parms._model)
                      .scoreDeepFeatures(parms._frame, s.deep_features_hidden_layer, j);
              predictions =
                  new Frame(
                      Key.make(parms._predictions_name), predictions.names(), predictions.vecs());
              DKV.put(predictions._key, predictions);
            }
            tryComplete();
          }
        };
    j.start(work, parms._frame.anyVec().nChunks());
    return new JobV3().fillFromImpl(j);
  }
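The handler wraps the scoring work in an H2O.H2OCountedCompleter and returns the JobV3 immediately. The same fire-and-forget shape in plain JDK terms, using CompletableFuture instead of H2O's fork-join machinery (a sketch, not the H2O API):

import java.util.concurrent.CompletableFuture;

public class FireAndForgetDemo {
  public static void main(String[] args) throws Exception {
    // Start the "scoring" work and hand control back to the caller at once,
    // just as predict2 returns a JobV3 while the completer keeps running.
    CompletableFuture<String> job = CompletableFuture.supplyAsync(() -> {
      // stand-in for parms._model.score(...) / scoreDeepFeatures(...)
      return "predictions_abc12_model_on_frame";
    });
    System.out.println("job started, caller is free");
    System.out.println("result frame key: " + job.get()); // join only when needed
  }
}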
Example #4
 public JobsV3 cancel(int version, JobsV3 c) {
   Job j = DKV.getGet(c.job_id.key());
   if (j == null) {
     throw new IllegalArgumentException("No job with key " + c.job_id.key());
   }
   j.stop(); // Request Job stop
   return c;
 }
Example #5
 @Override
 public boolean toHTML(StringBuilder sb) {
   Job jjob = Job.findJob(job_key);
   DRFModel m = UKV.get(jjob.dest());
   if (m != null) m.generateHTML("DRF Model", sb);
   else DocGen.HTML.paragraph(sb, "Pending...");
   return true;
 }
Example #6
 /** Finds a job with the given destination key, or returns null. */
 public static final Job findJobByDest(final Key destKey) {
   Job job = null;
   for( Job current : Job.all() ) {
     if( current.dest().equals(destKey) ) {
       job = current;
       break;
     }
   }
   return job;
 }
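The same lookup reads a little tighter with Java 8 streams (a sketch against the same Job.all() API; requires import java.util.Arrays):

  public static Job findJobByDest(final Key destKey) {
    return Arrays.stream(Job.all())
        .filter(j -> j.dest().equals(destKey))
        .findFirst()
        .orElse(null); // no job produces destKey
  }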
Example #7
 /** Finds a job with the given key, or returns null. */
 public static final Job findJob(final Key key) {
   Job job = null;
   for (Job current : Job.all()) {
     if (current.self().equals(key)) {
       job = current;
       break;
     }
   }
   return job;
 }
Example #8
  // Expand grid search related argument sets
  @Override
  protected NanoHTTPD.Response serveGrid(NanoHTTPD server, Properties parms, RequestType type) {
    String[][] values = new String[_arguments.size()][];
    boolean gridSearch = false;
    for (int i = 0; i < _arguments.size(); i++) {
      Argument arg = _arguments.get(i);
      if (arg._gridable) {
        String value = _parms.getProperty(arg._name);
        if (value != null) {
          // Skip grid expansion if the argument is an array, except for nested
          // ("imbricated") expressions. A little hackish; waiting for a real language
          boolean imbricated = value.contains("(");
          if (!arg._field.getType().isArray() || imbricated) {
            values[i] = split(value);
            if (values[i] != null && values[i].length > 1) gridSearch = true;
          } else if (arg._field.getType().isArray()
              && !imbricated) { // Copy values which are arrays
            values[i] = new String[] {value};
          }
        }
      }
    }
    if (!gridSearch) return superServeGrid(server, parms, type);

    // Ignore destination key so that each job gets its own
    _parms.remove("destination_key");
    for (int i = 0; i < _arguments.size(); i++)
      if (_arguments.get(i)._name.equals("destination_key")) values[i] = null;

    // Iterate over all argument combinations
    int[] counters = new int[values.length];
    ArrayList<Job> jobs = new ArrayList<Job>();
    for (; ; ) {
      Job job = (Job) create(_parms);
      Properties combination = new Properties();
      for (int i = 0; i < values.length; i++) {
        if (values[i] != null) {
          String value = values[i][counters[i]];
          value = value.trim();
          combination.setProperty(_arguments.get(i)._name, value);
          _arguments.get(i).reset();
          _arguments.get(i).check(job, value);
        }
      }
      job._parms = combination;
      jobs.add(job);
      if (!increment(counters, values)) break;
    }
    GridSearch grid = new GridSearch();
    grid.jobs = jobs.toArray(new Job[jobs.size()]);
    return grid.superServeGrid(server, parms, type);
  }
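serveGrid leans on an increment(counters, values) helper that is not shown here. A plausible odometer-style sketch (hypothetical - the real helper may differ):

  // Advance the rightmost grid counter and carry leftward, skipping slots that
  // are not part of the grid. Returns false once every combination was produced.
  static boolean increment(int[] counters, String[][] values) {
    for (int i = counters.length - 1; i >= 0; i--) {
      if (values[i] == null) continue;              // argument not gridded
      if (++counters[i] < values[i].length) return true;
      counters[i] = 0;                              // wrap and carry
    }
    return false;                                   // odometer rolled over: done
  }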
Example #9
  // @Ignore("PUBDEV-1643")
  @Test
  public void testDuplicatesCarsGrid() {
    Grid grid = null;
    Frame fr = null;
    Vec old = null;
    try {
      fr = parse_test_file("smalldata/junit/cars_20mpg.csv");
      fr.remove("name").remove(); // Remove unique id
      old = fr.remove("economy");
      fr.add("economy", old); // response to last column
      DKV.put(fr);

      // Setup random hyperparameter search space
      HashMap<String, Object[]> hyperParms =
          new HashMap<String, Object[]>() {
            {
              put("_ntrees", new Integer[] {5, 5});
              put("_max_depth", new Integer[] {2, 2});
              put("_mtries", new Integer[] {-1, -1});
              put("_sample_rate", new Double[] {.1, .1});
            }
          };

      // Fire off a grid search
      DRFModel.DRFParameters params = new DRFModel.DRFParameters();
      params._train = fr._key;
      params._response_column = "economy";

      // Get the Grid for this modeling class and frame
      Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms);
      grid = gs.get();

      // Check that duplicate models have not been constructed
      Model[] models = grid.getModels();
      assertTrue("Number of returned models has to be > 0", models.length > 0);
      // But all of them should be the same
      Key<Model> modelKey = models[0]._key;
      for (Model m : models) {
        assertTrue("Number of constructed models has to be equal to 1", modelKey == m._key);
      }
    } finally {
      if (old != null) {
        old.remove();
      }
      if (fr != null) {
        fr.remove();
      }
      if (grid != null) {
        grid.remove();
      }
    }
  }
Example #10
  /**
   * Block synchronously waiting for a job to end, success or not.
   * @param jobkey Job to wait for.
   * @param pollingIntervalMillis Polling interval sleep time.
   */
  public static void waitUntilJobEnded(Key jobkey, int pollingIntervalMillis) {
    while (true) {
      if (Job.isEnded(jobkey)) {
        return;
      }

      try { Thread.sleep(pollingIntervalMillis); } catch (Exception ignore) {}
    }
  }
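Usage is simply waitUntilJobEnded(jobKey, 100). Note the catch block swallows interrupts; an interrupt-friendly variant (a sketch against the same Job.isEnded API) restores the thread's interrupt flag and stops waiting:

  public static void waitUntilJobEndedInterruptibly(Key jobkey, int pollingIntervalMillis) {
    while (!Job.isEnded(jobkey)) {
      try {
        Thread.sleep(pollingIntervalMillis);
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt(); // preserve the interrupt status
        return;                             // stop waiting instead of spinning
      }
    }
  }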
Example #11
 @Override
 protected Response serve() {
   init();
   link = family.defaultLink; // TODO
   tweedie_link_power = 1 - tweedie_variance_power; // TODO
   Frame fr =
       DataInfo.prepareFrame(source, response, ignored_cols, family == Family.binomial, true);
   _dinfo = new DataInfo(fr, 1, standardize);
   _glm = new GLMParams(family, tweedie_variance_power, link, tweedie_link_power);
   if (alpha.length > 1) { // grid search
     if (destination_key == null) destination_key = Key.make("GLMGridModel_" + Key.make());
     if (job_key == null) job_key = Key.make("GLMGridJob_" + Key.make());
     Job j = gridSearch(self(), destination_key, _dinfo, _glm, lambda, alpha, n_folds);
     return GLMGridView.redirect(this, j.dest());
   } else {
     if (destination_key == null) destination_key = Key.make("GLMModel_" + Key.make());
     if (job_key == null) job_key = Key.make("GLM2Job_" + Key.make());
     fork();
     return GLMProgress.redirect(this, job_key, dest());
   }
 }
Example #12
    @Override
    public void lcompute() {
      // Optional: cancel all jobs
      //      for (Job job : Job.all()) {
      //        job.cancel();
      //        Job.waitUntilJobEnded(job.self());
      //      }

      final Set<Key> keySet = H2O.globalKeySet(null);
      for (Key key : keySet) {
        if (!key.home()) continue; // only unlock local keys
        final Value val = DKV.get(key);
        if (val == null) continue;
        if (val.rawPOJO() == null) continue; // need to have a POJO to be locked
        if (!val.isLockable()) continue;
        final Object obj = val.rawPOJO();
        assert (obj instanceof Lockable<?>);
        final Lockable<?> lockable = (Lockable<?>) (obj);
        final Key[] lockers = ((Lockable) obj)._lockers;
        if (lockers != null) {
          // check that none of the locking jobs is still running
          for (Key locker : lockers) {
            if (locker != null && locker.type() == Key.JOB) {
              final Job job = UKV.get(locker);
              if (job != null && job.isRunning())
                throw new UnsupportedOperationException(
                    "Cannot unlock all keys since locking jobs are still running.");
            }
          }
          lockable.unlock_all();
          Log.info("Unlocked key '" + key + "' from " + lockers.length + " lockers.");
        }
      }
      Log.info("All keys are now unlocked.");
      tryComplete();
    }
Example #13
  // Start by splitting all the data according to some criteria (minimize
  // variance at the leaves).  Record on each row which split it goes to, and
  // assign a split number to it (for next pass).  On *this* pass, use the
  // split-number to build a per-split histogram, with a per-histogram-bucket
  // variance.
  @Override
  protected GBMModel buildModel(
      GBMModel model,
      final Frame fr,
      String names[],
      String domains[][],
      String[] cmDomain,
      Timer t_build) {

    // Tag out rows missing the response column
    new ExcludeNAResponse().doAll(fr);

    // Build trees until we hit the limit
    int tid;
    DTree[] ktrees = null; // Trees
    TreeStats tstats = new TreeStats(); // Tree stats
    for (tid = 0; tid < ntrees; tid++) {
      // On the first iteration the model contains 0 trees; afterwards 1 tree, 2 trees, ...
      // BUT if validation is not specified, the model does not participate in voting;
      // on-the-fly computed data are used instead
      model = doScoring(model, fr, ktrees, tid, cmDomain, tstats, false, false, false);
      // ESL2, page 387
      // Step 2a: Compute prediction (prob distribution) from prior tree results:
      //   Work <== f(Tree)
      new ComputeProb().doAll(fr);

      // ESL2, page 387
      // Step 2b i: Compute residuals from the prediction (probability distribution)
      //   Work <== f(Work)
      new ComputeRes().doAll(fr);

      // ESL2, page 387, Step 2b ii, iii, iv
      Timer kb_timer = new Timer();
      ktrees = buildNextKTrees(fr);
      Log.info(Sys.GBM__, (tid + 1) + ". tree was built in " + kb_timer.toString());
      if (!Job.isRunning(self())) break; // If canceled during building, do not bulkscore

      // Check latest predictions
      tstats.updateBy(ktrees);
    }
    // Final scoring
    model = doScoring(model, fr, ktrees, tid, cmDomain, tstats, true, false, false);

    return model;
  }
Example #14
  @Override
  protected DRFModel buildModel(
      DRFModel model, final Frame fr, String names[], String domains[][], final Timer t_build) {
    // Append number of trees participating in on-the-fly scoring
    fr.add("OUT_BAG_TREES", response.makeZero());

    // The RNG used to pick split columns
    Random rand = createRNG(_seed);

    // Prepare working columns
    new SetWrkTask().doAll(fr);

    int tid;
    DTree[] ktrees = null;
    // Prepare tree statistics
    TreeStats tstats = new TreeStats();
    // Build trees until we hit the limit
    for (tid = 0; tid < ntrees; tid++) { // Building tid-tree
      model =
          doScoring(
              model, fr, ktrees, tid, tstats, tid == 0, !hasValidation(), build_tree_one_node);
      // At each iteration build K trees (K = nclass = response column domain size)

      // TODO: parallelize more? Build more than K trees at a time; we need to take care
      // of temporary data.
      // Idea: launch more DRFs at once.
      Timer kb_timer = new Timer();
      ktrees = buildNextKTrees(fr, _mtry, sample_rate, rand, tid);
      Log.info(Sys.DRF__, (tid + 1) + ". tree was built in " + kb_timer.toString());
      if (!Job.isRunning(self())) break; // If canceled during building, do not bulkscore

      // Check latest predictions
      tstats.updateBy(ktrees);
    }

    model = doScoring(model, fr, ktrees, tid, tstats, true, !hasValidation(), build_tree_one_node);
    // Make sure that we did not miss any votes
    assert !importance
            || _treeMeasuresOnOOB.npredictors() == _treeMeasuresOnSOOB[0 /*variable*/].npredictors()
        : "Missing some tree votes in variable importance voting?!";

    return model;
  }
Example #15
  /**
   * Impl class for a collection of jobs; only used in the API to make it easier to cons up the jobs
   * array via the magic of PojoUtils.copyProperties.
   */
  @SuppressWarnings("unused") // called through reflection by RequestServer
  public JobsV3 list(int version, JobsV3 s) {
    Job[] jobs = Job.jobs();
    // Jobs j = new Jobs();
    // j._jobs = Job.jobs();
    // PojoUtils.copyProperties(s, j, PojoUtils.FieldNaming.ORIGIN_HAS_UNDERSCORES);
    s.jobs = new JobV3[jobs.length];

    int i = 0;
    for (Job j : jobs) {
      try {
        s.jobs[i] = (JobV3) Schema.schema(version, j).fillFromImpl(j);
      }
      // no special schema for this job subclass, so fall back to JobV3
      catch (H2ONotFoundArgumentException e) {
        s.jobs[i] = new JobV3().fillFromImpl(j);
      }
      i++; // Java does the increment before the function call which throws?!
    }
    return s;
  }
Example #16
 /**
  * Creates a new ValueArray with classes. Unfortunately the new ValueArray is not aligned
  * with the source one, so results have to be sent to each chunk owner using Atomic.
  */
 @Override
 public void map(Key key) {
   assert key.home();
   if (Job.isRunning(_job.self())) {
     ValueArray va = DKV.get(_arykey).get();
     AutoBuffer bits = va.getChunk(key);
     long startRow = va.startRow(ValueArray.getChunkIndex(key));
     int rows = va.rpc(ValueArray.getChunkIndex(key));
     int rpc = (int) (ValueArray.CHUNK_SZ / ROW_SIZE);
     long chunk = ValueArray.chknum(startRow, va.numRows(), ROW_SIZE);
     long updatedChk = chunk;
     long updatedRow = startRow;
     double[] values = new double[_cols.length - 1];
     ClusterDist cd = new ClusterDist();
     int[] clusters = new int[rows];
     int count = 0;
     for (int row = 0; row < rows; row++) {
       KMeans.datad(va, bits, row, _cols, _normalized, values);
       KMeans.closest(_clusters, values, cd);
       chunk = ValueArray.chknum(startRow + row, va.numRows(), ROW_SIZE);
       if (chunk != updatedChk) {
         updateClusters(clusters, count, updatedChk, va.numRows(), rpc, updatedRow);
         updatedChk = chunk;
         updatedRow = startRow + row;
         count = 0;
       }
       clusters[count++] = cd._cluster;
     }
     if (count > 0) updateClusters(clusters, count, chunk, va.numRows(), rpc, updatedRow);
     _job.updateProgress(1);
   }
   _job = null;
   _arykey = null;
   _cols = null;
   _clusters = null;
 }
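map() buffers cluster assignments and flushes them whenever a row crosses into a different destination chunk. The same flush-on-boundary pattern reduced to plain runnable Java (the chunk layout and per-row work are made up):

import java.util.Arrays;

public class FlushOnBoundaryDemo {
  static final int CHUNK_ROWS = 4; // hypothetical rows per chunk

  static void flush(int[] buf, int count, long chunk) {
    System.out.println("chunk " + chunk + " <- " + Arrays.toString(Arrays.copyOf(buf, count)));
  }

  public static void main(String[] args) {
    int totalRows = 10;
    int[] buf = new int[totalRows];
    long current = 0; // chunk of row 0
    int count = 0;
    for (int row = 0; row < totalRows; row++) {
      long chunk = row / CHUNK_ROWS;
      if (chunk != current) {       // crossed a chunk boundary:
        flush(buf, count, current); // send the buffered work to its owner
        current = chunk;
        count = 0;
      }
      buf[count++] = row * row;     // stand-in per-row result
    }
    if (count > 0) flush(buf, count, current); // final partial buffer
  }
}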
Example #17
 @Override
 protected void onCancelled() {
   for (Job job : jobs) job.cancel();
 }
Example #18
  // @Ignore("PUBDEV-1648")
  @Test
  public void testRandomCarsGrid() {
    Grid grid = null;
    DRFModel drfRebuilt = null;
    Frame fr = null;
    try {
      fr = parse_test_file("smalldata/junit/cars.csv");
      fr.remove("name").remove();
      Vec old = fr.remove("economy (mpg)");
      fr.add("economy (mpg)", old); // response to last column
      DKV.put(fr);

      // Setup random hyperparameter search space
      HashMap<String, Object[]> hyperParms = new HashMap<>();

      // Construct random grid search space
      long seed = System.nanoTime();
      Random rng = new Random(seed);

      // Limit to 1-3 randomly, 4 times.  Average total number of models is
      // 2^4, or 16.  Max is 81 models.
      Integer ntreesDim = rng.nextInt(3) + 1;
      Integer maxDepthDim = rng.nextInt(3) + 1;
      Integer mtriesDim = rng.nextInt(3) + 1;
      Integer sampleRateDim = rng.nextInt(3) + 1;

      Integer[] ntreesArr = interval(1, 15);
      ArrayList<Integer> ntreesList = new ArrayList<>(Arrays.asList(ntreesArr));
      Collections.shuffle(ntreesList);
      Integer[] ntreesSpace = new Integer[ntreesDim];
      for (int i = 0; i < ntreesDim; i++) {
        ntreesSpace[i] = ntreesList.get(i);
      }

      Integer[] maxDepthArr = interval(1, 10);
      ArrayList<Integer> maxDepthList = new ArrayList<>(Arrays.asList(maxDepthArr));
      Collections.shuffle(maxDepthList);
      Integer[] maxDepthSpace = new Integer[maxDepthDim];
      for (int i = 0; i < maxDepthDim; i++) {
        maxDepthSpace[i] = maxDepthList.get(i);
      }

      Integer[] mtriesArr = interval(1, 5);
      ArrayList<Integer> mtriesList = new ArrayList<>(Arrays.asList(mtriesArr));
      Collections.shuffle(mtriesList);
      Integer[] mtriesSpace = new Integer[mtriesDim];
      for (int i = 0; i < mtriesDim; i++) {
        mtriesSpace[i] = mtriesList.get(i);
      }

      Double[] sampleRateArr = interval(0.01, 0.99, 0.01);
      ArrayList<Double> sampleRateList = new ArrayList<>(Arrays.asList(sampleRateArr));
      Collections.shuffle(sampleRateList);
      Double[] sampleRateSpace = new Double[sampleRateDim];
      for (int i = 0; i < sampleRateDim; i++) {
        sampleRateSpace[i] = sampleRateList.get(i);
      }

      hyperParms.put("_ntrees", ntreesSpace);
      hyperParms.put("_max_depth", maxDepthSpace);
      hyperParms.put("_mtries", mtriesSpace);
      hyperParms.put("_sample_rate", sampleRateSpace);

      // Fire off a grid search
      DRFModel.DRFParameters params = new DRFModel.DRFParameters();
      params._train = fr._key;
      params._response_column = "economy (mpg)";
      // Get the Grid for this modeling class and frame
      Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms);
      grid = gs.get();

      System.out.println("Test seed: " + seed);
      System.out.println("ntrees search space: " + Arrays.toString(ntreesSpace));
      System.out.println("max_depth search space: " + Arrays.toString(maxDepthSpace));
      System.out.println("mtries search space: " + Arrays.toString(mtriesSpace));
      System.out.println("sample_rate search space: " + Arrays.toString(sampleRateSpace));

      // Check the cardinality of the grid
      Model[] ms = grid.getModels();
      int numModels = ms.length;
      System.out.println("Grid consists of " + numModels + " models");
      assertEquals(
          "Number of models should match hyper space size",
          numModels,
          ntreesDim * maxDepthDim * sampleRateDim * mtriesDim + grid.getFailureCount());

      // Pick a random model from the grid
      HashMap<String, Object[]> randomHyperParms = new HashMap<>();

      Integer ntreeVal = ntreesSpace[rng.nextInt(ntreesSpace.length)];
      randomHyperParms.put("_ntrees", new Integer[] {ntreeVal});

      Integer maxDepthVal = maxDepthSpace[rng.nextInt(maxDepthSpace.length)];
      randomHyperParms.put("_max_depth", maxDepthSpace);

      Integer mtriesVal = mtriesSpace[rng.nextInt(mtriesSpace.length)];
      randomHyperParms.put("_max_depth", mtriesSpace);

      Double sampleRateVal = sampleRateSpace[rng.nextInt(sampleRateSpace.length)];
      randomHyperParms.put("_sample_rate", sampleRateSpace);

      // TODO: DRFModel drfFromGrid = (DRFModel) g2.model(randomHyperParms).get();

      // Rebuild it with its parameters
      params._ntrees = ntreeVal;
      params._max_depth = maxDepthVal;
      params._mtries = mtriesVal;
      drfRebuilt = new DRF(params).trainModel().get();

      // Make sure the MSE metrics match
      // double fromGridMSE = drfFromGrid._output._scored_train[drfFromGrid._output._ntrees]._mse;
      double rebuiltMSE = drfRebuilt._output._scored_train[drfRebuilt._output._ntrees]._mse;
      // System.out.println("The random grid model's MSE: " + fromGridMSE);
      System.out.println("The rebuilt model's MSE: " + rebuiltMSE);
      // assertEquals(fromGridMSE, rebuiltMSE);

    } finally {
      if (fr != null) {
        fr.remove();
      }
      if (grid != null) {
        grid.remove();
      }
      if (drfRebuilt != null) {
        drfRebuilt.remove();
      }
    }
  }
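The test calls interval(...) helpers that are not shown. Plausible sketches producing inclusive ranges (hypothetical - the real test helpers may differ):

  static Integer[] interval(int lo, int hi) {
    Integer[] out = new Integer[hi - lo + 1];
    for (int i = 0; i < out.length; i++) out[i] = lo + i; // lo, lo+1, ..., hi
    return out;
  }

  static Double[] interval(double lo, double hi, double step) {
    int n = (int) Math.round((hi - lo) / step) + 1;       // inclusive endpoint
    Double[] out = new Double[n];
    for (int i = 0; i < n; i++) out[i] = lo + i * step;
    return out;
  }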
Example #19
 /** Return {@link Response} for finished job. */
 @Override
 protected Response jobDone(final Job job, final Key dst) {
   JsonObject args = new JsonObject();
   args.addProperty(MODEL_KEY, job.dest().toString());
   return DRFModelView.redirect(this, job.dest());
 }
Example #20
  // --------------------------------------------------------------------------
  // Build the next k-trees, which is trying to correct the residual error from
  // the prior trees.  From ESL2, page 387.  Step 2b ii, iii.
  private DTree[] buildNextKTrees(Frame fr) {
    // We're going to build K (nclass) trees - each focused on correcting
    // errors for a single class.
    final DTree[] ktrees = new DTree[_nclass];

    // Initial set of histograms.  All trees; one leaf per tree (the root
    // leaf); all columns
    DHistogram hcs[][][] = new DHistogram[_nclass][1 /*just root leaf*/][_ncols];

    for (int k = 0; k < _nclass; k++) {
      // Initially setup as-if an empty-split had just happened
      if (_distribution == null || _distribution[k] != 0) {
        // The Boolean Optimization
        // This optimization assumes the 2nd tree of a 2-class system is the
        // inverse of the first.  This is false for DRF (and true for GBM) -
        // DRF picks a random different set of columns for the 2nd tree.
        if (k == 1 && _nclass == 2) continue;
        ktrees[k] = new DTree(fr._names, _ncols, (char) nbins, (char) _nclass, min_rows);
        new GBMUndecidedNode(
            ktrees[k],
            -1,
            DHistogram.initialHist(fr, _ncols, nbins, hcs[k][0], false)); // The "root" node
      }
    }
    int[] leafs = new int[_nclass]; // Define a "working set" of leaf splits, from here to tree._len

    // ----
    // ESL2, page 387.  Step 2b ii.
    // One Big Loop till the ktrees are of proper depth.
    // Adds a layer to the trees each pass.
    int depth = 0;
    for (; depth < max_depth; depth++) {
      if (!Job.isRunning(self())) return null;

      hcs = buildLayer(fr, ktrees, leafs, hcs, false, false);

      // If we did not make any new splits, then the tree is split-to-death
      if (hcs == null) break;
    }

    // Each tree bottomed-out in a DecidedNode; go 1 more level and insert
    // LeafNodes to hold predictions.
    for (int k = 0; k < _nclass; k++) {
      DTree tree = ktrees[k];
      if (tree == null) continue;
      int leaf = leafs[k] = tree.len();
      for (int nid = 0; nid < leaf; nid++) {
        if (tree.node(nid) instanceof DecidedNode) {
          DecidedNode dn = tree.decided(nid);
          for (int i = 0; i < dn._nids.length; i++) {
            int cnid = dn._nids[i];
            if (cnid == -1
                || // Bottomed out (predictors or responses known constant)
                tree.node(cnid) instanceof UndecidedNode
                || // Or chopped off for depth
                (tree.node(cnid) instanceof DecidedNode
                    && // Or not possible to split
                    ((DecidedNode) tree.node(cnid))._split.col() == -1))
              dn._nids[i] = new GBMLeafNode(tree, nid).nid(); // Mark a leaf here
          }
          // Handle the trivial non-splitting tree
          if (nid == 0 && dn._split.col() == -1) new GBMLeafNode(tree, -1, 0);
        }
      }
    } // -- k-trees are done

    // ----
    // ESL2, page 387.  Step 2b iii.  Compute the gammas, and store them back
    // into the tree leaves.  Includes learn_rate.
    //    gamma_i_k = (nclass-1)/nclass * (sum res_i / sum (|res_i|*(1-|res_i|)))
    // For regression:
    //    gamma_i_k = sum res_i / count(res_i)
    GammaPass gp = new GammaPass(ktrees, leafs).doAll(fr);
    double m1class = _nclass > 1 ? (double) (_nclass - 1) / _nclass : 1.0; // K-1/K
    for (int k = 0; k < _nclass; k++) {
      final DTree tree = ktrees[k];
      if (tree == null) continue;
      for (int i = 0; i < tree._len - leafs[k]; i++) {
        double g =
            gp._gss[k][i] == 0 // Constant response?
                ? (gp._rss[k][i] == 0
                    ? 0
                    : 1000) // Cap (exponential) learn, instead of dealing with Inf
                : learn_rate * m1class * gp._rss[k][i] / gp._gss[k][i];
        assert !Double.isNaN(g);
        ((LeafNode) tree.node(leafs[k] + i))._pred = g;
      }
    }

    // ----
    // ESL2, page 387.  Step 2b iv.  Cache the sum of all the trees, plus the
    // new tree, in the 'tree' columns.  Also, zap the NIDs for next pass.
    // Tree <== f(Tree)
    // Nids <== 0
    new MRTask2() {
      @Override
      public void map(Chunk chks[]) {
        // For all tree/klasses
        for (int k = 0; k < _nclass; k++) {
          final DTree tree = ktrees[k];
          if (tree == null) continue;
          final Chunk nids = chk_nids(chks, k);
          final Chunk ct = chk_tree(chks, k);
          for (int row = 0; row < nids._len; row++) {
            int nid = (int) nids.at80(row);
            if (nid < 0) continue;
            ct.set0(row, (float) (ct.at0(row) + ((LeafNode) tree.node(nid))._pred));
            nids.set0(row, 0);
          }
        }
      }
    }.doAll(fr);

    // Collect leaves stats
    for (int i = 0; i < ktrees.length; i++)
      if (ktrees[i] != null) ktrees[i].leaves = ktrees[i].len() - leafs[i];
    // DEBUG: Print the generated K trees
    // printGenerateTrees(ktrees);

    return ktrees;
  }
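The gamma computation in Step 2b iii, isolated into a standalone helper for readability (a sketch that mirrors the loop's arithmetic, including the cap for constant-response leaves):

  static double leafGamma(double rs, double gs, int nclass, double learnRate) {
    double m1class = nclass > 1 ? (double) (nclass - 1) / nclass : 1.0; // (K-1)/K
    if (gs == 0)                  // constant response in this leaf?
      return rs == 0 ? 0 : 1000;  // cap the (exponential) learn instead of Inf
    return learnRate * m1class * rs / gs;
  }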
Example #21
  /**
   * Extracts the values, applies regularization to numerics, adds appropriate offsets to
   * categoricals, and adapts response according to the CaseMode/CaseValue if set.
   */
  @Override
  public final void map(Chunk[] chunks, NewChunk[] outputs) {
    if (_job != null && _job.self() != null && !Job.isRunning(_job.self()))
      throw new JobCancelledException();
    final int nrows = chunks[0]._len;
    final long offset = chunks[0]._start;
    chunkInit();
    double[] nums = MemoryManager.malloc8d(_dinfo._nums);
    int[] cats = MemoryManager.malloc4(_dinfo._cats);
    double[] response = MemoryManager.malloc8d(_dinfo._responses);
    int start = 0;
    int end = nrows;

    boolean contiguous = false;
    Random skip_rng = null; // random generator for skipping rows
    if (_useFraction < 1.0) {
      skip_rng = water.util.Utils.getDeterRNG(new Random().nextLong());
      if (contiguous) {
        final int howmany = (int) Math.ceil(_useFraction * nrows);
        if (howmany > 0) {
          start = skip_rng.nextInt(nrows - howmany);
          end = start + howmany;
        }
        assert (start < nrows);
        assert (end <= nrows);
      }
    }

    long[] shuf_map = null;
    if (_shuffle) {
      shuf_map = new long[end - start];
      for (int i = 0; i < shuf_map.length; ++i) shuf_map[i] = start + i;
      Utils.shuffleArray(shuf_map, new Random().nextLong());
    }

    OUTER:
    for (int rr = start; rr < end; ++rr) {
      final int r = shuf_map != null ? (int) shuf_map[rr - start] : rr;
      if ((_dinfo._nfolds > 0 && (r % _dinfo._nfolds) == _dinfo._foldId)
          || (skip_rng != null && skip_rng.nextFloat() > _useFraction)) continue;
      for (Chunk c : chunks) if (c.isNA0(r)) continue OUTER; // skip rows with NAs!
      int i = 0, ncats = 0;
      for (; i < _dinfo._cats; ++i) {
        int c = (int) chunks[i].at80(r);
        if (c != 0) cats[ncats++] = c + _dinfo._catOffsets[i] - 1;
      }
      final int n = chunks.length - _dinfo._responses;
      for (; i < n; ++i) {
        double d = chunks[i].at0(r);
        if (_dinfo._normMul != null)
          d = (d - _dinfo._normSub[i - _dinfo._cats]) * _dinfo._normMul[i - _dinfo._cats];
        nums[i - _dinfo._cats] = d;
      }
      for (i = 0; i < _dinfo._responses; ++i) {
        response[i] = chunks[chunks.length - _dinfo._responses + i].at0(r);
        if (_dinfo._normRespMul != null)
          response[i] = (response[i] - _dinfo._normRespSub[i]) * _dinfo._normRespMul[i];
      }
      if (outputs != null && outputs.length > 0)
        processRow(offset + r, nums, ncats, cats, response, outputs);
      else processRow(offset + r, nums, ncats, cats, response);
    }
    chunkDone();
  }
Example #22
 @Override
 public float progress() {
   double d = 0.1;
   for (Job job : jobs) d += job.progress();
   return Math.min(1f, (float) (d / jobs.length));
 }
Example #23
 @Override public void remove() {
   super.remove();
   UKV.remove(_progress);
 }
Example #24
  // --------------------------------------------------------------------------
  // Build the next random K trees, representing the tid-th tree
  private DTree[] buildNextKTrees(Frame fr, int mtrys, float sample_rate, Random rand, int tid) {
    // We're going to build K (nclass) trees - each focused on correcting
    // errors for a single class.
    final DTree[] ktrees = new DTree[_nclass];

    // Initial set of histograms.  All trees; one leaf per tree (the root
    // leaf); all columns
    DHistogram hcs[][][] = new DHistogram[_nclass][1 /*just root leaf*/][_ncols];

    // Use the same seed for all k-trees. NOTE: this is only to make a fair
    // view across all k-trees
    long rseed = rand.nextLong();
    // Initially setup as-if an empty-split had just happened
    for (int k = 0; k < _nclass; k++) {
      assert (_distribution != null && classification)
          || (_distribution == null && !classification);
      if (_distribution == null || _distribution[k] != 0) { // Ignore missing classes
        // The Boolean Optimization
        // This optimization assumes the 2nd tree of a 2-class system is the
        // inverse of the first.  This is false for DRF (and true for GBM) -
        // DRF picks a random different set of columns for the 2nd tree.
        // if( k==1 && _nclass==2 ) continue;
        ktrees[k] = new DRFTree(fr, _ncols, (char) nbins, (char) _nclass, min_rows, mtrys, rseed);
        boolean isBinom = classification;
        new DRFUndecidedNode(
            ktrees[k],
            -1,
            DHistogram.initialHist(fr, _ncols, nbins, hcs[k][0], isBinom)); // The "root" node
      }
    }

    // Sample - mark the lines by putting 'OUT_OF_BAG' into nid(<klass>) vector
    Timer t_1 = new Timer();
    Sample ss[] = new Sample[_nclass];
    for (int k = 0; k < _nclass; k++)
      if (ktrees[k] != null)
        ss[k] =
            new Sample((DRFTree) ktrees[k], sample_rate)
                .dfork(0, new Frame(vec_nids(fr, k), vec_resp(fr, k)), build_tree_one_node);
    for (int k = 0; k < _nclass; k++) if (ss[k] != null) ss[k].getResult();
    Log.debug(Sys.DRF__, "Sampling took: + " + t_1);

    // Define a "working set" of leaf splits, from leafs[i] to tree._len for each tree i
    int[] leafs = new int[_nclass];

    // ----
    // One Big Loop till the ktrees are of proper depth.
    // Adds a layer to the trees each pass.
    Timer t_2 = new Timer();
    int depth = 0;
    for (; depth < max_depth; depth++) {
      if (!Job.isRunning(self())) return null;

      hcs = buildLayer(fr, ktrees, leafs, hcs, true, build_tree_one_node);

      // If we did not make any new splits, then the tree is split-to-death
      if (hcs == null) break;
    }
    Log.debug(Sys.DRF__, "Tree build took: " + t_2);

    // Each tree bottomed-out in a DecidedNode; go 1 more level and insert
    // LeafNodes to hold predictions.
    Timer t_3 = new Timer();
    for (int k = 0; k < _nclass; k++) {
      DTree tree = ktrees[k];
      if (tree == null) continue;
      int leaf = leafs[k] = tree.len();
      for (int nid = 0; nid < leaf; nid++) {
        if (tree.node(nid) instanceof DecidedNode) {
          DecidedNode dn = tree.decided(nid);
          for (int i = 0; i < dn._nids.length; i++) {
            int cnid = dn._nids[i];
            if (cnid == -1
                || // Bottomed out (predictors or responses known constant)
                tree.node(cnid) instanceof UndecidedNode
                || // Or chopped off for depth
                (tree.node(cnid) instanceof DecidedNode
                    && // Or not possible to split
                    ((DecidedNode) tree.node(cnid))._split.col() == -1)) {
              LeafNode ln = new DRFLeafNode(tree, nid);
              ln._pred = dn.pred(i); // Set prediction into the leaf
              dn._nids[i] = ln.nid(); // Mark a leaf here
            }
          }
          // Handle the trivial non-splitting tree
          if (nid == 0 && dn._split.col() == -1) new DRFLeafNode(tree, -1, 0);
        }
      }
    } // -- k-trees are done
    Log.debug(Sys.DRF__, "Nodes propagation: " + t_3);

    // ----
    // Move rows into the final leaf rows
    Timer t_4 = new Timer();
    CollectPreds cp = new CollectPreds(ktrees, leafs).doAll(fr, build_tree_one_node);
    if (importance) {
      if (classification)
        asVotes(_treeMeasuresOnOOB)
            .append(cp.rightVotes, cp.allRows); // Track right votes over OOB rows for this tree
      else /* regression */ asSSE(_treeMeasuresOnOOB).append(cp.sse, cp.allRows);
    }
    Log.debug(Sys.DRF__, "CollectPreds done: " + t_4);

    // Collect leaves stats
    for (int i = 0; i < ktrees.length; i++)
      if (ktrees[i] != null) ktrees[i].leaves = ktrees[i].len() - leafs[i];
    // DEBUG: Print the generated K trees
    // printGenerateTrees(ktrees);

    return ktrees;
  }
Example #25
  /**
   * Extracts the values, applies regularization to numerics, adds appropriate offsets to
   * categoricals, and adapts response according to the CaseMode/CaseValue if set.
   */
  @Override
  public final void map(Chunk[] chunks, NewChunk[] outputs) {
    if (_jobKey != null && !Job.isRunning(_jobKey)) throw new JobCancelledException();
    final int nrows = chunks[0]._len;
    final long offset = chunks[0].start();
    boolean doWork = chunkInit();
    if (!doWork) return;
    final boolean obs_weights = _dinfo._weights && !_fr.vecs()[_dinfo.weightChunkId()].isConst();
    final double global_weight_sum =
        obs_weights ? _fr.vecs()[_dinfo.weightChunkId()].mean() * _fr.numRows() : 0;

    DataInfo.Row row = _dinfo.newDenseRow();
    double[] weight_map = null;
    double relative_chunk_weight = 1;
    // TODO: store node-local helper arrays in _dinfo -> avoid re-allocation and construction
    if (obs_weights) {
      weight_map = new double[nrows];
      double weight_sum = 0;
      for (int i = 0; i < nrows; ++i) {
        row = _dinfo.extractDenseRow(chunks, i, row);
        weight_sum += row.weight;
        weight_map[i] = weight_sum;
        assert (i == 0 || row.weight == 0 || weight_map[i] > weight_map[i - 1]);
      }
      if (weight_sum > 0) {
        ArrayUtils.div(weight_map, weight_sum); // normalize to 0...1
        relative_chunk_weight = global_weight_sum * nrows / _fr.numRows() / weight_sum;
      } else return; // nothing to do here - all rows have 0 weight
    }

    // Example:
    // _useFraction = 0.8 -> 1 repeat with fraction = 0.8
    // _useFraction = 1.0 -> 1 repeat with fraction = 1.0
    // _useFraction = 1.1 -> 2 repeats with fraction = 0.55
    // _useFraction = 2.1 -> 3 repeats with fraction = 0.7
    // _useFraction = 3.0 -> 3 repeats with fraction = 1.0
    final int repeats = (int) Math.ceil(_useFraction * relative_chunk_weight);
    final float fraction = (float) (_useFraction * relative_chunk_weight) / repeats;
    assert (fraction <= 1.0);

    final boolean sample = (fraction < 0.999 || obs_weights || _shuffle);
    final Random skip_rng =
        sample
            ? RandomUtils.getRNG(
                (0x8734093502429734L + _seed + offset) * (_iteration + 0x9823423497823423L))
            : null;

    long num_processed_rows = 0;
    for (int rep = 0; rep < repeats; ++rep) {
      for (int row_idx = 0; row_idx < nrows; ++row_idx) {
        int r = sample ? -1 : 0;
        // only train with a given number of training samples (fraction*nrows)
        if (sample && !obs_weights && skip_rng.nextDouble() > fraction) continue;
        if (obs_weights && num_processed_rows % 2 == 0) {
          // every second row is randomly sampled -> that way we won't "forget" rare rows
          // importance sampling based on inverse of cumulative distribution
          double key = skip_rng.nextDouble();
          r = Arrays.binarySearch(weight_map, 0, nrows, key);
          //          Log.info(Arrays.toString(weight_map));
          //          Log.info("key: " + key + " idx: " + (r >= 0 ? r : (-r-1)));
          if (r < 0) r = -r - 1;
          assert (r == 0 || weight_map[r] > weight_map[r - 1]);
        } else if (r == -1) {
          do {
            r = skip_rng.nextInt(nrows); // random sampling (with replacement)
          }
          // if we have weights, and we did the %2 skipping above, then we need to find an alternate
          // row with non-zero weight
          while (obs_weights
              && ((r == 0 && weight_map[0] == 0) || (r > 0 && weight_map[r] == weight_map[r - 1])));
        } else {
          assert (!obs_weights);
          r = row_idx; // linear scan - slightly faster
        }
        assert (r >= 0 && r <= nrows);

        row = _dinfo.extractDenseRow(chunks, r, row);
        if (!row.bad) {
          // check that we never process a row that was held out via row.weight = 0
          assert (row.weight > 0);
          long seed = offset + rep * nrows + r;
          if (outputs != null && outputs.length > 0) processRow(seed++, row, outputs);
          else processRow(seed++, row);
        }
        num_processed_rows++;
      }
    }
    assert (fraction != 1 || num_processed_rows == repeats * nrows);
    chunkDone(num_processed_rows);
  }
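The repeats/fraction table in the comment above is easy to verify in isolation (taking relative_chunk_weight as 1):

public class FractionDemo {
  public static void main(String[] args) {
    for (double f : new double[] {0.8, 1.0, 1.1, 2.1, 3.0}) {
      int repeats = (int) Math.ceil(f);  // number of passes over the chunk
      double fraction = f / repeats;     // per-pass sampling fraction
      System.out.printf("_useFraction=%.1f -> %d repeat(s) with fraction=%.2f%n",
          f, repeats, fraction);
    }
  }
}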
Example #26
  // @Ignore("PUBDEV-1648")
  @Test
  public void testRandomCarsGrid() {
    Grid grid = null;
    GBMModel gbmRebuilt = null;
    Frame fr = null;
    Vec old = null;
    try {
      fr = parse_test_file("smalldata/junit/cars.csv");
      fr.remove("name").remove();
      old = fr.remove("economy (mpg)");

      fr.add("economy (mpg)", old); // response to last column
      DKV.put(fr);

      // Setup random hyperparameter search space
      HashMap<String, Object[]> hyperParms = new HashMap<>();
      hyperParms.put("_distribution", new DistributionFamily[] {DistributionFamily.gaussian});

      // Construct random grid search space
      Random rng = new Random();

      Integer ntreesDim = rng.nextInt(4) + 1;
      Integer maxDepthDim = rng.nextInt(4) + 1;
      Integer learnRateDim = rng.nextInt(4) + 1;

      Integer[] ntreesArr = interval(1, 25);
      ArrayList<Integer> ntreesList = new ArrayList<>(Arrays.asList(ntreesArr));
      Collections.shuffle(ntreesList);
      Integer[] ntreesSpace = new Integer[ntreesDim];
      for (int i = 0; i < ntreesDim; i++) {
        ntreesSpace[i] = ntreesList.get(i);
      }

      Integer[] maxDepthArr = interval(1, 10);
      ArrayList<Integer> maxDepthList = new ArrayList<>(Arrays.asList(maxDepthArr));
      Collections.shuffle(maxDepthList);
      Integer[] maxDepthSpace = new Integer[maxDepthDim];
      for (int i = 0; i < maxDepthDim; i++) {
        maxDepthSpace[i] = maxDepthList.get(i);
      }

      Double[] learnRateArr = interval(0.01, 1.0, 0.01);
      ArrayList<Double> learnRateList = new ArrayList<>(Arrays.asList(learnRateArr));
      Collections.shuffle(learnRateList);
      Double[] learnRateSpace = new Double[learnRateDim];
      for (int i = 0; i < learnRateDim; i++) {
        learnRateSpace[i] = learnRateList.get(i);
      }

      hyperParms.put("_ntrees", ntreesSpace);
      hyperParms.put("_max_depth", maxDepthSpace);
      hyperParms.put("_learn_rate", learnRateSpace);

      // Fire off a grid search
      GBMModel.GBMParameters params = new GBMModel.GBMParameters();
      params._train = fr._key;
      params._response_column = "economy (mpg)";
      // Get the Grid for this modeling class and frame
      Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms);
      grid = gs.get();

      System.out.println("ntrees search space: " + Arrays.toString(ntreesSpace));
      System.out.println("max_depth search space: " + Arrays.toString(maxDepthSpace));
      System.out.println("learn_rate search space: " + Arrays.toString(learnRateSpace));

      // Check the cardinality of the grid
      Model[] ms = grid.getModels();
      Integer numModels = ms.length;
      System.out.println("Grid consists of " + numModels + " models");
      assertTrue(numModels == ntreesDim * maxDepthDim * learnRateDim);

      // Pick a random model from the grid
      HashMap<String, Object[]> randomHyperParms = new HashMap<>();
      randomHyperParms.put("_distribution", new DistributionFamily[] {DistributionFamily.gaussian});

      Integer ntreeVal = ntreesSpace[rng.nextInt(ntreesSpace.length)];
      randomHyperParms.put("_ntrees", new Integer[] {ntreeVal});

      Integer maxDepthVal = maxDepthSpace[rng.nextInt(maxDepthSpace.length)];
      randomHyperParms.put("_max_depth", maxDepthSpace);

      Double learnRateVal = learnRateSpace[rng.nextInt(learnRateSpace.length)];
      randomHyperParms.put("_learn_rate", learnRateSpace);

      // TODO: GBMModel gbmFromGrid = (GBMModel) g2.model(randomHyperParms).get();

      // Rebuild it with its parameters
      params._distribution = DistributionFamily.gaussian;
      params._ntrees = ntreeVal;
      params._max_depth = maxDepthVal;
      params._learn_rate = learnRateVal;
      GBM gbm = new GBM(params);
      gbmRebuilt = gbm.trainModel().get();
      assertTrue(gbm.isStopped());

      // Make sure the MSE metrics match
      // double fromGridMSE = gbmFromGrid._output._scored_train[gbmFromGrid._output._ntrees]._mse;
      double rebuiltMSE = gbmRebuilt._output._scored_train[gbmRebuilt._output._ntrees]._mse;
      // System.out.println("The random grid model's MSE: " + fromGridMSE);
      System.out.println("The rebuilt model's MSE: " + rebuiltMSE);
      // assertEquals(fromGridMSE, rebuiltMSE);

    } finally {
      if (old != null) old.remove();
      if (fr != null) fr.remove();
      if (grid != null) grid.remove();
      if (gbmRebuilt != null) gbmRebuilt.remove();
    }
  }
Example #27
  @Override
  protected Response serve() {
    int tasks = 0;
    int finished = 0;
    RFModel model = _modelKey.value();
    double[] weights = _weights.value();
    // Finish refresh after rf model is done and confusion matrix for all trees is computed
    boolean done = false;
    int classCol = _classCol.specified() ? _classCol.value() : findResponseIdx(model);

    tasks = model._totalTrees;
    finished = model.size();

    // Handle cancelled/aborted jobs
    if (_job.value() != null) {
      Job jjob = Job.findJob(_job.value());
      if (jjob != null && jjob.isCancelled())
        return Response.error(
            jjob.exception == null ? "Job was cancelled by user!" : jjob.exception);
    }

    JsonObject response = defaultJsonResponse();
    // CM return and possible computation is requested
    if (!_noCM.value() && (finished == tasks || _iterativeCM.value()) && finished > 0) {
      // Compute the highest number of trees which is less than a threshold
      int modelSize = tasks * _refreshThresholdCM.value() / 100;
      modelSize =
          modelSize == 0 || finished == tasks ? finished : modelSize * (finished / modelSize);

      // Get the job computing the matrix - if no job is computing, then start a new job
      Job cmJob =
          ConfusionTask.make(
              model, modelSize, _dataKey.value()._key, classCol, weights, _oobee.value());
      // Here the job is running - it saved a CM which may already be finished or in an
      // invalid state.
      CMFinal confusion = UKV.get(cmJob.dest());
      // if the matrix is valid, report it in the JSON
      if (confusion != null && confusion.valid() && modelSize > 0) {
        // finished += 1;
        JsonObject cm = new JsonObject();
        JsonArray cmHeader = new JsonArray();
        JsonArray matrix = new JsonArray();
        cm.addProperty(JSON_CM_TYPE, _oobee.value() ? "OOB error estimate" : "full scoring");
        cm.addProperty(JSON_CM_CLASS_ERR, confusion.classError());
        cm.addProperty(JSON_CM_ROWS_SKIPPED, confusion.skippedRows());
        cm.addProperty(JSON_CM_ROWS, confusion.rows());
        // create the header
        for (String s : cfDomain(confusion, 1024)) cmHeader.add(new JsonPrimitive(s));
        cm.add(JSON_CM_HEADER, cmHeader);
        // add the matrix
        final int nclasses = confusion.dimension();
        JsonArray classErrors = new JsonArray();
        for (int crow = 0; crow < nclasses; ++crow) {
          JsonArray row = new JsonArray();
          int classHitScore = 0;
          for (int ccol = 0; ccol < nclasses; ++ccol) {
            row.add(new JsonPrimitive(confusion.matrix(crow, ccol)));
            if (crow != ccol) classHitScore += confusion.matrix(crow, ccol);
          }
          // may produce Infinity members in case of 0.f/0
          classErrors.add(
              new JsonPrimitive(
                  (float) classHitScore / (classHitScore + confusion.matrix(crow, crow))));
          matrix.add(row);
        }
        cm.add(JSON_CM_CLASSES_ERRORS, classErrors);
        cm.add(JSON_CM_MATRIX, matrix);
        cm.addProperty(JSON_CM_TREES, modelSize);
        response.add(JSON_CM, cm);
        // Signal end only and only if all trees were generated and confusion matrix is valid
        done = finished == tasks;
      }
    } else if (_noCM.value() && finished == tasks) done = true;

    // Trees
    JsonObject trees = new JsonObject();
    trees.addProperty(Constants.TREE_COUNT, model.size());
    if (model.size() > 0) {
      trees.add(Constants.TREE_DEPTH, model.depth().toJson());
      trees.add(Constants.TREE_LEAVES, model.leaves().toJson());
    }
    response.add(Constants.TREES, trees);

    // Build a response
    Response r;
    if (done) {
      r = jobDone(response);
      r.addHeader(
          "<div class='alert'>"
              + /*RFScore.link(MODEL_KEY, model._key, "Use this model for scoring.") */ GeneratePredictionsPage
                  .link(model._key, "Predict!")
              + " </div>");
    } else {
      r = Response.poll(response, finished, tasks);
    }
    r.setBuilder(JSON_CM, new ConfusionMatrixBuilder());
    r.setBuilder(TREES, new TreeListBuilder());
    return r;
  }
Example #28
  @Test
  public void testCarsGrid() {
    Grid<GBMModel.GBMParameters> grid = null;
    Frame fr = null;
    Vec old = null;
    try {
      fr = parse_test_file("smalldata/junit/cars.csv");
      fr.remove("name").remove(); // Remove unique id
      old = fr.remove("cylinders");
      fr.add("cylinders", old.toCategoricalVec()); // response to last column
      DKV.put(fr);

      // Setup hyperparameter search space
      final Double[] legalLearnRateOpts = new Double[] {0.01, 0.1, 0.3};
      final Double[] illegalLearnRateOpts = new Double[] {-1.0};
      HashMap<String, Object[]> hyperParms =
          new HashMap<String, Object[]>() {
            {
              put("_ntrees", new Integer[] {1, 2});
              put("_distribution", new DistributionFamily[] {DistributionFamily.multinomial});
              put("_max_depth", new Integer[] {1, 2, 5});
              put("_learn_rate", ArrayUtils.join(legalLearnRateOpts, illegalLearnRateOpts));
            }
          };

      // Names of the used hyper parameters
      String[] hyperParamNames = hyperParms.keySet().toArray(new String[hyperParms.size()]);
      Arrays.sort(hyperParamNames);
      int hyperSpaceSize = ArrayUtils.crossProductSize(hyperParms);

      // Fire off a grid search
      GBMModel.GBMParameters params = new GBMModel.GBMParameters();
      params._train = fr._key;
      params._response_column = "cylinders";
      // Get the Grid for this modeling class and frame
      Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms);
      grid = (Grid<GBMModel.GBMParameters>) gs.get();
      // Make sure number of produced models match size of specified hyper space
      Assert.assertEquals(
          "Size of grid (models+failures) should match to size of hyper space",
          hyperSpaceSize,
          grid.getModelCount() + grid.getFailureCount());
      //
      // Make sure that names of used parameters match
      //
      String[] gridHyperNames = grid.getHyperNames();
      Arrays.sort(gridHyperNames);
      Assert.assertArrayEquals(
          "Hyper parameters names should match!", hyperParamNames, gridHyperNames);

      //
      // Make sure that values of used parameters match as well to the specified values
      //
      Key<Model>[] mKeys = grid.getModelKeys();
      Map<String, Set<Object>> usedHyperParams = GridTestUtils.initMap(hyperParamNames);
      for (Key<Model> mKey : mKeys) {
        GBMModel gbm = (GBMModel) mKey.get();
        System.out.println(
            gbm._output._scored_train[gbm._output._ntrees]._mse
                + " "
                + Arrays.deepToString(
                    ArrayUtils.zip(grid.getHyperNames(), grid.getHyperValues(gbm._parms))));
        GridTestUtils.extractParams(usedHyperParams, gbm._parms, hyperParamNames);
      }
      // Remove illegal options
      hyperParms.put("_learn_rate", legalLearnRateOpts);
      GridTestUtils.assertParamsEqual(
          "Grid models parameters have to cover specified hyper space",
          hyperParms,
          usedHyperParams);

      // Verify model failure
      Map<String, Set<Object>> failedHyperParams = GridTestUtils.initMap(hyperParamNames);
      ;
      for (Model.Parameters failedParams : grid.getFailedParameters()) {
        GridTestUtils.extractParams(failedHyperParams, failedParams, hyperParamNames);
      }
      hyperParms.put("_learn_rate", illegalLearnRateOpts);
      GridTestUtils.assertParamsEqual(
          "Failed model parameters have to correspond to specified hyper space",
          hyperParms,
          failedHyperParams);

    } finally {
      if (old != null) {
        old.remove();
      }
      if (fr != null) {
        fr.remove();
      }
      if (grid != null) {
        grid.remove();
      }
    }
  }
Example #29
 /**
  * Check if the given job is running.
  *
  * @param job_key job key
  * @return true if the job is still running, false otherwise.
  */
 public static boolean isRunning(Key job_key) {
   Job j = UKV.get(job_key);
    assert j != null : "Job should always be in DKV!";
   return j.isRunning();
 }
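A null-tolerant variant (a sketch against the same UKV/Job API) treats a missing job as "not running" instead of asserting, which is safer once finished jobs may be removed from the store:

  public static boolean isRunningSafely(Key job_key) {
    Job j = UKV.get(job_key);
    return j != null && j.isRunning();
  }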