예제 #1
0
파일: Job.java 프로젝트: chouclee/h2o
 /**
  * Reacts to a failure of this job: removes the destination key, stamps the error message onto
  * the stored progress object (if one is present in the DKV) and finally cancels the job with
  * the given exception.
  *
  * @param ex the failure that interrupted the job
  */
 public void onException(Throwable ex) {
   UKV.remove(dest());
   Value progressValue = DKV.get(progressKey());
   if (progressValue != null) {
     // Publish the error-flagged progress back under the same key.
     ChunkProgress current = progressValue.get();
     ChunkProgress failed = current.error(ex.getMessage());
     DKV.put(progressKey(), failed);
   }
   cancel(ex);
 }
예제 #2
0
 /**
  * Recursively walks {@code p} and registers every regular file found below it in the DKV.
  *
  * <p>For each file a JSON entry (key, file, size) is appended to {@code succeeded}. Any exception
  * raised while processing {@code p} is logged and reported as one entry in {@code failed};
  * note that the failure aborts the remaining entries of that directory listing.
  *
  * <p>Streams opened on the file system are closed before leaving the branch that opened them.
  *
  * @param fs file system to read from; when {@code null} the call is a no-op
  * @param p file or directory to import
  * @param succeeded receives one object per imported file
  * @param failed receives one object per path whose processing threw
  */
 private static void addFolder(FileSystem fs, Path p, JsonArray succeeded, JsonArray failed) {
   try {
     if (fs == null) return;
     for (FileStatus file : fs.listStatus(p)) {
       Path pfs = file.getPath();
       if (file.isDir()) {
         addFolder(fs, pfs, succeeded, failed);
       } else {
         Key k = Key.make(pfs.toString());
         long size = file.getLen();
         Value val = null;
         if (pfs.getName().endsWith(Extensions.JSON)) {
           JsonObject json;
           InputStreamReader reader = new InputStreamReader(fs.open(pfs));
           try {
             json = new JsonParser().parse(reader).getAsJsonObject();
           } finally {
             reader.close(); // also closes the underlying HDFS stream
           }
           JsonElement v = json.get(Constants.VERSION);
           if (v == null) throw new RuntimeException("Missing version");
           JsonElement type = json.get(Constants.TYPE);
           if (type == null) throw new RuntimeException("Missing type");
           Class<?> c = Class.forName(type.getAsString());
           OldModel model = (OldModel) c.newInstance();
           model.fromJson(json);
           // NOTE(review): val stays null on this branch, so DKV.put below receives a null
           // value and the loaded model is not stored here - confirm this is intended.
         } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
           byte[] mem;
           FSDataInputStream s = fs.open(pfs);
           try {
             int sz = (int) Math.min(1L << 20, size); // Read up to the 1st meg
             mem = MemoryManager.malloc1(sz);
             s.readFully(mem);
           } finally {
             s.close();
           }
           // Convert to a ValueArray (hope it fits in 1Meg!)
           ValueArray ary = new ValueArray(k, 0).read(new AutoBuffer(mem));
           val = new Value(k, ary, Value.HDFS);
         } else if (size >= 2 * ValueArray.CHUNK_SZ) {
           // ValueArray byte wrapper over a large file
           val = new Value(k, new ValueArray(k, size), Value.HDFS);
         } else {
           val = new Value(k, (int) size, Value.HDFS); // Plain Value
           val.setdsk();
         }
         DKV.put(k, val);
         Log.info("PersistHdfs: DKV.put(" + k + ")");
         JsonObject o = new JsonObject();
         o.addProperty(Constants.KEY, k.toString());
         o.addProperty(Constants.FILE, pfs.toString());
         o.addProperty(Constants.VALUE_SIZE, file.getLen());
         succeeded.add(o);
       }
     }
   } catch (Exception e) {
     Log.err(e);
     JsonObject o = new JsonObject();
     o.addProperty(Constants.FILE, p.toString());
     o.addProperty(Constants.ERROR, e.getMessage());
     failed.add(o);
   }
 }
예제 #3
0
  /**
   * Initialize the ModelBuilder, validating all arguments and preparing the training frame. This
   * call is expected to be overridden in the subclasses and each subclass will start with
   * "super.init();". This call is made by the front-end whenever the GUI is clicked, and needs to
   * be fast; heavy-weight prep needs to wait for the trainModel() call.
   *
   * <p>Validate the requested ntrees; precompute actual ntrees. Validate the number of classes to
   * predict on; validate a checkpoint. Every violation is reported through {@code error(field,
   * message)}; validation continues after an error so that several problems can be reported.
   *
   * @param expensive forwarded unchanged to {@code super.init(expensive)}
   */
  @Override
  public void init(boolean expensive) {
    super.init(expensive);
    if (H2O.ARGS.client && _parms._build_tree_one_node)
      error("_build_tree_one_node", "Cannot run on a single node in client mode");
    if (_vresponse != null) _vresponse_key = _vresponse._key;
    if (_response != null) _response_key = _response._key;
    if (_nclass > SharedTreeModel.SharedTreeParameters.MAX_SUPPORTED_LEVELS)
      error("_nclass", "Too many levels in response column!");

    if (_parms._min_rows < 0) error("_min_rows", "Requested min_rows must be greater than 0");

    // NOTE(review): the check admits ntrees == 0 although the message says "between 1 and ..."
    // - confirm whether zero trees is meant to be accepted.
    if (_parms._ntrees < 0 || _parms._ntrees > 100000)
      error("_ntrees", "Requested ntrees must be between 1 and 100000");
    _ntrees = _parms._ntrees; // Total trees in final model
    if (_parms._checkpoint) { // Asking to continue from checkpoint?
      Value cv = DKV.get(_parms._model_id);
      if (cv != null) { // Look for prior model
        M checkpointModel = cv.get();
        if (_parms._ntrees < checkpointModel._output._ntrees + 1)
          error(
              "_ntrees",
              // Parentheses are required: without them "+ 1" is string concatenation and a
              // prior model with 50 trees would be reported as "501".
              "Requested ntrees must be between "
                  + (checkpointModel._output._ntrees + 1)
                  + " and 100000");
        _ntrees = _parms._ntrees - checkpointModel._output._ntrees; // Needed trees
      }
    }
    if (_parms._nbins <= 1) error("_nbins", "_nbins must be > 1.");
    if (_parms._nbins >= 1 << 16) error("_nbins", "_nbins must be < " + (1 << 16));
    if (_parms._nbins_cats <= 1) error("_nbins_cats", "_nbins_cats must be > 1.");
    if (_parms._nbins_cats >= 1 << 16) error("_nbins_cats", "_nbins_cats must be < " + (1 << 16));
    if (_parms._max_depth <= 0) error("_max_depth", "_max_depth must be > 0.");
    if (_parms._min_rows <= 0) error("_min_rows", "_min_rows must be > 0.");
    if (_parms._distribution == Distributions.Family.tweedie) {
      _parms._distribution.tweedie.p = _parms._tweedie_power;
    }
    if (_train != null) {
      double sumWeights =
          _train.numRows() * (hasWeightCol() ? _train.vec(_parms._weights_column).mean() : 1);
      // Need at least 2*min_rows weighted rows to split even once
      if (sumWeights < 2 * _parms._min_rows) {
        error(
            "_min_rows",
            "The dataset size is too small to split for min_rows="
                + _parms._min_rows
                + ": must have at least "
                + 2 * _parms._min_rows
                + " (weighted) rows, but have only "
                + sumWeights
                + ".");
      }
    }
    if (_train != null) _ncols = _train.numCols() - 1 - numSpecialCols();
  }
예제 #4
0
  /**
   * Initialize the ModelBuilder, validating all arguments and preparing the training frame. This
   * call is expected to be overridden in the subclasses and each subclass will start with
   * "super.init();". This call is made by the front-end whenever the GUI is clicked, and needs to
   * be fast; heavy-weight prep needs to wait for the trainModel() call.
   *
   * <p>Validate the requested ntrees; precompute actual ntrees. Validate the number of classes to
   * predict on; validate a checkpoint. Every violation is reported through {@code error(field,
   * message)}; validation continues after an error so that several problems can be reported.
   *
   * @param expensive forwarded unchanged to {@code super.init(expensive)}
   */
  @Override
  public void init(boolean expensive) {
    super.init(expensive);
    if (H2O.ARGS.client && _parms._build_tree_one_node)
      error("_build_tree_one_node", "Cannot run on a single node in client mode");
    if (_vresponse != null) _vresponse_key = _vresponse._key;
    if (_response != null) _response_key = _response._key;

    if (_parms._min_rows < 0) error("_min_rows", "Requested min_rows must be greater than 0");

    if (_parms._ntrees < 0 || _parms._ntrees > MAX_NTREES)
      error("_ntrees", "Requested ntrees must be between 1 and " + MAX_NTREES);
    _ntrees = _parms._ntrees; // Total trees in final model
    if (_parms.hasCheckpoint()) { // Asking to continue from checkpoint?
      Value cv = DKV.get(_parms._checkpoint);
      if (cv != null) { // Look for prior model
        M checkpointModel = cv.get();
        try {
          _parms.validateWithCheckpoint(checkpointModel._parms);
        } catch (H2OIllegalArgumentException e) {
          // Surface the checkpoint incompatibility as a regular validation error
          error(e.values.get("argument").toString(), e.values.get("value").toString());
        }
        // The check accepts ntrees == prior + 1, so the bound reported to the user is the
        // number of trees already in the checkpoint (previously printed one too high).
        if (_parms._ntrees < checkpointModel._output._ntrees + 1)
          error(
              "_ntrees",
              "If checkpoint is specified then requested ntrees must be higher than "
                  + checkpointModel._output._ntrees);

        // Compute number of trees to build for this checkpoint
        _ntrees = _parms._ntrees - checkpointModel._output._ntrees; // Needed trees
      }
    }
    if (_parms._nbins <= 1) error("_nbins", "_nbins must be > 1.");
    if (_parms._nbins >= 1 << 16) error("_nbins", "_nbins must be < " + (1 << 16));
    if (_parms._nbins_cats <= 1) error("_nbins_cats", "_nbins_cats must be > 1.");
    if (_parms._nbins_cats >= 1 << 16) error("_nbins_cats", "_nbins_cats must be < " + (1 << 16));
    if (_parms._max_depth <= 0) error("_max_depth", "_max_depth must be > 0.");
    if (_parms._min_rows <= 0) error("_min_rows", "_min_rows must be > 0.");
    if (_train != null) {
      double sumWeights =
          _train.numRows() * (hasWeightCol() ? _train.vec(_parms._weights_column).mean() : 1);
      // Need at least 2*min_rows weighted rows to split even once
      if (sumWeights < 2 * _parms._min_rows) {
        error(
            "_min_rows",
            "The dataset size is too small to split for min_rows="
                + _parms._min_rows
                + ": must have at least "
                + 2 * _parms._min_rows
                + " (weighted) rows, but have only "
                + sumWeights
                + ".");
      }
    }
    if (_train != null) _ncols = _train.numCols() - 1 - numSpecialCols();
  }
  /**
   * Time-averaging step of the Elastic Averaging algorithm (cf. equation 6 of
   * arXiv:1412.6651v5).
   *
   * <p>When an elastic average is already stored in the DKV and the moving rate is not 1, the two
   * models are blended as {@code rate * nodeAverage + (1 - rate) * elasticAverage}; otherwise the
   * node average is deep-cloned. The result has its processed-local counter reset to 0 and is
   * written back to the DKV under its elastic-average key.
   *
   * @param nodeAverageModel current average of per-node models; {@code mult(rate)} is invoked on
   *     it when blending takes place
   * @return Time-average of node-averages (consensus model, "the" model)
   */
  public static DeepLearningModelInfo timeAverage(DeepLearningModelInfo nodeAverageModel) {
    final float movingRate = (float) nodeAverageModel.get_params()._elastic_averaging_moving_rate;
    assert (movingRate > 0 && movingRate <= 1);
    // Fetch the latest stored version from the DKV
    DeepLearningModelInfo consensus = DKV.getGet(nodeAverageModel.elasticAverageModelInfoKey());
    final boolean blend = consensus != null && movingRate != 1;
    if (blend) {
      nodeAverageModel.mult(movingRate);
      consensus.mult(1 - movingRate);
      consensus.add(nodeAverageModel); // ignore processed local value set here
      consensus.set_processed_global(nodeAverageModel.get_processed_global());
    } else {
      consensus = nodeAverageModel.deep_clone();
    }
    consensus.set_processed_local(0);
    DKV.put(consensus.elasticAverageModelInfoKey(), consensus);
    return consensus;
  }
예제 #6
0
 /**
  * Recomputes {@code _model_mem_size} as the sum of {@code _max} over the DKV values of the tree
  * keys in the first {@code trees_so_far} rows of {@code _treeKeys}, counting only non-null keys
  * whose {@code home()} is true.
  */
 @Override
 protected void setupLocal() {
   _model_mem_size = 0;
   for (int tree = 0; tree < trees_so_far; ++tree) {
     for (Key<CompressedTree> treeKey : _treeKeys[tree]) {
       // only look at homed tree keys; empty slots are skipped as well
       if (treeKey == null || !treeKey.home()) continue;
       _model_mem_size += DKV.get(treeKey)._max;
     }
   }
 }
예제 #7
0
  // GLRM scoring is data imputation based on feature domains using reconstructed XY (see Udell
  // (2015), Section 5.3).
  //
  // Builds the working frame [A,X,P] (A = adaptedFr, X = loading frame, P = imputed frame), runs
  // GLRMScore over it, publishes P in the DKV under destination_key (or a fresh key when that is
  // null) and records model metrics against orig. Returns the published imputed frame.
  private Frame reconstruct(
      Frame orig,
      Frame adaptedFr,
      Key destination_key,
      boolean save_imputed,
      boolean reverse_transform) {
    final int ncols = _output._names.length;
    assert ncols == adaptedFr.numCols();
    final String prefix = "reconstr_";

    // Note: A is adapted to original training frame, P has columns shuffled so cats come before
    // nums!
    Frame work = new Frame(adaptedFr);
    Frame loadings = DKV.get(_output._representation_key).get();
    work.add(loadings);

    // Append one zero-filled output column per feature, carrying the adapted column's domain
    String[][] domains = adaptedFr.domains();
    for (int col = 0; col < ncols; col++) {
      Vec imputedCol = work.anyVec().makeZero();
      imputedCol.setDomain(domains[col]);
      work.add(prefix + _output._names[col], imputedCol);
    }
    GLRMScore scorer =
        new GLRMScore(ncols, _parms._k, save_imputed, reverse_transform).doAll(work);

    // The imputed columns sit after the ncols adapted columns and the k loading columns
    int from = ncols + _parms._k;
    int to = work.numCols();
    Frame imputed = work.extractFrame(from, to);

    Key outKey = (null == destination_key) ? Key.make() : destination_key;
    imputed = new Frame(outKey, imputed.names(), imputed.vecs());
    DKV.put(imputed);
    // save error metrics based on imputed data
    scorer._mb.makeModelMetrics(GLRMModel.this, orig, null, null);
    return imputed;
  }
예제 #8
0
 /**
  * Reads {@code v._max} bytes for the given value into a freshly allocated buffer.
  *
  * <p>With an ice root configured the bytes come from the value's ice file. Otherwise the path is
  * derived from the key; for an arraylet chunk the read starts at the chunk's offset inside the
  * base file (plus the length of the stored ValueArray bytes for hex files).
  *
  * @param v value whose bytes should be read
  * @return buffer of length {@code v._max} handed to the read task
  */
 @Override
 public byte[] load(final Value v) {
   final byte[] buf = MemoryManager.malloc1(v._max);
   long offset = 0;
   final Path path;
   if (_iceRoot == null) {
     Key fileKey = v._key;
     // Convert an arraylet chunk into a long-offset from the base file.
     if (fileKey._kb[0] == Key.ARRAYLET_CHUNK) {
       offset = ValueArray.getChunkOffset(fileKey); // The offset
       fileKey = ValueArray.getArrayKey(fileKey); // From the base file key
       if (fileKey.toString().endsWith(Extensions.HEX)) { // Hex file?
         // Skip over the ValueArray header too
         offset += DKV.get(fileKey).memOrLoad().length;
       }
     }
     path = new Path(fileKey.toString());
   } else {
     path = new Path(_iceRoot, getIceName(v));
   }
   final long skipBytes = offset;
   run(
       new Callable() {
         @Override
         public Object call() throws Exception {
           FileSystem fs = FileSystem.get(path.toUri(), CONF);
           FSDataInputStream in = null;
           try {
             in = fs.open(path);
             // NOTE:
             // A positioned in.readFully(skip, buf, 0, buf.length) degrades performance of HDFS
             // load from S3 API; the simple skip-then-read of the Google API performs better.
             // Load of 300MB file via Google API ~ 14sec, via readFully ~ 5min (under the same
             // condition)
             ByteStreams.skipFully(in, skipBytes);
             ByteStreams.readFully(in, buf);
             assert v.isPersisted();
           } finally {
             Utils.close(in);
           }
           return null;
         }
       },
       true,
       v._max);
   return buf;
 }
예제 #9
0
파일: Job.java 프로젝트: chouclee/h2o
 /**
  * Helper to perform the generic part of cross validation. Expected to be called from each
  * specific instance's crossValidate method.
  *
  * <p>Re-targets this job at fold {@code i}: derives a new job key, points the destination at
  * {@code xval_models[i]}, swaps in the fold's train/validation frames, publishes the job in the
  * DKV and then calls {@code invoke()}. The statement order matters: the job is put into the DKV
  * before {@code offsets} and {@code _cv} are updated.
  *
  * @param splits Frames containing train/test splits; {@code splits[0]} becomes the source and
  *     {@code splits[1]} the validation frame
  * @param offsets Array to store the offsets of starting row indices for each cross-validation
  *     run; entry {@code i + 1} is written here
  * @param i Which fold of cross-validation to perform
  */
 protected final void genericCrossValidation(Frame[] splits, long[] offsets, int i) {
   // Resolve the response column index against the ORIGINAL source before it is replaced below
   int respidx = source.find(_responseName);
   assert (respidx != -1) : "response is not found in source!";
   job_key = Key.make(job_key.toString() + "_xval" + i); // make a new Job for CV
   assert (xval_models != null);
   destination_key = xval_models[i];
   source = splits[0];
   validation = splits[1];
   // Same column index, now looked up in the fold's training split
   response = source.vecs()[respidx];
   n_folds = 0;
   state = Job.JobState.CREATED; // Hack to allow this job to run
   DKV.put(self(), this); // Needed to pass the Job.isRunning(cvdl.self()) check in FrameTask
   // Next fold starts right after this fold's validation rows
   offsets[i + 1] = offsets[i] + validation.numRows();
   _cv =
       true; // Hack to allow init() to pass for ColumnsJob (allow cols/ignored_cols to co-exist)
   invoke();
 }
예제 #10
0
 /**
  * Performs deep clone of given model.
  *
  * <p>The model is cloned under {@code _dest}, its metrics references are cleared, and every
  * non-null compressed tree it references is fetched from the DKV and cloned under a newly made
  * tree key, which then replaces the original key in the clone's tree-key table.
  *
  * <p>FIXME: fetch all data to the caller node
  *
  * @param model model to copy
  * @return the clone
  */
 protected M getModelDeepClone(M model) {
   M copy = IcedUtils.clone(model, _dest);
   // Do not clone model metrics
   copy._output._model_metrics = new Key[0];
   copy._output._training_metrics = null;
   copy._output._validation_metrics = null;
   // Clone trees
   Key[][] treeKeys = copy._output._treeKeys;
   for (int tree = 0; tree < treeKeys.length; tree++) {
     Key[] perClass = treeKeys[tree];
     for (int cls = 0; cls < perClass.length; cls++) {
       if (perClass[cls] != null) {
         CompressedTree original = DKV.get(perClass[cls]).get();
         CompressedTree cloned =
             IcedUtils.clone(original, CompressedTree.makeTreeKey(tree, cls), true);
         perClass[cls] = cloned._key;
       }
     }
   }
   return copy;
 }
예제 #11
0
 /**
  * Processes one chunk: for every row finds the closest cluster and accumulates the per-cluster
  * row count ({@code _rows}) and distance sum ({@code _dist}). The input fields {@code _arykey},
  * {@code _cols} and {@code _clusters} are nulled once the chunk has been processed.
  *
  * @param key key of the chunk to process; asserted to be homed here
  */
 @Override
 public void map(Key key) {
   _rows = new long[_clusters.length];
   _dist = new double[_clusters.length];
   assert key.home();
   ValueArray ary = DKV.get(_arykey).get();
   AutoBuffer chunkBits = ary.getChunk(key);
   int rowCount = ary.rpc(ValueArray.getChunkIndex(key));
   double[] point = new double[_cols.length - 1];
   ClusterDist nearest = new ClusterDist();
   for (int r = 0; r < rowCount; r++) {
     KMeans.datad(ary, chunkBits, r, _cols, _normalized, point);
     KMeans.closest(_clusters, point, nearest);
     _rows[nearest._cluster]++;
     _dist[nearest._cluster] += nearest._dist;
   }
   // Drop the inputs once the chunk is done
   _arykey = null;
   _cols = null;
   _clusters = null;
 }
예제 #12
0
  /**
   * Scores {@code frame} for metrics only: adapts a copy of the frame to the training layout,
   * appends the loading frame fetched from the DKV, runs {@code GLRMScore} and builds the model
   * metrics against the adapted frame.
   *
   * @param frame frame to score
   * @return the metrics produced by the scoring pass
   */
  public ModelMetricsGLRM scoreMetricsOnly(Frame frame) {
    final int ncols = _output._names.length;

    // Need [A,X] where A = adapted test frame, X = loading frame
    // Note: A is adapted to original training frame
    Frame adapted = new Frame(frame);
    adaptTestForTrain(adapted, true, false);
    assert ncols == adapted.numCols();

    // Append loading frame X for calculating XY
    Frame withLoadings = new Frame(adapted);
    Frame loadings = DKV.get(_output._representation_key).get();
    withLoadings.add(loadings);

    GLRMScore scorer = new GLRMScore(ncols, _parms._k, false).doAll(withLoadings);
    // save error metrics based on imputed data
    return (ModelMetricsGLRM) scorer._mb.makeModelMetrics(GLRMModel.this, adapted, null, null);
  }
예제 #13
0
파일: Job.java 프로젝트: chouclee/h2o
 /**
  * Moves this job into {@code resultingState}, logging the outcome, recording {@code msg} as the
  * job's exception text, replacing the job by its handle, issuing a DKV write barrier and finally
  * submitting a task that invokes {@code onCancelled()}.
  *
  * @param msg message stored as the job's exception; logged as an error unless the job was
  *     cancelled
  * @param resultingState state to assign; {@code JobState.CANCELLED} is logged at info level,
  *     anything else as a failure
  */
 private void cancel(final String msg, JobState resultingState) {
   if (resultingState != JobState.CANCELLED) {
     Log.err("Job " + self() + "(" + description + ") failed.");
     Log.err(msg);
   } else {
     Log.info("Job " + self() + "(" + description + ") was cancelled.");
   }
   exception = msg;
   state = resultingState;
   // replace finished job by a job handle
   replaceByJobHandle();
   DKV.write_barrier();
   final Job finishedJob = this;
   H2OCountedCompleter notifier =
       new H2OCountedCompleter() {
         @Override
         public void compute2() {
           finishedJob.onCancelled();
         }
       };
   H2O.submitTask(notifier);
 }
예제 #14
0
 /**
  * Creates a new ValueArray with classes. New ValueArray is not aligned with source one
  * unfortunately so have to send results to each chunk owner using Atomic.
  *
  * <p>While the job is running, every row of the source chunk is assigned to its closest
  * cluster. Assignments are buffered and flushed through {@code updateClusters} whenever the
  * destination chunk number of the current row differs from that of the buffered rows, and once
  * more at the end. The task's input fields are nulled before returning, whether or not the job
  * was running.
  *
  * @param key key of the source chunk; asserted to be homed here
  */
 @Override
 public void map(Key key) {
   assert key.home();
   if (Job.isRunning(_job.self())) {
     ValueArray ary = DKV.get(_arykey).get();
     AutoBuffer chunkBits = ary.getChunk(key);
     long firstRow = ary.startRow(ValueArray.getChunkIndex(key));
     int rowCount = ary.rpc(ValueArray.getChunkIndex(key));
     int destRowsPerChunk = (int) (ValueArray.CHUNK_SZ / ROW_SIZE);
     long destChunk = ValueArray.chknum(firstRow, ary.numRows(), ROW_SIZE);
     long pendingChunk = destChunk; // destination chunk of the buffered assignments
     long pendingRow = firstRow; // global row of the first buffered assignment
     double[] point = new double[_cols.length - 1];
     ClusterDist nearest = new ClusterDist();
     int[] assignments = new int[rowCount];
     int pending = 0;
     for (int r = 0; r < rowCount; r++) {
       KMeans.datad(ary, chunkBits, r, _cols, _normalized, point);
       KMeans.closest(_clusters, point, nearest);
       destChunk = ValueArray.chknum(firstRow + r, ary.numRows(), ROW_SIZE);
       if (destChunk != pendingChunk) {
         // Crossed into another destination chunk: flush what was buffered for the old one
         updateClusters(
             assignments, pending, pendingChunk, ary.numRows(), destRowsPerChunk, pendingRow);
         pendingChunk = destChunk;
         pendingRow = firstRow + r;
         pending = 0;
       }
       assignments[pending++] = nearest._cluster;
     }
     if (pending > 0) {
       updateClusters(
           assignments, pending, destChunk, ary.numRows(), destRowsPerChunk, pendingRow);
     }
     _job.updateProgress(1);
   }
   _job = null;
   _arykey = null;
   _cols = null;
   _clusters = null;
 }
예제 #15
0
    /**
     * Worker body of the CoxPH job: read-locks the input frames, creates and locks the model,
     * then iterates up to {@code iter_max + 1} times. Each iteration runs a {@code CoxPHTask}
     * over the adapted frame with the current coefficients; when the log-likelihood improved the
     * model statistics are refreshed and a new step is computed, otherwise the previous step is
     * halved. The model is written back with {@code model.update(_key)} after the loop.
     *
     * <p>The {@code finally} block runs on every exit path and calls {@code done()} there.
     */
    @Override
    protected void compute2() {
      CoxPHModel model = null;
      try {
        Scope.enter();
        _parms.read_lock_frames(CoxPH.this);
        init(true);

        applyScoringFrameSideEffects();

        // The model to be built
        model = new CoxPHModel(dest(), _parms, new CoxPHModel.CoxPHOutput(CoxPH.this));
        model.delete_and_lock(_key);

        applyTrainingFrameSideEffects();

        int nResponses = 1;
        boolean useAllFactorLevels = false;
        final DataInfo dinfo =
            new DataInfo(
                Key.make(),
                _modelBuilderTrain,
                null,
                nResponses,
                useAllFactorLevels,
                DataInfo.TransformType.DEMEAN,
                TransformType.NONE,
                true,
                false,
                false,
                false,
                false,
                false);
        initStats(model, dinfo);

        // Offset columns are part of dinfo.fullN() but get no coefficient of their own
        final int n_offsets =
            (model._parms.offset_columns == null) ? 0 : model._parms.offset_columns.length;
        final int n_coef = dinfo.fullN() - n_offsets;
        final double[] step = MemoryManager.malloc8d(n_coef);
        final double[] oldCoef = MemoryManager.malloc8d(n_coef);
        final double[] newCoef = MemoryManager.malloc8d(n_coef);
        Arrays.fill(step, Double.NaN);
        Arrays.fill(oldCoef, Double.NaN);
        for (int j = 0; j < n_coef; ++j) newCoef[j] = model._parms.init;
        // Start below any attainable value so the first iteration always counts as an improvement
        double oldLoglik = -Double.MAX_VALUE;
        final int n_time = (int) (model._output.max_time - model._output.min_time + 1);
        final boolean has_start_column = (model._parms.start_column != null);
        final boolean has_weights_column = (model._parms.weights_column != null);
        for (int i = 0; i <= model._parms.iter_max; ++i) {
          model._output.iter = i;

          final CoxPHTask coxMR =
              new CoxPHTask(
                      self(),
                      dinfo,
                      newCoef,
                      model._output.min_time,
                      n_time,
                      n_offsets,
                      has_start_column,
                      has_weights_column)
                  .doAll(dinfo._adaptedFrame);

          final double newLoglik = calcLoglik(model, coxMR);
          if (newLoglik > oldLoglik) {
            // Counts are only computed on the very first pass
            if (i == 0) calcCounts(model, coxMR);

            calcModelStats(model, newCoef, newLoglik);
            calcCumhaz_0(model, coxMR);

            // lre = -log10 of the (relative, unless newLoglik is 0) change in log-likelihood
            if (newLoglik == 0) model._output.lre = -Math.log10(Math.abs(oldLoglik - newLoglik));
            else model._output.lre = -Math.log10(Math.abs((oldLoglik - newLoglik) / newLoglik));
            // Stop iterating once the requested precision is reached
            if (model._output.lre >= model._parms.lre_min) break;

            // step = -(var_coef * gradient)
            Arrays.fill(step, 0);
            for (int j = 0; j < n_coef; ++j)
              for (int k = 0; k < n_coef; ++k)
                step[j] -= model._output.var_coef[j][k] * model._output.gradient[k];
            // NOTE(review): this break only leaves the scanning loop itself, so a NaN/infinite
            // step does not stop the outer iteration - confirm whether that was intended.
            for (int j = 0; j < n_coef; ++j)
              if (Double.isNaN(step[j]) || Double.isInfinite(step[j])) break;

            oldLoglik = newLoglik;
            System.arraycopy(newCoef, 0, oldCoef, 0, oldCoef.length);
          } else {
            // No improvement: retry from the last accepted coefficients with half the step
            for (int j = 0; j < n_coef; ++j) step[j] /= 2;
          }

          for (int j = 0; j < n_coef; ++j) newCoef[j] = oldCoef[j] - step[j];
        }

        model.update(_key);
      } catch (Throwable t) {
        // NOTE(review): thisJob is dereferenced without a null check; presumably the job is
        // always present in the DKV at this point - verify.
        Job thisJob = DKV.getGet(_key);
        if (thisJob._state == JobState.CANCELLED) {
          Log.info("Job cancelled by user.");
        } else {
          t.printStackTrace();
          failed(t);
          throw t;
        }
      } finally {
        // Runs on success, cancellation and failure alike
        updateModelOutput();
        _parms.read_unlock_frames(CoxPH.this);
        Scope.exit();
        done(); // Job done!
      }
      tryComplete();
    }
예제 #16
0
    // Main worker thread.
    //
    // Validates the parameters, read-locks the input frames, creates and locks the model, picks
    // initial centers and then repeats Lloyds passes until isDone(...) reports completion. The
    // model is updated in the K/V store after every accepted iteration; an optional validation
    // frame is scored once at the end. The finally block always unlocks the model (if one was
    // created) and the input frames.
    @Override
    protected void compute2() {

      KMeansModel model = null;
      try {
        init(true);
        // Do lock even before checking the errors, since this block is finalized by unlock
        // (not the best solution, but the code is more readable)
        _parms.read_lock_frames(KMeans.this); // Fetch & read-lock input frames
        // Something goes wrong
        if (error_count() > 0)
          throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(KMeans.this);
        // The model to be built
        model = new KMeansModel(dest(), _parms, new KMeansModel.KMeansOutput(KMeans.this));
        model.delete_and_lock(_key);

        //
        final Vec vecs[] = _train.vecs();
        // mults & means for standardization
        final double[] means = _train.means(); // means are used to impute NAs
        final double[] mults = _parms._standardize ? _train.mults() : null;
        // Per column: -1 for numeric columns, otherwise the value from DataInfo.imputeCat
        final int[] impute_cat = new int[vecs.length];
        for (int i = 0; i < vecs.length; i++)
          impute_cat[i] = vecs[i].isNumeric() ? -1 : DataInfo.imputeCat(vecs[i]);
        model._output._normSub = means;
        model._output._normMul = mults;
        // Initialize cluster centers and standardize if requested
        double[][] centers = initial_centers(model, vecs, means, mults, impute_cat);
        // NOTE(review): this early return still runs the finally block, but skips both done()
        // and tryComplete() - confirm that is the intended behavior on cancellation.
        if (centers == null) return; // Stopped/cancelled during center-finding
        double[][] oldCenters = null;

        // ---
        // Run the main KMeans Clustering loop
        // Stop after enough iterations or average_change < TOLERANCE
        model._output._iterations =
            0; // Loop ends only when iterations > max_iterations with strict inequality
        while (!isDone(model, centers, oldCenters)) {
          Lloyds task =
              new Lloyds(centers, means, mults, impute_cat, _isCats, _parms._k, hasWeightCol())
                  .doAll(vecs);
          // Pick the max categorical level for cluster center
          max_cats(task._cMeans, task._cats, _isCats);

          // Handle the case where some centers go dry.  Rescue only 1 cluster
          // per iteration ('cause we only tracked the 1 worst row)
          if (cleanupBadClusters(task, vecs, centers, means, mults, impute_cat)) continue;

          // Compute model stats; update standardized cluster centers
          oldCenters = centers;
          centers = computeStatsFillModel(task, model, vecs, means, mults, impute_cat);

          model.update(_key); // Update model in K/V store
          update(1); // One unit of work
          if (model._parms._score_each_iteration) Log.info(model._output._model_summary);
        }

        Log.info(model._output._model_summary);
        //        Log.info(model._output._scoring_history);
        //
        // Log.info(((ModelMetricsClustering)model._output._training_metrics).createCentroidStatsTable().toString());

        // At the end: validation scoring (no need to gather scoring history)
        if (_valid != null) {
          model.score(_parms.valid()).delete(); // this appends a ModelMetrics on the validation set
          model._output._validation_metrics = ModelMetrics.getFromDKV(model, _parms.valid());
          model.update(_key); // Update model in K/V store
        }
        done(); // Job done!

      } catch (Throwable t) {
        // NOTE(review): thisJob is dereferenced without a null check; presumably the job is
        // always present in the DKV at this point - verify.
        Job thisJob = DKV.getGet(_key);
        if (thisJob._state == JobState.CANCELLED) {
          Log.info("Job cancelled by user.");
        } else {
          t.printStackTrace();
          failed(t);
          throw t;
        }
      } finally {
        // Release locks on every exit path; model is null when creation never happened
        updateModelOutput();
        if (model != null) model.unlock(_key);
        _parms.read_unlock_frames(KMeans.this);
      }
      tryComplete();
    }
예제 #17
0
 /**
  * Returns the validation response vector. Falls back to {@code response()} when no validation
  * response key is set; otherwise the vector is fetched from the DKV on first use and cached in
  * {@code _vresponse}.
  */
 @Override
 public Vec vresponse() {
   if (_vresponse_key == null) return response();
   if (_vresponse == null) {
     _vresponse = DKV.getGet(_vresponse_key);
   }
   return _vresponse;
 }
예제 #18
0
 /**
  * Returns the response vector, fetching it from the DKV via {@code _response_key} on first use
  * and caching it in {@code _response}.
  */
 @Override
 public Vec response() {
   if (_response == null) {
     _response = DKV.getGet(_response_key);
   }
   return _response;
 }
예제 #19
0
    /**
     * Worker body shared by the tree builders: validates the parameters, read-locks the input
     * frames, creates a fresh model (or write-locks the existing one when continuing from a
     * checkpoint), optionally rebalances the classes, appends the per-class working columns to
     * the training frame and then delegates to {@code buildModel()}.
     *
     * <p>The {@code finally} block unlocks the model (when one exists) and the input frames and
     * exits the Scope; when a model exists its key and its training/validation metrics keys are
     * passed to {@code Scope.exit}.
     */
    @Override
    protected void compute2() {
      _model = null; // Resulting model!
      try {
        Scope.enter(); // Cleanup temp keys
        init(true); // Do any expensive tests & conversions now
        // Do lock even before checking the errors, since this block is finalized by unlock
        // (not the best solution, but the code is more readable)
        _parms.read_lock_frames(SharedTree.this); // Fetch & read-lock input frames
        if (error_count() > 0)
          throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(SharedTree.this);

        // New Model?  Or continuing from a checkpoint?
        if (_parms._checkpoint && DKV.get(_parms._model_id) != null) {
          _model = DKV.get(_dest).get();
          _model.write_lock(_key); // do not delete previous model; we are extending it
        } else { // New Model
          // Compute the zero-tree error - guessing only the class distribution.
          // MSE is stddev squared when guessing for regression.
          // For classification, guess the largest class.
          _model =
              makeModel(
                  _dest,
                  _parms,
                  initial_MSE(_response, _response),
                  _valid == null
                      ? Double.NaN
                      : initial_MSE(_response, _vresponse)); // Make a fresh model
          _model.delete_and_lock(_key); // and clear & write-lock it (smashing any prior)
          _model._output._init_f = _initialPrediction;
        }

        // Compute the response domain; makes for nicer printouts
        String[] domain = _response.domain();
        assert (_nclass > 1 && domain != null) || (_nclass == 1 && domain == null);
        if (_nclass == 1) domain = new String[] {"r"}; // For regression, give a name to class 0

        // Compute class distribution, used to for initial guesses and to
        // upsample minority classes (if asked for).
        if (_nclass > 1) { // Classification?

          // Handle imbalanced classes by stratified over/under-sampling.
          // initWorkFrame sets the modeled class distribution, and
          // model.score() corrects the probabilities back using the
          // distribution ratios
          if (_model._output.isClassifier() && _parms._balance_classes) {

            float[] trainSamplingFactors =
                new float
                    [_train
                        .lastVec()
                        .domain()
                        .length]; // leave initialized to 0 -> will be filled up below
            if (_parms._class_sampling_factors != null) {
              // User-supplied factors must provide exactly one entry per response level
              if (_parms._class_sampling_factors.length != _train.lastVec().domain().length)
                throw new IllegalArgumentException(
                    "class_sampling_factors must have "
                        + _train.lastVec().domain().length
                        + " elements");
              trainSamplingFactors =
                  _parms._class_sampling_factors.clone(); // clone: don't modify the original
            }
            Frame stratified =
                water.util.MRUtils.sampleFrameStratified(
                    _train,
                    _train.lastVec(),
                    _train.vec(_model._output.weightsName()),
                    trainSamplingFactors,
                    (long) (_parms._max_after_balance_size * _train.numRows()),
                    _parms._seed,
                    true,
                    false);
            // A different frame instance means sampling actually replaced the training data
            if (stratified != _train) {
              _train = stratified;
              _response = stratified.vec(_parms._response_column);
              _weights = stratified.vec(_parms._weights_column);
              // Recompute distribution since the input frame was modified
              MRUtils.ClassDist cdmt2 =
                  _weights != null
                      ? new MRUtils.ClassDist(_nclass).doAll(_response, _weights)
                      : new MRUtils.ClassDist(_nclass).doAll(_response);
              _model._output._distribution = cdmt2.dist();
              _model._output._modelClassDist = cdmt2.rel_dist();
            }
          }
          Log.info("Prior class distribution: " + Arrays.toString(_model._output._priorClassDist));
          Log.info("Model class distribution: " + Arrays.toString(_model._output._modelClassDist));
        }

        // Also add to the basic working Frame these sets:
        //   nclass Vecs of current forest results (sum across all trees)
        //   nclass Vecs of working/temp data
        //   nclass Vecs of NIDs, allowing 1 tree per class

        // Current forest values: results of summing the prior M trees
        for (int i = 0; i < _nclass; i++) _train.add("Tree_" + domain[i], _response.makeZero());

        // Initial work columns.  Set-before-use in the algos.
        for (int i = 0; i < _nclass; i++) _train.add("Work_" + domain[i], _response.makeZero());

        // One Tree per class, each tree needs a NIDs.  For empty classes use a -1
        // NID signifying an empty regression tree.
        for (int i = 0; i < _nclass; i++)
          _train.add(
              "NIDs_" + domain[i],
              _response.makeCon(
                  _model._output._distribution == null
                      ? 0
                      : (_model._output._distribution[i] == 0 ? -1 : 0)));

        // Tag out rows missing the response column
        new ExcludeNAResponse().doAll(_train);

        // Variable importance: squared-error-improvement-per-variable-per-split
        _improvPerVar = new float[_ncols];

        // Sub-class tree-model-builder specific build code
        buildModel();
        done(); // Job done!
      } catch (Throwable t) {
        // NOTE(review): thisJob is dereferenced without a null check; presumably the job is
        // always present in the DKV at this point - verify.
        Job thisJob = DKV.getGet(_key);
        if (thisJob._state == JobState.CANCELLED) {
          Log.info("Job cancelled by user.");
        } else {
          t.printStackTrace();
          failed(t);
          throw t;
        }
      } finally {
        // Release locks on every exit path; _model is null when creation never happened
        if (_model != null) _model.unlock(_key);
        _parms.read_unlock_frames(SharedTree.this);
        if (_model == null) Scope.exit();
        else {
          // Pass the model and its metrics keys to Scope.exit along with leaving the scope
          Scope.exit(
              _model._key,
              ModelMetrics.buildKey(_model, _parms.train()),
              ModelMetrics.buildKey(_model, _parms.valid()));
        }
      }
      tryComplete();
    }