Example #1
    // Stopping criteria
    boolean isDone(KMeansModel model, double[][] newCenters, double[][] oldCenters) {
      if (!isRunning()) return true; // Stopped/cancelled
      // Stopped after running out of iterations
      if (model._output._iterations >= _parms._max_iterations) return true;

      // Compute average change in standardized cluster centers
      if (oldCenters == null) return false; // No prior iteration, not stopping
      double average_change = 0;
      for (int clu = 0; clu < _parms._k; clu++)
        average_change +=
            hex.genmodel.GenModel.KMeans_distance(
                oldCenters[clu], newCenters[clu], _isCats, null, null);
      average_change /= _parms._k; // Average change per cluster
      model._output._avg_centroids_chg =
          ArrayUtils.copyAndFillOf(
              model._output._avg_centroids_chg,
              model._output._avg_centroids_chg.length + 1,
              average_change);
      model._output._training_time_ms =
          ArrayUtils.copyAndFillOf(
              model._output._training_time_ms,
              model._output._training_time_ms.length + 1,
              System.currentTimeMillis());
      return average_change < TOLERANCE;
    }
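
To see the stopping rule on its own, here is a minimal, self-contained sketch of the same convergence test, using plain squared Euclidean distance in place of hex.genmodel.GenModel.KMeans_distance (the helper name converged and the plain-array inputs are hypothetical, not part of the H2O API):

    static boolean converged(double[][] oldCenters, double[][] newCenters, double tolerance) {
      if (oldCenters == null) return false; // No prior iteration, not stopping
      double averageChange = 0;
      for (int clu = 0; clu < newCenters.length; clu++) {
        double sqr = 0;
        for (int d = 0; d < newCenters[clu].length; d++) {
          double diff = newCenters[clu][d] - oldCenters[clu][d];
          sqr += diff * diff; // Squared distance this centroid moved
        }
        averageChange += sqr;
      }
      averageChange /= newCenters.length; // Average change per cluster
      return averageChange < tolerance;
    }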
Example #2
 /**
  * Bulk scoring API for one row. Chunks are all compatible with the model, and the last Chunks
  * are expected to hold the final distribution and prediction. The default method just loads the
  * data into the tmp array, then calls the subclass scoring logic.
  */
 @Override
 protected double[] score0(
     double data[ /*ncols*/], double preds[ /*nclasses+1*/], double weight, double offset) {
   super.score0(data, preds, weight, offset); // These are f_k(x) in Algorithm 10.4
   if (_parms._distribution == Distributions.Family.bernoulli) {
     double f =
         preds[1]
             + _output._init_f
             + offset; // Note: class 1 probability stored in preds[1] (since we have only one
                       // tree)
     preds[2] = _parms._distribution.linkInv(f);
     preds[1] = 1.0 - preds[2];
    } else if (_parms._distribution
        == Distributions.Family.multinomial) { // Multinomial (2-class case mirrors binomial)
     if (_output.nclasses() == 2) { // 1-tree optimization for binomial
       preds[1] +=
           _output._init_f
               + offset; // offset is not yet allowed, but added here to be future-proof
       preds[2] = -preds[1];
     }
     hex.genmodel.GenModel.GBM_rescale(preds);
   } else { // Regression
     double f = preds[0] + _output._init_f + offset;
     preds[0] = _parms._distribution.linkInv(f);
   }
   return preds;
 }
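
For the bernoulli branch above, linkInv is the logistic (sigmoid) inverse link, so the raw score f is mapped to a class-1 probability. A minimal sketch of that mapping, assuming the standard logistic form (the method names below are hypothetical, not the H2O API):

    // Maps the raw GBM score f = preds[1] + _init_f + offset to a probability.
    static double sigmoid(double f) {
      return 1.0 / (1.0 + Math.exp(-f));
    }

    // Fills preds[] the same way the bernoulli branch does.
    static void bernoulliPreds(double[] preds, double f) {
      preds[2] = sigmoid(f); // P(class 1)
      preds[1] = 1.0 - preds[2]; // P(class 0)
    }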
Example #3
  // Call builder-specific score code, then correct the probabilities if necessary.
 void score2(Chunk chks[], double weight, double offset, double fs[ /*nclass*/], int row) {
   double sum = score1(chks, weight, offset, fs, row);
   if (isClassifier()) {
     if (!Double.isInfinite(sum) && sum > 0f && sum != 1f) ArrayUtils.div(fs, sum);
     if (_parms._balance_classes)
       GenModel.correctProbabilities(
           fs, _model._output._priorClassDist, _model._output._modelClassDist);
   }
 }
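
GenModel.correctProbabilities presumably undoes the effect of class balancing: each class probability is rescaled by the ratio of the true prior to the (resampled) model class distribution, then renormalized. A simplified, 0-indexed sketch of that standard correction (an assumption about the behavior, not the exact H2O signature):

    static void correctProbs(double[] fs, double[] priorDist, double[] modelDist) {
      double sum = 0;
      for (int c = 0; c < fs.length; c++) {
        fs[c] *= priorDist[c] / modelDist[c]; // Reweight by prior / oversampled fraction
        sum += fs[c];
      }
      for (int c = 0; c < fs.length; c++) fs[c] /= sum; // Renormalize to sum to 1
    }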
Example #4
 @Override
 public void map(Chunk[] cs) {
   for (int row = 0; row < cs[0]._len; row++) {
     double[] values = new double[cs.length];
     // fetch the data - using consistent NA and categorical data handling (same as for training)
     data(values, cs, row, _means, _mults, _modes);
     // compute the distance from the (standardized) cluster centroids
     _tss += hex.genmodel.GenModel.KMeans_distance(_gc, values, _isCats, null, null);
   }
 }
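
This task accumulates the total sum of squares (_tss): the squared distance of every (standardized) row to the global centroid _gc. A self-contained sketch of the same accumulation over a plain double[][] matrix, with Euclidean distance standing in for KMeans_distance (helper name hypothetical):

    static double totalSumOfSquares(double[][] rows, double[] globalCentroid) {
      double tss = 0;
      for (double[] row : rows) {
        double sqr = 0;
        for (int d = 0; d < row.length; d++) {
          double diff = row[d] - globalCentroid[d];
          sqr += diff * diff;
        }
        tss += sqr; // Squared distance of this row to the global centroid
      }
      return tss;
    }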
Example #5
  /** Return the nearest of N cluster centers/centroids, along with the squared distance. */
 private static ClusterDist closest(
     double[][] centers, double[] point, String[][] isCats, ClusterDist cd, int count) {
   int min = -1;
   double minSqr = Double.MAX_VALUE;
   for (int cluster = 0; cluster < count; cluster++) {
     double sqr =
         hex.genmodel.GenModel.KMeans_distance(centers[cluster], point, isCats, null, null);
     if (sqr < minSqr) { // Record nearest cluster
       min = cluster;
       minSqr = sqr;
     }
   }
   cd._cluster = min; // Record nearest cluster
   cd._dist = minSqr; // Record square-distance
   return cd; // Return for flow-coding
 }
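
Returning cd enables the flow-coding style noted above: callers can reuse one scratch ClusterDist across rows and chain off the returned reference. A usage sketch (the surrounding setup of rows, centers, and isCats is hypothetical):

    static int[] assignClusters(double[][] rows, double[][] centers, String[][] isCats) {
      ClusterDist cd = new ClusterDist(); // Scratch result object, reused for every row
      int[] assignment = new int[rows.length];
      for (int r = 0; r < rows.length; r++)
        assignment[r] = closest(centers, rows[r], isCats, cd, centers.length)._cluster;
      return assignment;
    }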
Example #6
  // Note: For small probabilities, the product may underflow to zero. We circumvent this by
  // taking logs.
  @Override
  protected double[] score0(double[] data, double[] preds) {
    double[] nums = new double[_output._levels.length]; // log(p(x,y)) for all levels of y
    assert preds.length
        == (_output._levels.length + 1); // Note: First column of preds is predicted response class

    // Compute joint probability of predictors for every response class
    for (int rlevel = 0; rlevel < _output._levels.length; rlevel++) {
      // Take logs to avoid underflow: p(x,y) = p(x|y)*p(y) -> log(p(x,y)) = log(p(x|y)) + log(p(y))
      nums[rlevel] = Math.log(_output._apriori_raw[rlevel]);

      for (int col = 0; col < _output._ncats; col++) {
        if (Double.isNaN(data[col])) continue; // Skip predictor in joint x_1,...,x_m if NA
        int plevel = (int) data[col];
        double prob =
            plevel < _output._pcond_raw[col][rlevel].length
                ? _output._pcond_raw[col][rlevel][plevel]
                : _parms._laplace
                    / ((double) _output._rescnt[rlevel]
                        + _parms._laplace * _output._domains[col].length);
        // Laplace smoothing if the predictor level was unobserved in the training set
        nums[rlevel] +=
            Math.log(
                prob <= _parms._eps_prob
                    ? _parms._min_prob
                    : prob); // log(p(x|y)) = \sum_{j = 1}^m log(p(x_j|y))
      }

      // For numeric predictors, assume Gaussian distribution with sample mean and variance from
      // model
      for (int col = _output._ncats; col < data.length; col++) {
        if (Double.isNaN(data[col])) continue; // Skip predictor in joint x_1,...,x_m if NA
        double x = data[col];
        double mean =
            Double.isNaN(_output._pcond_raw[col][rlevel][0])
                ? 0
                : _output._pcond_raw[col][rlevel][0];
        double stddev =
            Double.isNaN(_output._pcond_raw[col][rlevel][1])
                ? 1.0
                : (_output._pcond_raw[col][rlevel][1] <= _parms._eps_sdev
                    ? _parms._min_sdev
                    : _output._pcond_raw[col][rlevel][1]);
        // double prob = new NormalDistribution(mean, stddev).density(x); // equivalent but slower
        double prob =
            Math.exp(-((x - mean) * (x - mean)) / (2. * stddev * stddev))
                / (stddev * Math.sqrt(2. * Math.PI)); // faster
        nums[rlevel] += Math.log(prob <= _parms._eps_prob ? _parms._min_prob : prob);
      }
    }

    // Numerically unstable:
    //   p(x,y) = exp(log(p(x,y))),  p(x) = \sum_{r = levels of y} exp(log(p(x,y = r)))
    //   -> p(y|x) = p(x,y)/p(x)
    // Instead, we rewrite in a more stable form:
    //   p(y|x) = p(x,y)/p(x) = exp(log(p(x,y))) / (\sum_{r = levels of y} exp(log(p(x,y = r))))
    //          = 1 / ( exp(-log(p(x,y))) * \sum_{r = levels of y} exp(log(p(x,y = r))) )
    //          = 1 / ( \sum_{r = levels of y} exp(log(p(x,y = r)) - log(p(x,y))) )
    for (int i = 0; i < nums.length; i++) {
      double sum = 0;
      for (int j = 0; j < nums.length; j++) sum += Math.exp(nums[j] - nums[i]);
      preds[i + 1] = 1 / sum;
    }

    // Select class with highest conditional probability
    preds[0] = GenModel.getPrediction(preds, _output._priorClassDist, data, defaultThreshold());
    return preds;
  }
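
The normalization loop at the end is the key numerical trick: it computes p(y = i | x) without ever exponentiating a raw log-joint value on its own, only differences of logs. A self-contained sketch of just that step (the helper name is hypothetical):

    // Given log-joint scores nums[i] = log(p(x, y = i)), return the posteriors p(y = i | x).
    static double[] posteriorsFromLogJoint(double[] nums) {
      double[] post = new double[nums.length];
      for (int i = 0; i < nums.length; i++) {
        double sum = 0;
        for (int j = 0; j < nums.length; j++)
          sum += Math.exp(nums[j] - nums[i]); // The j == i term is exp(0) = 1, so sum >= 1
        post[i] = 1 / sum; // = exp(nums[i]) / \sum_j exp(nums[j])
      }
      return post;
    }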