// Stopping criteria
boolean isDone(KMeansModel model, double[][] newCenters, double[][] oldCenters) {
  if (!isRunning()) return true; // Stopped/cancelled
  // Stopped for running out of iterations
  if (model._output._iterations >= _parms._max_iterations) return true;
  // Compute average change in standardized cluster centers
  if (oldCenters == null) return false; // No prior iteration, not stopping
  double average_change = 0;
  for (int clu = 0; clu < _parms._k; clu++)
    average_change +=
        hex.genmodel.GenModel.KMeans_distance(oldCenters[clu], newCenters[clu], _isCats, null, null);
  average_change /= _parms._k; // Average change per cluster
  model._output._avg_centroids_chg =
      ArrayUtils.copyAndFillOf(
          model._output._avg_centroids_chg,
          model._output._avg_centroids_chg.length + 1,
          average_change);
  model._output._training_time_ms =
      ArrayUtils.copyAndFillOf(
          model._output._training_time_ms,
          model._output._training_time_ms.length + 1,
          System.currentTimeMillis());
  return average_change < TOLERANCE;
}
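/*
 * A minimal standalone sketch of the same convergence test, assuming plain squared Euclidean
 * distance on purely numeric centers (the real KMeans_distance also handles categorical columns).
 * The class name, TOLERANCE value, and helper below are illustrative, not part of the H2O API.
 */
class ConvergenceSketch {
  static final double TOLERANCE = 1e-6; // assumed threshold, standing in for TOLERANCE above

  // Average squared Euclidean change per centroid between two iterations
  static double avgCentroidChange(double[][] oldCenters, double[][] newCenters) {
    double total = 0;
    for (int c = 0; c < oldCenters.length; c++) {
      double sqr = 0;
      for (int d = 0; d < oldCenters[c].length; d++) {
        double diff = newCenters[c][d] - oldCenters[c][d];
        sqr += diff * diff;
      }
      total += sqr;
    }
    return total / oldCenters.length;
  }

  public static void main(String[] args) {
    double[][] prev = {{0.0, 0.0}, {1.0, 1.0}};
    double[][] next = {{0.0, 0.0}, {1.0, 1.0001}}; // one centroid barely moved
    double change = avgCentroidChange(prev, next);
    System.out.println("avg change = " + change + ", converged = " + (change < TOLERANCE));
  }
}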
/**
 * Bulk scoring API for one row. Chunks are all compatible with the model; the last Chunks are
 * expected to hold the final distribution and prediction. The default method just loads the data
 * into the tmp array, then calls the subclass scoring logic.
 */
@Override
protected double[] score0(double data[/*ncols*/], double preds[/*nclasses+1*/], double weight, double offset) {
  super.score0(data, preds, weight, offset); // These are f_k(x) in Algorithm 10.4
  if (_parms._distribution == Distributions.Family.bernoulli) {
    // Note: class 1 probability is stored in preds[1] (since we have only one tree)
    double f = preds[1] + _output._init_f + offset;
    preds[2] = _parms._distribution.linkInv(f);
    preds[1] = 1.0 - preds[2];
  } else if (_parms._distribution == Distributions.Family.multinomial) { // Kept the initial prediction for binomial
    if (_output.nclasses() == 2) { // 1-tree optimization for binomial
      preds[1] += _output._init_f + offset; // offset is not yet allowed, but added here to be future-proof
      preds[2] = -preds[1];
    }
    hex.genmodel.GenModel.GBM_rescale(preds);
  } else { // Regression
    double f = preds[0] + _output._init_f + offset;
    preds[0] = _parms._distribution.linkInv(f);
  }
  return preds;
}
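/*
 * Sketch of the two probability transforms used above, assuming the Bernoulli link inverse is the
 * standard sigmoid and that GBM_rescale normalizes exponentiated per-class scores (a softmax; the
 * real implementation may differ in details such as max-subtraction for stability). This
 * illustrates the math only, not the H2O implementation.
 */
class GbmLinkSketch {
  // Inverse logit link: maps a raw score f to a class-1 probability
  static double sigmoid(double f) {
    return 1.0 / (1.0 + Math.exp(-f));
  }

  // Softmax over per-class scores stored in preds[1..nclasses]; preds[0] is reserved for the label
  static void rescale(double[] preds) {
    double sum = 0;
    for (int k = 1; k < preds.length; k++) {
      preds[k] = Math.exp(preds[k]);
      sum += preds[k];
    }
    for (int k = 1; k < preds.length; k++) preds[k] /= sum;
  }

  public static void main(String[] args) {
    System.out.println("sigmoid(0) = " + sigmoid(0)); // 0.5
    double[] preds = {0, 1.0, 2.0, 3.0};
    rescale(preds);
    System.out.println(preds[1] + " " + preds[2] + " " + preds[3]); // probabilities summing to 1
  }
}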
// Call builder-specific score code, then correct the probabilities if necessary.
void score2(Chunk chks[], double weight, double offset, double fs[/*nclass*/], int row) {
  double sum = score1(chks, weight, offset, fs, row);
  if (isClassifier()) {
    if (!Double.isInfinite(sum) && sum > 0f && sum != 1f) ArrayUtils.div(fs, sum);
    if (_parms._balance_classes)
      GenModel.correctProbabilities(fs, _model._output._priorClassDist, _model._output._modelClassDist);
  }
}
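/*
 * Sketch of the class-balance correction applied when balance_classes is on, assuming the
 * standard prior-rescaling rule: multiply each class probability by (true prior / class
 * distribution the model was trained on) and renormalize. Illustrative only; see
 * GenModel.correctProbabilities for the actual implementation.
 */
class BalanceSketch {
  static void correct(double[] fs, double[] prior, double[] modelDist) {
    double sum = 0;
    for (int k = 0; k < fs.length; k++) {
      fs[k] *= prior[k] / modelDist[k]; // undo the artificial over/under-sampling
      sum += fs[k];
    }
    for (int k = 0; k < fs.length; k++) fs[k] /= sum; // renormalize to a distribution
  }

  public static void main(String[] args) {
    double[] fs = {0.5, 0.5};        // model's estimate under balanced training classes
    double[] prior = {0.9, 0.1};     // true class frequencies
    double[] modelDist = {0.5, 0.5}; // class frequencies the model actually saw
    correct(fs, prior, modelDist);
    System.out.println(fs[0] + " " + fs[1]); // 0.9 0.1
  }
}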
@Override
public void map(Chunk[] cs) {
  for (int row = 0; row < cs[0]._len; row++) {
    double[] values = new double[cs.length];
    // Fetch the data, using consistent NA and categorical data handling (same as for training)
    data(values, cs, row, _means, _mults, _modes);
    // Accumulate the squared distance from the (standardized) grand centroid into the total sum of squares
    _tss += hex.genmodel.GenModel.KMeans_distance(_gc, values, _isCats, null, null);
  }
}
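/*
 * Minimal illustration of the total-sum-of-squares accumulation above, assuming purely numeric,
 * unstandardized data and plain squared Euclidean distance to the grand (overall) centroid. In
 * the real task, rows are standardized and categoricals are handled inside KMeans_distance.
 */
class TotSSSketch {
  public static void main(String[] args) {
    double[][] rows = {{1, 2}, {3, 4}, {5, 6}};

    double[] gc = new double[rows[0].length]; // grand centroid = per-column means
    for (double[] r : rows)
      for (int d = 0; d < r.length; d++) gc[d] += r[d] / rows.length;

    double tss = 0; // total sum of squares: spread of the data around the grand centroid
    for (double[] r : rows)
      for (int d = 0; d < r.length; d++) tss += (r[d] - gc[d]) * (r[d] - gc[d]);

    System.out.println("TSS = " + tss); // ~16.0 for this data
  }
}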
/** Return the nearest of N cluster centers/centroids, together with the squared distance to it. */
private static ClusterDist closest(double[][] centers, double[] point, String[][] isCats, ClusterDist cd, int count) {
  int min = -1;
  double minSqr = Double.MAX_VALUE;
  for (int cluster = 0; cluster < count; cluster++) {
    double sqr = hex.genmodel.GenModel.KMeans_distance(centers[cluster], point, isCats, null, null);
    if (sqr < minSqr) { // Record nearest cluster
      min = cluster;
      minSqr = sqr;
    }
  }
  cd._cluster = min;  // Record nearest cluster
  cd._dist = minSqr;  // Record square-distance
  return cd;          // Return for flow-coding
}
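/*
 * Standalone sketch of the nearest-centroid search, assuming numeric-only points and squared
 * Euclidean distance (the real code delegates to KMeans_distance, which also covers categorical
 * columns). ClusterDist here is an illustrative stand-in for the H2O helper class.
 */
class ClosestSketch {
  static class ClusterDist { int _cluster; double _dist; }

  static double sqrDist(double[] a, double[] b) {
    double s = 0;
    for (int d = 0; d < a.length; d++) s += (a[d] - b[d]) * (a[d] - b[d]);
    return s;
  }

  static ClusterDist closest(double[][] centers, double[] point, ClusterDist cd) {
    cd._cluster = -1;
    cd._dist = Double.MAX_VALUE;
    for (int c = 0; c < centers.length; c++) {
      double sqr = sqrDist(centers[c], point);
      if (sqr < cd._dist) { cd._cluster = c; cd._dist = sqr; } // record nearest cluster so far
    }
    return cd; // returned for call chaining, as in the original
  }

  public static void main(String[] args) {
    double[][] centers = {{0, 0}, {10, 10}};
    ClusterDist cd = closest(centers, new double[]{9, 9}, new ClusterDist());
    System.out.println("cluster " + cd._cluster + ", sqr dist " + cd._dist); // cluster 1, 2.0
  }
}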
// Note: For small probabilities, the product may end up zero due to underflow.
// We circumvent this by taking logs.
@Override
protected double[] score0(double[] data, double[] preds) {
  double[] nums = new double[_output._levels.length]; // log(p(x,y)) for all levels of y
  assert preds.length == (_output._levels.length + 1); // Note: First column of preds is predicted response class

  // Compute joint probability of predictors for every response class
  for (int rlevel = 0; rlevel < _output._levels.length; rlevel++) {
    // Take logs to avoid underflow: p(x,y) = p(x|y)*p(y) -> log(p(x,y)) = log(p(x|y)) + log(p(y))
    nums[rlevel] = Math.log(_output._apriori_raw[rlevel]);

    for (int col = 0; col < _output._ncats; col++) {
      if (Double.isNaN(data[col])) continue; // Skip predictor in joint x_1,...,x_m if NA
      int plevel = (int) data[col];
      // Laplace smoothing if the predictor level was unobserved in the training set
      double prob = plevel < _output._pcond_raw[col][rlevel].length
          ? _output._pcond_raw[col][rlevel][plevel]
          : _parms._laplace / ((double) _output._rescnt[rlevel] + _parms._laplace * _output._domains[col].length);
      // log(p(x|y)) = \sum_{j=1}^m log(p(x_j|y))
      nums[rlevel] += Math.log(prob <= _parms._eps_prob ? _parms._min_prob : prob);
    }

    // For numeric predictors, assume a Gaussian distribution with sample mean and variance from the model
    for (int col = _output._ncats; col < data.length; col++) {
      if (Double.isNaN(data[col])) continue; // Skip predictor in joint x_1,...,x_m if NA
      double x = data[col];
      double mean = Double.isNaN(_output._pcond_raw[col][rlevel][0]) ? 0 : _output._pcond_raw[col][rlevel][0];
      double stddev = Double.isNaN(_output._pcond_raw[col][rlevel][1])
          ? 1.0
          : (_output._pcond_raw[col][rlevel][1] <= _parms._eps_sdev
              ? _parms._min_sdev
              : _output._pcond_raw[col][rlevel][1]);
      // double prob = new NormalDistribution(mean, stddev).density(data[col]); // slower
      double prob = Math.exp(-((x - mean) * (x - mean)) / (2. * stddev * stddev)) / (stddev * Math.sqrt(2. * Math.PI)); // faster
      nums[rlevel] += Math.log(prob <= _parms._eps_prob ? _parms._min_prob : prob);
    }
  }

  // Numerically unstable:
  //   p(x,y) = exp(log(p(x,y))), p(x) = \sum_{r = levels of y} exp(log(p(x,y = r))) -> p(y|x) = p(x,y)/p(x)
  // Instead, we rewrite using a more stable form:
  //   p(y|x) = p(x,y)/p(x) = exp(log(p(x,y))) / \sum_{r = levels of y} exp(log(p(x,y = r)))
  //          = 1 / ( exp(-log(p(x,y))) * \sum_{r = levels of y} exp(log(p(x,y = r))) )
  //          = 1 / \sum_{r = levels of y} exp( log(p(x,y = r)) - log(p(x,y)) )
  for (int i = 0; i < nums.length; i++) {
    double sum = 0;
    for (int j = 0; j < nums.length; j++) sum += Math.exp(nums[j] - nums[i]);
    preds[i + 1] = 1 / sum;
  }

  // Select the class with the highest conditional probability
  preds[0] = GenModel.getPrediction(preds, _output._priorClassDist, data, defaultThreshold());
  return preds;
}
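/*
 * Sketch of the stable normalization at the end of score0: turning per-class log joint
 * probabilities into posteriors without exponentiating any large-magnitude log directly. It
 * demonstrates the identity 1 / \sum_j exp(logp[j] - logp[i]) = exp(logp[i]) / \sum_j exp(logp[j])
 * on values whose direct exp() would underflow to zero.
 */
class LogSumExpSketch {
  public static void main(String[] args) {
    double[] logp = {-1050.0, -1051.0, -1052.5}; // log p(x, y=r); Math.exp of these is 0.0
    double[] posterior = new double[logp.length];
    for (int i = 0; i < logp.length; i++) {
      double sum = 0;
      for (int j = 0; j < logp.length; j++) sum += Math.exp(logp[j] - logp[i]); // small differences only
      posterior[i] = 1 / sum;
    }
    double total = 0;
    for (double p : posterior) total += p;
    System.out.println(java.util.Arrays.toString(posterior) + " -> total " + total);
  }
}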