/**
 * Initialization of neural net weights cf.
 * http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2010_GlorotB10.pdf
 */
private void randomizeWeights() {
  for (int w = 0; w < dense_row_weights.length; ++w) {
    final Random rng =
        water.util.RandomUtils.getRNG(get_params()._seed + 0xBAD5EED + w + 1); // to match NeuralNet behavior
    final double range = Math.sqrt(6. / (units[w] + units[w + 1]));
    for (int i = 0; i < get_weights(w).rows(); i++) {
      for (int j = 0; j < get_weights(w).cols(); j++) {
        if (get_params()._initial_weight_distribution
            == DeepLearningParameters.InitialWeightDistribution.UniformAdaptive) {
          // cf. http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2010_GlorotB10.pdf
          if (w == dense_row_weights.length - 1 && _classification)
            // Softmax might need an extra factor 4, since it's like a sigmoid
            get_weights(w).set(i, j, (float) (4. * uniformDist(rng, -range, range)));
          else
            get_weights(w).set(i, j, (float) uniformDist(rng, -range, range));
        } else if (get_params()._initial_weight_distribution
            == DeepLearningParameters.InitialWeightDistribution.Uniform) {
          get_weights(w).set(i, j, (float) uniformDist(
              rng, -get_params()._initial_weight_scale, get_params()._initial_weight_scale));
        } else if (get_params()._initial_weight_distribution
            == DeepLearningParameters.InitialWeightDistribution.Normal) {
          get_weights(w).set(i, j, (float) (rng.nextGaussian() * get_params()._initial_weight_scale));
        }
      }
    }
  }
}
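// ---------------------------------------------------------------------------------------------
// Illustrative sketch only (hypothetical class/method names, not part of the H2O sources): the
// UniformAdaptive branch above draws each weight uniformly from [-limit, limit] with
// limit = sqrt(6 / (fan_in + fan_out)) (Glorot & Bengio, AISTATS 2010); the output layer of a
// classifier gets an extra factor of 4 because softmax behaves like a sigmoid.
// ---------------------------------------------------------------------------------------------
final class GlorotUniformSketch {
  /** Returns a fanIn x fanOut weight matrix initialized with the Glorot/Xavier uniform rule. */
  static float[][] glorotUniform(int fanIn, int fanOut, long seed, boolean softmaxOutput) {
    final java.util.Random rng = new java.util.Random(seed);
    final double limit = Math.sqrt(6. / (fanIn + fanOut));
    final double scale = softmaxOutput ? 4. : 1.; // optional softmax adjustment, as above
    final float[][] w = new float[fanIn][fanOut];
    for (int i = 0; i < fanIn; i++)
      for (int j = 0; j < fanOut; j++)
        w[i][j] = (float) (scale * limit * (2. * rng.nextDouble() - 1.)); // uniform in [-limit, limit)
    return w;
  }
}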
@Override
public void map(Chunk[] cs) {
  int N = cs.length - (_hasWeight ? 1 : 0);
  double[] values = new double[N];
  ArrayList<double[]> list = new ArrayList<>();
  Random rand = RandomUtils.getRNG(_seed + cs[0].start());
  ClusterDist cd = new ClusterDist();

  for (int row = 0; row < cs[0]._len; row++) {
    data(values, cs, row, _means, _mults, _modes);
    double sqr = minSqr(_centers, values, _isCats, cd);
    if (_probability * sqr > rand.nextDouble() * _sqr) list.add(values.clone());
  }

  _sampled = new double[list.size()][];
  list.toArray(_sampled);
  _centers = null;
  _means = _mults = null;
  _modes = null;
}
// Initialize cluster centers
double[][] initial_centers(
    KMeansModel model,
    final Vec[] vecs,
    final double[] means,
    final double[] mults,
    final int[] modes) {

  // Categoricals use a different distance metric than numeric columns.
  model._output._categorical_column_count = 0;
  _isCats = new String[vecs.length][];
  for (int v = 0; v < vecs.length; v++) {
    _isCats[v] = vecs[v].isCategorical() ? new String[0] : null;
    if (_isCats[v] != null) model._output._categorical_column_count++;
  }

  Random rand = water.util.RandomUtils.getRNG(_parms._seed - 1);
  double[][] centers; // Cluster centers
  if (null != _parms._user_points) { // User-specified starting points
    Frame user_points = _parms._user_points.get();
    int numCenters = (int) user_points.numRows();
    int numCols = model._output.nfeatures();
    centers = new double[numCenters][numCols];
    Vec[] centersVecs = user_points.vecs();
    // Get the centers and standardize them if requested
    for (int r = 0; r < numCenters; r++) {
      for (int c = 0; c < numCols; c++) {
        centers[r][c] = centersVecs[c].at(r);
        centers[r][c] = data(centers[r][c], c, means, mults, modes);
      }
    }
  } else { // Random, Furthest, or PlusPlus initialization
    if (_parms._init == Initialization.Random) {
      // Initialize all cluster centers to random rows
      centers = new double[_parms._k][model._output.nfeatures()];
      for (double[] center : centers) randomRow(vecs, rand, center, means, mults, modes);
    } else {
      centers = new double[1][model._output.nfeatures()];
      // Initialize first cluster center to random row
      randomRow(vecs, rand, centers[0], means, mults, modes);

      model._output._iterations = 0;
      while (model._output._iterations < 5) {
        // Sum of squared distances to the nearest cluster center
        SumSqr sqr = new SumSqr(centers, means, mults, modes, _isCats).doAll(vecs);

        // Sample with probability proportional to the squared distance
        Sampler sampler =
            new Sampler(
                    centers, means, mults, modes, _isCats, sqr._sqr, _parms._k * 3, _parms._seed,
                    hasWeightCol())
                .doAll(vecs);
        centers = ArrayUtils.append(centers, sampler._sampled);

        // Fill in sample centers into the model
        if (!isRunning()) return null; // Stopped/cancelled
        model._output._centers_raw = destandardize(centers, _isCats, means, mults);
        model._output._tot_withinss = sqr._sqr / _train.numRows();

        model._output._iterations++; // One iteration done
        // Make early version of model visible, but don't update progress using update(1)
        model.update(_key);
      }
      // Recluster down to k cluster centers
      centers = recluster(centers, rand, _parms._k, _parms._init, _isCats);
      model._output._iterations = 0; // Reset iteration count
    }
  }
  return centers;
}
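// ---------------------------------------------------------------------------------------------
// Illustrative sketch only (hypothetical names, plain arrays instead of Vecs/Chunks): the
// PlusPlus/Furthest path above is k-means||-style. Starting from one random row, each of a few
// rounds keeps a row with probability min(1, 3k * d^2 / cost), where d^2 is its squared distance
// to the nearest chosen center and cost is the sum of d^2 over all rows (the same acceptance
// rule as Sampler.map above); the oversampled set is then reclustered down to k centers.
// ---------------------------------------------------------------------------------------------
final class KMeansParallelInitSketch {
  /** Squared distance from x to the nearest center. */
  static double minSqr(java.util.List<double[]> centers, double[] x) {
    double best = Double.MAX_VALUE;
    for (double[] c : centers) {
      double s = 0;
      for (int i = 0; i < x.length; i++) { double d = c[i] - x[i]; s += d * d; }
      best = Math.min(best, s);
    }
    return best;
  }

  /** Returns the oversampled candidate centers; a final pass (e.g. k-means) reduces them to k. */
  static java.util.List<double[]> oversample(double[][] data, int k, int rounds, long seed) {
    java.util.Random rng = new java.util.Random(seed);
    java.util.List<double[]> centers = new java.util.ArrayList<>();
    centers.add(data[rng.nextInt(data.length)].clone()); // first center: a random row
    for (int r = 0; r < rounds; r++) {
      double cost = 0; // total squared distance of all rows to their nearest center
      for (double[] x : data) cost += minSqr(centers, x);
      for (double[] x : data) // accept with probability proportional to squared distance
        if (3. * k * minSqr(centers, x) > rng.nextDouble() * cost) centers.add(x.clone());
    }
    return centers;
  }
}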
/**
 * Extracts the values, applies regularization to numerics, adds appropriate offsets to
 * categoricals, and adapts response according to the CaseMode/CaseValue if set.
 */
@Override
public final void map(Chunk[] chunks, NewChunk[] outputs) {
  if (_jobKey != null && !Job.isRunning(_jobKey)) throw new JobCancelledException();
  final int nrows = chunks[0]._len;
  final long offset = chunks[0].start();
  boolean doWork = chunkInit();
  if (!doWork) return;
  final boolean obs_weights = _dinfo._weights && !_fr.vecs()[_dinfo.weightChunkId()].isConst();
  final double global_weight_sum =
      obs_weights ? _fr.vecs()[_dinfo.weightChunkId()].mean() * _fr.numRows() : 0;

  DataInfo.Row row = _dinfo.newDenseRow();
  double[] weight_map = null;
  double relative_chunk_weight = 1;
  // TODO: store node-local helper arrays in _dinfo -> avoid re-allocation and construction
  if (obs_weights) {
    weight_map = new double[nrows];
    double weight_sum = 0;
    for (int i = 0; i < nrows; ++i) {
      row = _dinfo.extractDenseRow(chunks, i, row);
      weight_sum += row.weight;
      weight_map[i] = weight_sum;
      assert (i == 0 || row.weight == 0 || weight_map[i] > weight_map[i - 1]);
    }
    if (weight_sum > 0) {
      ArrayUtils.div(weight_map, weight_sum); // normalize to 0...1
      relative_chunk_weight = global_weight_sum * nrows / _fr.numRows() / weight_sum;
    } else return; // nothing to do here - all rows have 0 weight
  }

  // Example:
  // _useFraction = 0.8 -> 1 repeat with fraction = 0.8
  // _useFraction = 1.0 -> 1 repeat with fraction = 1.0
  // _useFraction = 1.1 -> 2 repeats with fraction = 0.55
  // _useFraction = 2.1 -> 3 repeats with fraction = 0.7
  // _useFraction = 3.0 -> 3 repeats with fraction = 1.0
  final int repeats = (int) Math.ceil(_useFraction * relative_chunk_weight);
  final float fraction = (float) (_useFraction * relative_chunk_weight) / repeats;
  assert (fraction <= 1.0);

  final boolean sample = (fraction < 0.999 || obs_weights || _shuffle);
  final Random skip_rng =
      sample
          ? RandomUtils.getRNG(
              (0x8734093502429734L + _seed + offset) * (_iteration + 0x9823423497823423L))
          : null;

  long num_processed_rows = 0;
  for (int rep = 0; rep < repeats; ++rep) {
    for (int row_idx = 0; row_idx < nrows; ++row_idx) {
      int r = sample ? -1 : 0;
      // only train with a given number of training samples (fraction*nrows)
      if (sample && !obs_weights && skip_rng.nextDouble() > fraction) continue;
      if (obs_weights && num_processed_rows % 2 == 0) {
        // every second row is randomly sampled -> that way we won't "forget" rare rows
        // importance sampling based on inverse of cumulative distribution
        double key = skip_rng.nextDouble();
        r = Arrays.binarySearch(weight_map, 0, nrows, key);
        // Log.info(Arrays.toString(weight_map));
        // Log.info("key: " + key + " idx: " + (r >= 0 ? r : (-r-1)));
        if (r < 0) r = -r - 1;
        assert (r == 0 || weight_map[r] > weight_map[r - 1]);
      } else if (r == -1) {
        do {
          r = skip_rng.nextInt(nrows); // random sampling (with replacement)
        }
        // if we have weights, and we did the %2 skipping above, then we need to find an
        // alternate row with non-zero weight
        while (obs_weights
            && ((r == 0 && weight_map[0] == 0) || (r > 0 && weight_map[r] == weight_map[r - 1])));
      } else {
        assert (!obs_weights);
        r = row_idx; // linear scan - slightly faster
      }
      assert (r >= 0 && r <= nrows);

      row = _dinfo.extractDenseRow(chunks, r, row);
      if (!row.bad) {
        assert (row.weight > 0); // check that we never process a row that was held out via row.weight = 0
        long seed = offset + rep * nrows + r;
        if (outputs != null && outputs.length > 0) processRow(seed++, row, outputs);
        else processRow(seed++, row);
      }
      num_processed_rows++;
    }
  }
  assert (fraction != 1 || num_processed_rows == repeats * nrows);
  chunkDone(num_processed_rows);
}
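// ---------------------------------------------------------------------------------------------
// Illustrative sketch only (hypothetical names): two ideas from the map() above in isolation.
// A use-fraction f (possibly > 1, possibly rescaled by the relative chunk weight) is split into
// ceil(f) passes over the chunk, each sampling f / ceil(f) of the rows; with observation weights,
// a row index is drawn by inverse-CDF sampling on the normalized cumulative weights, which is
// what the Arrays.binarySearch call on weight_map does.
// ---------------------------------------------------------------------------------------------
final class RowSamplingSketch {
  static int repeats(double useFraction) { return (int) Math.ceil(useFraction); }

  static double fraction(double useFraction) { return useFraction / repeats(useFraction); }

  /** Draws a row index with probability proportional to its weight (cdf normalized to end at 1). */
  static int sampleByWeight(double[] cdf, java.util.Random rng) {
    int r = java.util.Arrays.binarySearch(cdf, rng.nextDouble());
    return r >= 0 ? r : -r - 1; // binarySearch returns -(insertion point) - 1 when the key is absent
  }

  public static void main(String[] args) {
    // useFraction = 2.1 -> 3 repeats, each covering a fraction of 0.7, matching the table above
    System.out.println(repeats(2.1) + " repeats with fraction " + fraction(2.1));
    double[] cdf = {0.1, 0.4, 1.0}; // three rows with (normalized) weights 0.1, 0.3 and 0.6
    System.out.println("sampled row " + sampleByWeight(cdf, new java.util.Random(42)));
  }
}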