public GLMModelV3 make_model(int version, MakeGLMModelV3 args) {
  GLMModel model = DKV.getGet(args.model.key());
  if (model == null) throw new IllegalArgumentException("missing source model " + args.model);
  String[] names = model._output.coefficientNames();
  Map<String, Double> coefs = model.coefficients();
  for (int i = 0; i < args.names.length; ++i) coefs.put(args.names[i], args.beta[i]);
  double[] beta = model.beta().clone();
  for (int i = 0; i < beta.length; ++i) beta[i] = coefs.get(names[i]);
  GLMModel m =
      new GLMModel(
          args.dest != null ? args.dest.key() : Key.make(),
          model._parms,
          null,
          new double[] {.5},
          Double.NaN,
          Double.NaN,
          -1);
  DataInfo dinfo = model.dinfo();
  dinfo.setPredictorTransform(TransformType.NONE);
  // GLMOutput(DataInfo dinfo, String[] column_names, String[][] domains,
  //           String[] coefficient_names, boolean binomial)
  m._output =
      new GLMOutput(
          model.dinfo(),
          model._output._names,
          model._output._domains,
          model._output.coefficientNames(),
          model._output._binomial,
          beta);
  DKV.put(m._key, m);
  GLMModelV3 res = new GLMModelV3();
  res.fillFromImpl(m);
  return res;
}
/**
 * Helper to create the DataInfo object from training/validation frames and the DL parameters
 *
 * @param train Training frame
 * @param valid Validation frame
 * @param parms Model parameters
 * @param nClasses Number of response levels (1: regression, >=2: classification)
 * @return DataInfo
 */
static DataInfo makeDataInfo(
    Frame train, Frame valid, DeepLearningParameters parms, int nClasses) {
  double x = 0.782347234;
  boolean identityLink =
      new Distribution(parms._distribution, parms._tweedie_power).link(x) == x;
  DataInfo dinfo =
      new DataInfo(
          train,
          valid,
          parms._autoencoder ? 0 : 1, // nResponses
          parms._autoencoder
              || parms._use_all_factor_levels, // use all FactorLevels for auto-encoder
          parms._standardize
              ? (parms._autoencoder
                  ? DataInfo.TransformType.NORMALIZE
                  : parms._sparse
                      ? DataInfo.TransformType.DESCALE
                      : DataInfo.TransformType.STANDARDIZE)
              : DataInfo.TransformType.NONE, // transform predictors
          !parms._standardize || train.lastVec().isCategorical()
              ? DataInfo.TransformType.NONE
              : identityLink
                  ? DataInfo.TransformType.STANDARDIZE
                  : DataInfo.TransformType.NONE, // transform response for regression with identity link
          parms._missing_values_handling
              == DeepLearningParameters.MissingValuesHandling.Skip, // whether to skip missing
          false, // do not replace NAs in numeric cols with mean
          true, // always add a bucket for missing values
          parms._weights_column != null, // observation weights
          parms._offset_column != null,
          parms._fold_column != null);
  // Checks and adjustments:
  // 1) observation weights (adjust mean/sigmas for predictors and response)
  // 2) NAs (check that there's enough rows left)
  GLMTask.YMUTask ymt =
      new GLMTask.YMUTask(
              dinfo, nClasses, true, !parms._autoencoder && nClasses == 1, false, !parms._autoencoder)
          .doAll(dinfo._adaptedFrame);
  if (ymt._wsum == 0
      && parms._missing_values_handling == DeepLearningParameters.MissingValuesHandling.Skip)
    throw new H2OIllegalArgumentException(
        "No rows left in the dataset after filtering out rows with missing values. Ignore columns with many NAs or set missing_values_handling to 'MeanImputation'.");
  if (parms._weights_column != null && parms._offset_column != null) {
    Log.warn(
        "Combination of offset and weights can lead to slight differences because Rollupstats aren't weighted - need to re-calculate weighted mean/sigma of the response including offset terms.");
  }
  if (parms._weights_column != null
      && parms._offset_column == null /*FIXME: offset not yet implemented*/) {
    dinfo.updateWeightedSigmaAndMean(ymt._basicStats.sigma(), ymt._basicStats.mean());
    if (nClasses == 1)
      dinfo.updateWeightedSigmaAndMeanForResponse(
          ymt._basicStatsResponse.sigma(), ymt._basicStatsResponse.mean());
  }
  return dinfo;
}
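// Illustrative, self-contained sketch (hypothetical helper, not H2O code): the identityLink flag
// above is determined by probing the link function at an arbitrary point and checking whether it
// maps to itself. Assuming only that the link is deterministic, the same probe idea can be shown
// with a plain functional interface.
import java.util.function.DoubleUnaryOperator;

class IdentityLinkProbeSketch {
  // Returns true if the given link behaves as the identity at the probe point.
  static boolean isIdentityLink(DoubleUnaryOperator link) {
    double probe = 0.782347234; // same arbitrary probe value used above
    return link.applyAsDouble(probe) == probe;
  }

  public static void main(String[] args) {
    System.out.println(isIdentityLink(x -> x));    // true  (identity link)
    System.out.println(isIdentityLink(Math::log)); // false (log link)
  }
}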
public GLMOutput(GLM glm) {
  super(glm);
  _dinfo = glm._dinfo;
  if (!glm.hasWeightCol()) {
    _dinfo = (DataInfo) _dinfo.clone();
    _dinfo._adaptedFrame =
        new Frame(_dinfo._adaptedFrame.names().clone(), _dinfo._adaptedFrame.vecs().clone());
    _dinfo.dropWeights();
  }
  _scoringDinfo = _dinfo.scoringInfo();
  String[] cnames = glm._dinfo.coefNames();
  String[] names = _dinfo._adaptedFrame._names;
  String[][] domains = _dinfo._adaptedFrame.domains();
  int id = glm._generatedWeights == null ? -1 : ArrayUtils.find(names, glm._generatedWeights);
  if (id >= 0) {
    String[] ns = new String[names.length - 1];
    String[][] ds = new String[domains.length - 1][];
    System.arraycopy(names, 0, ns, 0, id);
    System.arraycopy(domains, 0, ds, 0, id);
    System.arraycopy(names, id + 1, ns, id, ns.length - id);
    System.arraycopy(domains, id + 1, ds, id, ds.length - id);
    names = ns;
    domains = ds;
  }
  _names = names;
  _domains = domains;
  _coefficient_names = Arrays.copyOf(cnames, cnames.length + 1);
  _coefficient_names[_coefficient_names.length - 1] = "Intercept";
  _binomial = glm._parms._family == Family.binomial;
  _nclasses = glm.nclasses();
  _multinomial = _nclasses > 2;
}
public double[][] getNormBetaMultinomial(int idx) {
  double[][] res = new double[nclasses()][];
  Submodel sm = _submodels[idx];
  int N = _dinfo.fullN() + 1;
  double[] beta = sm.beta;
  if (sm.idxs != null)
    beta = ArrayUtils.expandAndScatter(beta, nclasses() * (_dinfo.fullN() + 1), sm.idxs);
  for (int i = 0; i < res.length; ++i) res[i] = Arrays.copyOfRange(beta, i * N, (i + 1) * N);
  return res;
}
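// Illustrative sketch of the flat multinomial coefficient layout used above: nclasses blocks of
// (fullN() + 1) entries, the last entry of each block presumably being the intercept (the
// coefficient names elsewhere append "Intercept" last). A sparse submodel keeps only non-zero
// values plus their positions (sm.idxs) and is scattered back into the full-length vector before
// the per-class slices are taken. The helper below is a hypothetical stand-in for
// ArrayUtils.expandAndScatter, shown with plain arrays.
import java.util.Arrays;

class MultinomialBetaLayoutSketch {
  // Hypothetical stand-in: place vals[i] at position idxs[i] of a zero-filled full-length vector.
  static double[] expandAndScatter(double[] vals, int fullLen, int[] idxs) {
    double[] full = new double[fullLen];
    for (int i = 0; i < idxs.length; ++i) full[idxs[i]] = vals[i];
    return full;
  }

  public static void main(String[] args) {
    int nclasses = 3, N = 4; // 3 classes, 3 predictors + intercept per class
    double[] sparseBeta = {1.5, -2.0, 0.7};
    int[] idxs = {0, 5, 11}; // positions of the non-zero entries in the flat vector
    double[] flat = expandAndScatter(sparseBeta, nclasses * N, idxs);
    for (int c = 0; c < nclasses; ++c)
      System.out.println(
          "class " + c + ": " + Arrays.toString(Arrays.copyOfRange(flat, c * N, (c + 1) * N)));
  }
}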
public void setSubmodelIdx(int l) {
  _best_lambda_idx = l;
  if (_multinomial) {
    _global_beta_multinomial = getNormBetaMultinomial(l);
    for (int i = 0; i < _global_beta_multinomial.length; ++i)
      _global_beta_multinomial[i] = _dinfo.denormalizeBeta(_global_beta_multinomial[i]);
  } else {
    if (_global_beta == null) _global_beta = MemoryManager.malloc8d(_coefficient_names.length);
    else Arrays.fill(_global_beta, 0);
    _submodels[l].getBeta(_global_beta);
    _global_beta = _dinfo.denormalizeBeta(_global_beta);
  }
}
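// Sketch of the usual de-standardization algebra (an assumption about what denormalizeBeta does,
// not a copy of its implementation): if predictors were transformed as x' = (x - mean) * normMul,
// a model fit on x' with coefficients b' and intercept b0' is equivalent on the original scale to
// b[i] = b'[i] * normMul[i] and b0 = b0' - sum_i b[i] * mean[i].
class DenormalizeBetaSketch {
  static double[] denormalize(double[] betaStd, double[] mean, double[] normMul) {
    int p = mean.length; // betaStd holds p predictor coefficients followed by the intercept
    double[] beta = betaStd.clone();
    double intercept = betaStd[p];
    for (int i = 0; i < p; ++i) {
      beta[i] = betaStd[i] * normMul[i];
      intercept -= beta[i] * mean[i];
    }
    beta[p] = intercept;
    return beta;
  }

  public static void main(String[] args) {
    double[] betaStd = {0.5, -1.0, 2.0}; // two standardized coefficients + intercept
    double[] mean = {10.0, 3.0};
    double[] normMul = {0.25, 2.0}; // typically 1/sigma
    System.out.println(java.util.Arrays.toString(denormalize(betaStd, mean, normMul)));
    // -> [0.125, -2.0, 6.75] : same linear predictor on the original feature scale
  }
}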
protected void computeStatsFillModel(
    PCAModel pca, DataInfo dinfo, SingularValueDecomposition svd, Gram gram, long nobs) {
  // Save adapted frame info for scoring later
  pca._output._normSub = dinfo._normSub == null ? new double[dinfo._nums] : dinfo._normSub;
  if (dinfo._normMul == null) {
    pca._output._normMul = new double[dinfo._nums];
    Arrays.fill(pca._output._normMul, 1.0);
  } else pca._output._normMul = dinfo._normMul;
  pca._output._permutation = dinfo._permutation;
  pca._output._nnums = dinfo._nums;
  pca._output._ncats = dinfo._cats;
  pca._output._catOffsets = dinfo._catOffsets;

  double dfcorr = nobs / (nobs - 1.0);
  double[] sval = svd.getSingularValues();
  pca._output._std_deviation = new double[_parms._k]; // Only want first k standard deviations
  for (int i = 0; i < _parms._k; i++) {
    sval[i] = dfcorr * sval[i]; // Degrees of freedom = n-1, where n = nobs = # row observations processed
    pca._output._std_deviation[i] = Math.sqrt(sval[i]);
  }

  double[][] eigvec = svd.getV().getArray();
  pca._output._eigenvectors_raw = new double[eigvec.length][_parms._k]; // Only want first k eigenvectors
  for (int i = 0; i < eigvec.length; i++)
    System.arraycopy(eigvec[i], 0, pca._output._eigenvectors_raw[i], 0, _parms._k);
  pca._output._total_variance =
      dfcorr * gram.diagSum(); // Since gram = X'X/n, but variance requires n-1 in denominator
  buildTables(pca, dinfo.coefNames());
}
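// Illustrative sketch of the dfcorr correction above (not H2O code): eigenvalues of X'X/n on
// centered data are population variances along the principal directions; multiplying by
// n/(n-1) turns them into the usual sample variances before taking square roots for the
// reported standard deviations.
class DfCorrectionSketch {
  public static void main(String[] args) {
    long nobs = 100;
    double dfcorr = nobs / (nobs - 1.0); // 100/99
    double eigenvalueOfXtXOverN = 4.0;   // hypothetical eigenvalue of X'X/n
    double sampleVariance = dfcorr * eigenvalueOfXtXOverN;
    System.out.println("std deviation = " + Math.sqrt(sampleVariance)); // ~2.0101
  }
}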
public DataInfo validDinfo(Frame valid) {
  DataInfo res =
      new DataInfo(
          _adaptedFrame,
          null,
          1,
          _useAllFactorLevels,
          TransformType.NONE,
          TransformType.NONE,
          _skipMissing,
          _imputeMissing,
          false,
          _weights,
          _offset,
          _fold);
  res._adaptedFrame = new Frame(_adaptedFrame.names(), valid.vecs(_adaptedFrame.names()));
  res._valid = true;
  return res;
}
public GLMOutput(
    DataInfo dinfo,
    String[] column_names,
    String[][] domains,
    String[] coefficient_names,
    boolean binomial) {
  super(dinfo._weights, dinfo._offset, dinfo._fold);
  _dinfo = dinfo;
  _scoringDinfo = dinfo.scoringInfo();
  _names = column_names;
  _domains = domains;
  _coefficient_names = coefficient_names;
  _binomial = binomial;
  _nclasses = binomial ? 2 : 1;
  if (_binomial && domains[domains.length - 1] != null) {
    assert domains[domains.length - 1].length == 2
        : "Unexpected domains " + Arrays.toString(domains);
    binomialClassNames = domains[domains.length - 1];
  }
}
public DataInfo scoringInfo() {
  DataInfo res =
      new DataInfo(
          _adaptedFrame,
          null,
          1,
          _useAllFactorLevels,
          TransformType.NONE,
          TransformType.NONE,
          _skipMissing,
          _imputeMissing,
          !_skipMissing,
          _weights,
          _offset,
          _fold);
  res._adaptedFrame = null;
  res._weights = false;
  res._offset = false;
  res._fold = false;
  res._responses = 0;
  res._valid = true;
  res._interactions = _interactions;
  return res;
}
/**
 * Train a Deep Learning model, assumes that all members are populated. If checkpoint == null,
 * then start training a new model, otherwise continue from a checkpoint.
 */
public final void buildModel() {
  DeepLearningModel cp = null;
  if (_parms._checkpoint == null) {
    cp =
        new DeepLearningModel(
            dest(),
            _parms,
            new DeepLearningModel.DeepLearningModelOutput(DeepLearning.this),
            _train,
            _valid,
            nclasses());
    cp.model_info().initializeMembers();
  } else {
    final DeepLearningModel previous = DKV.getGet(_parms._checkpoint);
    if (previous == null) throw new IllegalArgumentException("Checkpoint not found.");
    Log.info("Resuming from checkpoint.");
    _job.update(0, "Resuming from checkpoint");

    if (isClassifier() != previous._output.isClassifier())
      throw new H2OIllegalArgumentException(
          "Response type must be the same as for the checkpointed model.");
    if (isSupervised() != previous._output.isSupervised())
      throw new H2OIllegalArgumentException(
          "Model type must be the same as for the checkpointed model.");

    // check the user-given arguments for consistency
    DeepLearningParameters oldP = previous._parms; // sanitized parameters for checkpointed model
    DeepLearningParameters newP = _parms; // user-given parameters for restart
    DeepLearningParameters oldP2 = (DeepLearningParameters) oldP.clone();
    DeepLearningParameters newP2 = (DeepLearningParameters) newP.clone();
    DeepLearningParameters.Sanity.modifyParms(oldP, oldP2, nclasses()); // sanitize the user-given parameters
    DeepLearningParameters.Sanity.modifyParms(newP, newP2, nclasses()); // sanitize the user-given parameters
    DeepLearningParameters.Sanity.checkpoint(oldP2, newP2);

    DataInfo dinfo;
    try {
      // PUBDEV-2513: Adapt _train and _valid (in-place) to match the frames that were used for
      // the previous model.
      // This can add or remove dummy columns (can happen if the dataset is sparse and datasets
      // have different non-const columns).
      for (String st : previous.adaptTestForTrain(_train, true, false)) Log.warn(st);
      for (String st : previous.adaptTestForTrain(_valid, true, false)) Log.warn(st);
      dinfo = makeDataInfo(_train, _valid, _parms, nclasses());
      DKV.put(dinfo);
      cp = new DeepLearningModel(dest(), _parms, previous, false, dinfo);
      cp.write_lock(_job);

      if (!Arrays.equals(cp._output._names, previous._output._names)) {
        throw new H2OIllegalArgumentException(
            "The columns of the training data must be the same as for the checkpointed model. Check ignored columns (or disable ignore_const_cols).");
      }
      if (!Arrays.deepEquals(cp._output._domains, previous._output._domains)) {
        throw new H2OIllegalArgumentException(
            "Categorical factor levels of the training data must be the same as for the checkpointed model.");
      }
      if (dinfo.fullN() != previous.model_info().data_info().fullN()) {
        throw new H2OIllegalArgumentException(
            "Total number of predictors is different than for the checkpointed model.");
      }
      if (_parms._epochs <= previous.epoch_counter) {
        throw new H2OIllegalArgumentException(
            "Total number of epochs must be larger than the number of epochs already trained for the checkpointed model ("
                + previous.epoch_counter
                + ").");
      }

      // these are the mutable parameters that are to be used by the model (stored in
      // model_info._parms)
      final DeepLearningParameters actualNewP =
          cp.model_info().get_params(); // actually used parameters for model building (defaults filled in, etc.)
      assert (actualNewP != previous.model_info().get_params());
      assert (actualNewP != newP);
      assert (actualNewP != oldP);
      DeepLearningParameters.Sanity.update(actualNewP, newP, nclasses());

      Log.info(
          "Continuing training after "
              + String.format("%.3f", previous.epoch_counter)
              + " epochs from the checkpointed model.");
      cp.update(_job);
    } catch (H2OIllegalArgumentException ex) {
      if (cp != null) {
        cp.unlock(_job);
        cp.delete();
        cp = null;
      }
      throw ex;
    } finally {
      if (cp != null) cp.unlock(_job);
    }
  }
  trainModel(cp);

  // clean up, but don't delete weights and biases if user asked for export
  List<Key> keep = new ArrayList<>();
  try {
    if (_parms._export_weights_and_biases
        && cp._output.weights != null
        && cp._output.biases != null) {
      for (Key k : Arrays.asList(cp._output.weights)) {
        keep.add(k);
        for (Vec vk : ((Frame) DKV.getGet(k)).vecs()) {
          keep.add(vk._key);
        }
      }
      for (Key k : Arrays.asList(cp._output.biases)) {
        keep.add(k);
        for (Vec vk : ((Frame) DKV.getGet(k)).vecs()) {
          keep.add(vk._key);
        }
      }
    }
  } finally {
    Scope.exit(keep.toArray(new Key[keep.size()]));
  }
}
/**
 * Main constructor
 *
 * @param params Model parameters
 * @param dinfo Data Info
 * @param nClasses number of classes (1 for regression, 0 for autoencoder)
 * @param train User-given training data frame, prepared by AdaptTestTrain
 * @param valid User-specified validation data frame, prepared by AdaptTestTrain
 */
public DeepLearningModelInfo(
    final DeepLearningParameters params,
    final DataInfo dinfo,
    int nClasses,
    Frame train,
    Frame valid) {
  _classification = nClasses > 1;
  _train = train;
  _valid = valid;
  data_info = dinfo;
  parameters = (DeepLearningParameters) params.clone(); // make a copy, don't change model's parameters
  DeepLearningParameters.Sanity.modifyParms(
      parameters, parameters, nClasses); // sanitize the model_info's parameters

  final int num_input = dinfo.fullN();
  final int num_output =
      get_params()._autoencoder
          ? num_input
          : (_classification ? train.lastVec().cardinality() : 1);
  if (!get_params()._autoencoder) assert (num_output == nClasses);

  _saw_missing_cats = dinfo._cats > 0 ? new boolean[data_info._cats] : null;
  assert (num_input > 0);
  assert (num_output > 0);
  if (has_momenta() && adaDelta())
    throw new IllegalArgumentException(
        "Cannot have non-zero momentum and adaptive rate at the same time.");
  final int layers = get_params()._hidden.length;

  // units (# neurons for each layer)
  units = new int[layers + 2];
  if (get_params()._max_categorical_features <= Integer.MAX_VALUE - dinfo._nums)
    units[0] = Math.min(dinfo._nums + get_params()._max_categorical_features, num_input);
  else units[0] = num_input;
  System.arraycopy(get_params()._hidden, 0, units, 1, layers);
  units[layers + 1] = num_output;

  boolean printLevels = units[0] > 1000L;
  boolean warn = units[0] > 100000L;
  if (printLevels) {
    final String[][] domains = dinfo._adaptedFrame.domains();
    int[] levels = new int[domains.length];
    for (int i = 0; i < levels.length; ++i) {
      levels[i] = domains[i] != null ? domains[i].length : 0;
    }
    Arrays.sort(levels);
    if (warn) {
      Log.warn(
          "===================================================================================================================================");
      Log.warn(
          num_input
              + " input features"
              + (dinfo._cats > 0 ? " (after categorical one-hot encoding)" : "")
              + ". Can be slow and require a lot of memory.");
    }
    if (levels[levels.length - 1] > 0) {
      int levelcutoff = levels[levels.length - 1 - Math.min(10, levels.length - 1)];
      int count = 0;
      for (int i = 0;
          i < dinfo._adaptedFrame.numCols() - (get_params()._autoencoder ? 0 : 1) && count < 10;
          ++i) {
        if (dinfo._adaptedFrame.domains()[i] != null
            && dinfo._adaptedFrame.domains()[i].length >= levelcutoff) {
          if (warn) {
            Log.warn(
                "Categorical feature '"
                    + dinfo._adaptedFrame._names[i]
                    + "' has cardinality "
                    + dinfo._adaptedFrame.domains()[i].length
                    + ".");
          } else {
            Log.info(
                "Categorical feature '"
                    + dinfo._adaptedFrame._names[i]
                    + "' has cardinality "
                    + dinfo._adaptedFrame.domains()[i].length
                    + ".");
          }
        }
        count++;
      }
    }
    if (warn) {
      Log.warn("Suggestions:");
      Log.warn(" *) Limit the size of the first hidden layer");
      if (dinfo._cats > 0) {
        Log.warn(
            " *) Limit the total number of one-hot encoded features with the parameter 'max_categorical_features'");
        Log.warn(
            " *) Run h2o.interaction(...,pairwise=F) on high-cardinality categorical columns to limit the factor count, see http://learn.h2o.ai");
      }
      Log.warn(
          "===================================================================================================================================");
    }
  }

  // weights (to connect layers)
  dense_row_weights = new Storage.DenseRowMatrix[layers + 1];
  dense_col_weights = new Storage.DenseColMatrix[layers + 1];

  // decide format of weight matrices row-major or col-major
  if (get_params()._col_major)
    dense_col_weights[0] = new Storage.DenseColMatrix(units[1], units[0]);
  else dense_row_weights[0] = new Storage.DenseRowMatrix(units[1], units[0]);
  for (int i = 1; i <= layers; ++i)
    dense_row_weights[i] = new Storage.DenseRowMatrix(units[i + 1] /*rows*/, units[i] /*cols*/);

  // biases (only for hidden layers and output layer)
  biases = new Storage.DenseVector[layers + 1];
  for (int i = 0; i <= layers; ++i) biases[i] = new Storage.DenseVector(units[i + 1]);
  // average activation (only for hidden layers)
  if (get_params()._autoencoder && get_params()._sparsity_beta > 0) {
    avg_activations = new Storage.DenseVector[layers];
    mean_a = new float[layers];
    for (int i = 0; i < layers; ++i) avg_activations[i] = new Storage.DenseVector(units[i + 1]);
  }
  allocateHelperArrays();
  // for diagnostics
  mean_rate = new float[units.length];
  rms_rate = new float[units.length];
  mean_bias = new float[units.length];
  rms_bias = new float[units.length];
  mean_weight = new float[units.length];
  rms_weight = new float[units.length];
}
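// Illustrative sketch (hypothetical helper, not H2O code): the units[] array built above holds
// the layer sizes of the network, index 0 being the (possibly capped) input layer, indices
// 1..layers the hidden layers, and the last index the output layer. The simplified version below
// ignores the integer-overflow guard in the original and just shows the layout.
import java.util.Arrays;

class UnitsLayoutSketch {
  static int[] buildUnits(int numInput, int numNums, int maxCategoricalFeatures,
                          int[] hidden, int numOutput) {
    int[] units = new int[hidden.length + 2];
    // cap the input layer if one-hot encoding blew up the feature count
    units[0] = Math.min(numNums + maxCategoricalFeatures, numInput);
    System.arraycopy(hidden, 0, units, 1, hidden.length);
    units[units.length - 1] = numOutput;
    return units;
  }

  public static void main(String[] args) {
    // e.g. 5000 one-hot input features, 10 numeric columns, cap of 1000 categorical features,
    // two hidden layers of 200/100 neurons, 3-class output
    System.out.println(Arrays.toString(buildUnits(5000, 10, 1000, new int[] {200, 100}, 3)));
    // -> [1010, 200, 100, 3]
  }
}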
public DataInfo filterExpandedColumns(int[] cols) {
  assert _predictor_transform != null;
  assert _response_transform != null;
  if (cols == null) return deep_clone();
  int hasIcpt = (cols.length > 0 && cols[cols.length - 1] == fullN()) ? 1 : 0;
  int i = 0, j = 0, ignoredCnt = 0;
  // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub,
  //                 double [] normMul, double [] normRespSub, double [] normRespMul){
  int[][] catLvls = new int[_cats][];
  int[] ignoredCols = MemoryManager.malloc4(_nums + _cats);
  // first do categoricals...
  if (_catOffsets != null) {
    int coff = _useAllFactorLevels ? 0 : 1;
    while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) {
      int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]);
      int k = 0;
      while (i < cols.length && cols[i] < _catOffsets[j + 1])
        levels[k++] = (cols[i++] - _catOffsets[j]) + coff;
      if (k > 0) catLvls[j] = Arrays.copyOf(levels, k);
      ++j;
    }
  }
  int[] catModes = _catModes;
  for (int k = 0; k < catLvls.length; ++k) if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
  if (ignoredCnt > 0) {
    int[][] cs = new int[_cats - ignoredCnt][];
    catModes = new int[_cats - ignoredCnt];
    int y = 0;
    for (int c = 0; c < catLvls.length; ++c)
      if (catLvls[c] != null) {
        catModes[y] = _catModes[c];
        cs[y++] = catLvls[c];
      }
    assert y == cs.length;
    catLvls = cs;
  }
  // now numerics
  int prev = j = 0;
  for (; i < cols.length; ++i) {
    for (int k = prev; k < (cols[i] - numStart()); ++k) {
      ignoredCols[ignoredCnt++] = k + _cats;
      ++j;
    }
    prev = ++j;
  }
  for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats;
  Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone());
  if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt));
  assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();
  double[] normSub = null;
  double[] normMul = null;
  int id = Arrays.binarySearch(cols, numStart());
  if (id < 0) id = -id - 1;
  int nnums = cols.length - id - hasIcpt;
  int off = numStart();
  if (_normSub != null) {
    normSub = new double[nnums];
    for (int k = id; k < (id + nnums); ++k) normSub[k - id] = _normSub[cols[k] - off];
  }
  if (_normMul != null) {
    normMul = new double[nnums];
    for (int k = id; k < (id + nnums); ++k) normMul[k - id] = _normMul[cols[k] - off];
  }
  // public DataInfo(Frame train, Frame valid, int nResponses, boolean useAllFactorLevels,
  //                 TransformType predictor_transform, TransformType response_transform,
  //                 boolean skipMissing, boolean imputeMissing, boolean missingBucket,
  //                 boolean weight, boolean offset, boolean fold) {
  DataInfo dinfo = new DataInfo(this, f, normMul, normSub, catLvls, catModes);
  dinfo._activeCols = cols;
  return dinfo;
}
@Override
protected void setupLocal() {
  DataInfo dinfo = DKV.get(_dinfoKey).get();
  _dinfo = _activeCols == null ? dinfo : dinfo.filterExpandedColumns(_activeCols);
}
/**
 * Extracts the values, applies regularization to numerics, adds appropriate offsets to
 * categoricals, and adapts response according to the CaseMode/CaseValue if set.
 */
@Override
public final void map(Chunk[] chunks, NewChunk[] outputs) {
  if (_jobKey != null && !Job.isRunning(_jobKey)) throw new JobCancelledException();
  final int nrows = chunks[0]._len;
  final long offset = chunks[0].start();
  boolean doWork = chunkInit();
  if (!doWork) return;
  final boolean obs_weights = _dinfo._weights && !_fr.vecs()[_dinfo.weightChunkId()].isConst();
  final double global_weight_sum =
      obs_weights ? _fr.vecs()[_dinfo.weightChunkId()].mean() * _fr.numRows() : 0;

  DataInfo.Row row = _dinfo.newDenseRow();
  double[] weight_map = null;
  double relative_chunk_weight = 1;
  // TODO: store node-local helper arrays in _dinfo -> avoid re-allocation and construction
  if (obs_weights) {
    weight_map = new double[nrows];
    double weight_sum = 0;
    for (int i = 0; i < nrows; ++i) {
      row = _dinfo.extractDenseRow(chunks, i, row);
      weight_sum += row.weight;
      weight_map[i] = weight_sum;
      assert (i == 0 || row.weight == 0 || weight_map[i] > weight_map[i - 1]);
    }
    if (weight_sum > 0) {
      ArrayUtils.div(weight_map, weight_sum); // normalize to 0...1
      relative_chunk_weight = global_weight_sum * nrows / _fr.numRows() / weight_sum;
    } else return; // nothing to do here - all rows have 0 weight
  }

  // Example:
  // _useFraction = 0.8 -> 1 repeat with fraction = 0.8
  // _useFraction = 1.0 -> 1 repeat with fraction = 1.0
  // _useFraction = 1.1 -> 2 repeats with fraction = 0.55
  // _useFraction = 2.1 -> 3 repeats with fraction = 0.7
  // _useFraction = 3.0 -> 3 repeats with fraction = 1.0
  final int repeats = (int) Math.ceil(_useFraction * relative_chunk_weight);
  final float fraction = (float) (_useFraction * relative_chunk_weight) / repeats;
  assert (fraction <= 1.0);

  final boolean sample = (fraction < 0.999 || obs_weights || _shuffle);
  final Random skip_rng =
      sample
          ? RandomUtils.getRNG(
              (0x8734093502429734L + _seed + offset) * (_iteration + 0x9823423497823423L))
          : null;

  long num_processed_rows = 0;
  for (int rep = 0; rep < repeats; ++rep) {
    for (int row_idx = 0; row_idx < nrows; ++row_idx) {
      int r = sample ? -1 : 0;
      // only train with a given number of training samples (fraction*nrows)
      if (sample && !obs_weights && skip_rng.nextDouble() > fraction) continue;
      if (obs_weights
          && num_processed_rows % 2 == 0) { // every second row is randomly sampled -> that way we won't "forget" rare rows
        // importance sampling based on inverse of cumulative distribution
        double key = skip_rng.nextDouble();
        r = Arrays.binarySearch(weight_map, 0, nrows, key);
        // Log.info(Arrays.toString(weight_map));
        // Log.info("key: " + key + " idx: " + (r >= 0 ? r : (-r-1)));
        if (r < 0) r = -r - 1;
        assert (r == 0 || weight_map[r] > weight_map[r - 1]);
      } else if (r == -1) {
        do {
          r = skip_rng.nextInt(nrows); // random sampling (with replacement)
        }
        // if we have weights, and we did the %2 skipping above, then we need to find an alternate
        // row with non-zero weight
        while (obs_weights
            && ((r == 0 && weight_map[0] == 0) || (r > 0 && weight_map[r] == weight_map[r - 1])));
      } else {
        assert (!obs_weights);
        r = row_idx; // linear scan - slightly faster
      }
      assert (r >= 0 && r <= nrows);

      row = _dinfo.extractDenseRow(chunks, r, row);
      if (!row.bad) {
        assert (row.weight > 0); // check that we never process a row that was held out via row.weight = 0
        long seed = offset + rep * nrows + r;
        if (outputs != null && outputs.length > 0) processRow(seed++, row, outputs);
        else processRow(seed++, row);
      }
      num_processed_rows++;
    }
  }
  assert (fraction != 1 || num_processed_rows == repeats * nrows);
  chunkDone(num_processed_rows);
}
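// Illustrative sketch (not part of the H2O task): the repeats/fraction split above turns a
// possibly-greater-than-one use fraction into an integer number of passes over the chunk with a
// per-pass sampling fraction <= 1, so that repeats * fraction equals the requested fraction on
// average. This reproduces the example table from the comment above.
class RepeatsFractionSketch {
  public static void main(String[] args) {
    for (double useFraction : new double[] {0.8, 1.0, 1.1, 2.1, 3.0}) {
      int repeats = (int) Math.ceil(useFraction);
      double fraction = useFraction / repeats;
      System.out.printf("useFraction=%.1f -> %d repeat(s) with fraction=%.2f%n",
          useFraction, repeats, fraction);
    }
  }
}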
// Main worker thread
@Override
protected void compute2() {
  PCAModel model = null;
  DataInfo dinfo = null;
  DataInfo xinfo = null;
  Frame x = null;
  try {
    init(true); // Initialize parameters
    _parms.read_lock_frames(PCA.this); // Fetch & read-lock input frames
    if (error_count() > 0)
      throw new IllegalArgumentException("Found validation errors: " + validationErrors());

    // The model to be built
    model = new PCAModel(dest(), _parms, new PCAModel.PCAOutput(PCA.this));
    model.delete_and_lock(_key);

    if (_parms._pca_method == PCAParameters.Method.GramSVD) {
      dinfo =
          new DataInfo(
              Key.make(),
              _train,
              null,
              0,
              _parms._use_all_factor_levels,
              _parms._transform,
              DataInfo.TransformType.NONE,
              /* skipMissing */ true,
              /* missingBucket */ false,
              /* weights */ false,
              /* offset */ false,
              /* intercept */ false);
      DKV.put(dinfo._key, dinfo);

      // Calculate and save Gram matrix of training data
      // NOTE: Gram computes A'A/n where n = nrow(A) = number of rows in training set (excluding
      // rows with NAs)
      GramTask gtsk = new Gram.GramTask(self(), dinfo).doAll(dinfo._adaptedFrame);
      Gram gram = gtsk._gram; // TODO: This ends up with all NaNs if training data has too many missing values
      assert gram.fullN() == _ncolExp;

      // Compute SVD of Gram A'A/n using JAMA library
      // Note: Singular values ordered in weakly descending order by algorithm
      Matrix gramJ = new Matrix(gtsk._gram.getXX());
      SingularValueDecomposition svdJ = gramJ.svd();
      computeStatsFillModel(model, dinfo, svdJ, gram, gtsk._nobs);

    } else if (_parms._pca_method == PCAParameters.Method.Power) {
      SVDModel.SVDParameters parms = new SVDModel.SVDParameters();
      parms._train = _parms._train;
      parms._ignored_columns = _parms._ignored_columns;
      parms._ignore_const_cols = _parms._ignore_const_cols;
      parms._score_each_iteration = _parms._score_each_iteration;
      parms._use_all_factor_levels = _parms._use_all_factor_levels;
      parms._transform = _parms._transform;
      parms._nv = _parms._k;
      parms._max_iterations = _parms._max_iterations;
      parms._seed = _parms._seed;

      // Calculate standard deviation and projection as well
      parms._only_v = false;
      parms._u_name = _parms._loading_name;
      parms._keep_u = _parms._keep_loading;

      SVDModel svd = null;
      SVD job = null;
      try {
        job = new EmbeddedSVD(_key, _progressKey, parms);
        svd = job.trainModel().get();
        if (job.isCancelledOrCrashed()) PCA.this.cancel();
      } finally {
        if (job != null) job.remove();
        if (svd != null) svd.remove();
      }
      // Recover PCA results from SVD model
      computeStatsFillModel(model, svd);

    } else if (_parms._pca_method == PCAParameters.Method.GLRM) {
      GLRMModel.GLRMParameters parms = new GLRMModel.GLRMParameters();
      parms._train = _parms._train;
      parms._ignored_columns = _parms._ignored_columns;
      parms._ignore_const_cols = _parms._ignore_const_cols;
      parms._score_each_iteration = _parms._score_each_iteration;
      parms._transform = _parms._transform;
      parms._k = _parms._k;
      parms._max_iterations = _parms._max_iterations;
      parms._seed = _parms._seed;
      parms._recover_svd = true;

      parms._loss = GLRMModel.GLRMParameters.Loss.L2;
      parms._gamma_x = 0;
      parms._gamma_y = 0;

      GLRMModel glrm = null;
      GLRM job = null;
      try {
        job = new EmbeddedGLRM(_key, _progressKey, parms);
        glrm = job.trainModel().get();
        if (job.isCancelledOrCrashed()) PCA.this.cancel();
      } finally {
        if (job != null) job.remove();
        if (glrm != null) {
          glrm._parms._loading_key.get().delete();
          glrm.remove();
        }
      }
      // Recover PCA results from GLRM model
      computeStatsFillModel(model, glrm);
    }
    model.update(self());
    update(1);
    done();
  } catch (Throwable t) {
    Job thisJob = DKV.getGet(_key);
    if (thisJob._state == JobState.CANCELLED) {
      Log.info("Job cancelled by user.");
    } else {
      t.printStackTrace();
      failed(t);
      throw t;
    }
  } finally {
    _parms.read_unlock_frames(PCA.this);
    if (model != null) model.unlock(_key);
    if (dinfo != null) dinfo.remove();
    if (xinfo != null) xinfo.remove();
    if (x != null && !_parms._keep_loading) x.delete();
  }
  tryComplete();
}
public double[] getNormBeta() {
  return _submodels[_best_lambda_idx].getBeta(MemoryManager.malloc8d(_dinfo.fullN() + 1));
}
public DataInfo filterExpandedColumns(int[] cols) {
  assert _predictor_transform != null;
  assert _response_transform != null;
  if (cols == null) return this;
  int i = 0, j = 0, ignoredCnt = 0;
  // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub,
  //                 double [] normMul, double [] normRespSub, double [] normRespMul){
  int[][] catLvls = new int[_cats][];
  int[] ignoredCols = MemoryManager.malloc4(_nums + _cats);
  // first do categoricals...
  if (_catOffsets != null) {
    int coff = _useAllFactorLevels ? 0 : 1;
    while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) {
      int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]);
      int k = 0;
      while (i < cols.length && cols[i] < _catOffsets[j + 1])
        levels[k++] = cols[i++] - _catOffsets[j] + coff;
      if (k > 0) catLvls[j] = Arrays.copyOf(levels, k);
      ++j;
    }
  }
  for (int k = 0; k < catLvls.length; ++k) if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
  if (ignoredCnt > 0) {
    int[][] c = new int[_cats - ignoredCnt][];
    int y = 0;
    for (int[] catLvl : catLvls) if (catLvl != null) c[y++] = catLvl;
    assert y == c.length;
    catLvls = c;
  }
  // now numerics
  int prev = j = 0;
  for (; i < cols.length; ++i) {
    for (int k = prev; k < (cols[i] - numStart()); ++k) {
      ignoredCols[ignoredCnt++] = k + _cats;
      ++j;
    }
    prev = ++j;
  }
  for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats;
  Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone());
  if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt));
  assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();
  double[] normSub = null;
  double[] normMul = null;
  int id = Arrays.binarySearch(cols, numStart());
  if (id < 0) id = -id - 1;
  int nnums = cols.length - id;
  int off = numStart();
  if (_normSub != null) {
    normSub = new double[nnums];
    for (int k = id; k < cols.length; ++k) normSub[k - id] = _normSub[cols[k] - off];
  }
  if (_normMul != null) {
    normMul = new double[nnums];
    for (int k = id; k < cols.length; ++k) normMul[k - id] = _normMul[cols[k] - off];
  }
  DataInfo dinfo =
      new DataInfo(
          _key,
          f,
          normMul,
          normSub,
          catLvls,
          _responses,
          _predictor_transform,
          _response_transform,
          _skipMissing,
          _imputeMissing,
          _weights,
          _offset,
          _fold);
  // do not put activeData into K/V - active data is recreated on each node based on active
  // columns
  dinfo._activeCols = cols;
  return dinfo;
}
// Main worker thread
@Override
protected void compute2() {
  KMeansModel model = null;
  try {
    init(true);
    // Do lock even before checking the errors, since this block is finalized by unlock
    // (not the best solution, but the code is more readable)
    _parms.read_lock_frames(KMeans.this); // Fetch & read-lock input frames
    // Something goes wrong
    if (error_count() > 0)
      throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(KMeans.this);
    // The model to be built
    model = new KMeansModel(dest(), _parms, new KMeansModel.KMeansOutput(KMeans.this));
    model.delete_and_lock(_key);

    final Vec vecs[] = _train.vecs();
    // mults & means for standardization
    final double[] means = _train.means(); // means are used to impute NAs
    final double[] mults = _parms._standardize ? _train.mults() : null;
    final int[] impute_cat = new int[vecs.length];
    for (int i = 0; i < vecs.length; i++)
      impute_cat[i] = vecs[i].isNumeric() ? -1 : DataInfo.imputeCat(vecs[i]);
    model._output._normSub = means;
    model._output._normMul = mults;

    // Initialize cluster centers and standardize if requested
    double[][] centers = initial_centers(model, vecs, means, mults, impute_cat);
    if (centers == null) return; // Stopped/cancelled during center-finding
    double[][] oldCenters = null;

    // ---
    // Run the main KMeans Clustering loop
    // Stop after enough iterations or average_change < TOLERANCE
    model._output._iterations = 0; // Loop ends only when iterations > max_iterations with strict inequality
    while (!isDone(model, centers, oldCenters)) {
      Lloyds task =
          new Lloyds(centers, means, mults, impute_cat, _isCats, _parms._k, hasWeightCol())
              .doAll(vecs);
      // Pick the max categorical level for cluster center
      max_cats(task._cMeans, task._cats, _isCats);

      // Handle the case where some centers go dry. Rescue only 1 cluster
      // per iteration ('cause we only tracked the 1 worst row)
      if (cleanupBadClusters(task, vecs, centers, means, mults, impute_cat)) continue;

      // Compute model stats; update standardized cluster centers
      oldCenters = centers;
      centers = computeStatsFillModel(task, model, vecs, means, mults, impute_cat);

      model.update(_key); // Update model in K/V store
      update(1); // One unit of work
      if (model._parms._score_each_iteration) Log.info(model._output._model_summary);
    }

    Log.info(model._output._model_summary);
    // Log.info(model._output._scoring_history);
    // Log.info(((ModelMetricsClustering)model._output._training_metrics).createCentroidStatsTable().toString());

    // At the end: validation scoring (no need to gather scoring history)
    if (_valid != null) {
      model.score(_parms.valid()).delete(); // this appends a ModelMetrics on the validation set
      model._output._validation_metrics = ModelMetrics.getFromDKV(model, _parms.valid());
      model.update(_key); // Update model in K/V store
    }
    done(); // Job done!

  } catch (Throwable t) {
    Job thisJob = DKV.getGet(_key);
    if (thisJob._state == JobState.CANCELLED) {
      Log.info("Job cancelled by user.");
    } else {
      t.printStackTrace();
      failed(t);
      throw t;
    }
  } finally {
    updateModelOutput();
    if (model != null) model.unlock(_key);
    _parms.read_unlock_frames(KMeans.this);
  }
  tryComplete();
}
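// Minimal single-node sketch of the Lloyd's iteration that the distributed Lloyds task above
// performs (an illustration only, not the H2O implementation): assign each row to its nearest
// center, move every center to the mean of its assigned rows, and repeat until the assignment
// stops changing. Standardization, NA imputation, and empty-cluster rescue are omitted.
import java.util.Arrays;

class LloydsSketch {
  static double dist2(double[] a, double[] b) {
    double d = 0;
    for (int i = 0; i < a.length; ++i) d += (a[i] - b[i]) * (a[i] - b[i]);
    return d;
  }

  public static void main(String[] args) {
    double[][] rows = {{0, 0}, {0, 1}, {10, 10}, {10, 11}};
    double[][] centers = {{0, 0}, {10, 10}}; // k = 2, hypothetical initial centers
    int[] assign = new int[rows.length];
    for (boolean changed = true; changed; ) {
      changed = false;
      // assignment step: nearest center for each row
      for (int r = 0; r < rows.length; ++r) {
        int best = 0;
        for (int c = 1; c < centers.length; ++c)
          if (dist2(rows[r], centers[c]) < dist2(rows[r], centers[best])) best = c;
        if (assign[r] != best) { assign[r] = best; changed = true; }
      }
      // update step: move each center to the mean of its cluster
      for (int c = 0; c < centers.length; ++c) {
        double[] sum = new double[rows[0].length];
        int n = 0;
        for (int r = 0; r < rows.length; ++r)
          if (assign[r] == c) { n++; for (int d = 0; d < sum.length; ++d) sum[d] += rows[r][d]; }
        if (n > 0) for (int d = 0; d < sum.length; ++d) centers[c][d] = sum[d] / n;
      }
    }
    System.out.println(Arrays.deepToString(centers)); // -> [[0.0, 0.5], [10.0, 10.5]]
  }
}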
@Override
protected void compute2() {
  CoxPHModel model = null;
  try {
    Scope.enter();
    _parms.read_lock_frames(CoxPH.this);
    init(true);
    applyScoringFrameSideEffects();

    // The model to be built
    model = new CoxPHModel(dest(), _parms, new CoxPHModel.CoxPHOutput(CoxPH.this));
    model.delete_and_lock(_key);

    applyTrainingFrameSideEffects();

    int nResponses = 1;
    boolean useAllFactorLevels = false;
    final DataInfo dinfo =
        new DataInfo(
            Key.make(),
            _modelBuilderTrain,
            null,
            nResponses,
            useAllFactorLevels,
            DataInfo.TransformType.DEMEAN,
            TransformType.NONE,
            true,
            false,
            false,
            false,
            false,
            false);
    initStats(model, dinfo);

    final int n_offsets =
        (model._parms.offset_columns == null) ? 0 : model._parms.offset_columns.length;
    final int n_coef = dinfo.fullN() - n_offsets;
    final double[] step = MemoryManager.malloc8d(n_coef);
    final double[] oldCoef = MemoryManager.malloc8d(n_coef);
    final double[] newCoef = MemoryManager.malloc8d(n_coef);
    Arrays.fill(step, Double.NaN);
    Arrays.fill(oldCoef, Double.NaN);
    for (int j = 0; j < n_coef; ++j) newCoef[j] = model._parms.init;
    double oldLoglik = -Double.MAX_VALUE;
    final int n_time = (int) (model._output.max_time - model._output.min_time + 1);
    final boolean has_start_column = (model._parms.start_column != null);
    final boolean has_weights_column = (model._parms.weights_column != null);

    for (int i = 0; i <= model._parms.iter_max; ++i) {
      model._output.iter = i;

      final CoxPHTask coxMR =
          new CoxPHTask(
                  self(),
                  dinfo,
                  newCoef,
                  model._output.min_time,
                  n_time,
                  n_offsets,
                  has_start_column,
                  has_weights_column)
              .doAll(dinfo._adaptedFrame);

      final double newLoglik = calcLoglik(model, coxMR);
      if (newLoglik > oldLoglik) {
        if (i == 0) calcCounts(model, coxMR);

        calcModelStats(model, newCoef, newLoglik);
        calcCumhaz_0(model, coxMR);

        if (newLoglik == 0) model._output.lre = -Math.log10(Math.abs(oldLoglik - newLoglik));
        else model._output.lre = -Math.log10(Math.abs((oldLoglik - newLoglik) / newLoglik));
        if (model._output.lre >= model._parms.lre_min) break;

        Arrays.fill(step, 0);
        for (int j = 0; j < n_coef; ++j)
          for (int k = 0; k < n_coef; ++k)
            step[j] -= model._output.var_coef[j][k] * model._output.gradient[k];
        for (int j = 0; j < n_coef; ++j)
          if (Double.isNaN(step[j]) || Double.isInfinite(step[j])) break;

        oldLoglik = newLoglik;
        System.arraycopy(newCoef, 0, oldCoef, 0, oldCoef.length);
      } else {
        for (int j = 0; j < n_coef; ++j) step[j] /= 2;
      }

      for (int j = 0; j < n_coef; ++j) newCoef[j] = oldCoef[j] - step[j];
    }

    model.update(_key);
  } catch (Throwable t) {
    Job thisJob = DKV.getGet(_key);
    if (thisJob._state == JobState.CANCELLED) {
      Log.info("Job cancelled by user.");
    } else {
      t.printStackTrace();
      failed(t);
      throw t;
    }
  } finally {
    updateModelOutput();
    _parms.read_unlock_frames(CoxPH.this);
    Scope.exit();
    done(); // Job done!
  }
  tryComplete();
}
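// Illustrative sketch (not the CoxPH implementation): the loop above is a Newton-Raphson
// iteration with step halving. A full Newton step is formed from the gradient and the inverse
// information matrix (var_coef here); whenever the log-likelihood does not improve, the previous
// step is halved and re-tried from the last accepted coefficients. The 1-D example below
// maximizes f(x) = -(x - 3)^2 with the same accept/halve control flow.
class StepHalvingNewtonSketch {
  public static void main(String[] args) {
    double x = 0, oldX = Double.NaN, step = Double.NaN;
    double oldF = -Double.MAX_VALUE;
    for (int i = 0; i <= 20; ++i) {
      double f = -(x - 3) * (x - 3);  // objective (log-likelihood analogue)
      if (f > oldF) {                 // improved: accept and take a fresh Newton step
        double grad = -2 * (x - 3);   // f'(x)
        double hessInv = -0.5;        // 1 / f''(x), with f''(x) = -2
        step = hessInv * grad;        // Newton step: x_new = x - H^{-1} g
        oldF = f;
        oldX = x;
      } else {                        // not improved: halve the last step and retry
        step /= 2;
      }
      x = oldX - step;
      if (Math.abs(step) < 1e-12) break; // converged
    }
    System.out.println("x* ~= " + x); // -> 3.0
  }
}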