Пример #1
0
 public GLMModelV3 make_model(int version, MakeGLMModelV3 args) {
   GLMModel model = DKV.getGet(args.model.key());
   if (model == null) throw new IllegalArgumentException("missing source model " + args.model);
   String[] names = model._output.coefficientNames();
   Map<String, Double> coefs = model.coefficients();
   for (int i = 0; i < args.names.length; ++i) coefs.put(args.names[i], args.beta[i]);
   double[] beta = model.beta().clone();
   for (int i = 0; i < beta.length; ++i) beta[i] = coefs.get(names[i]);
   GLMModel m =
       new GLMModel(
           args.dest != null ? args.dest.key() : Key.make(),
           model._parms,
           null,
           new double[] {.5},
           Double.NaN,
           Double.NaN,
           -1);
   DataInfo dinfo = model.dinfo();
   dinfo.setPredictorTransform(TransformType.NONE);
   // GLMOutput(DataInfo dinfo, String[] column_names, String[][] domains, String[]
   // coefficient_names, boolean binomial) {
   m._output =
       new GLMOutput(
           model.dinfo(),
           model._output._names,
           model._output._domains,
           model._output.coefficientNames(),
           model._output._binomial,
           beta);
   DKV.put(m._key, m);
   GLMModelV3 res = new GLMModelV3();
   res.fillFromImpl(m);
   return res;
 }
Пример #2
0
 /**
  * Helper to create the DataInfo object from training/validation frames and the DL parameters
  *
  * @param train Training frame
  * @param valid Validation frame
  * @param parms Model parameters
  * @param nClasses Number of response levels (1: regression, >=2: classification)
  * @return DataInfo
  */
 static DataInfo makeDataInfo(
     Frame train, Frame valid, DeepLearningParameters parms, int nClasses) {
   double x = 0.782347234;
   boolean identityLink = new Distribution(parms._distribution, parms._tweedie_power).link(x) == x;
   DataInfo dinfo =
       new DataInfo(
           train,
           valid,
           parms._autoencoder ? 0 : 1, // nResponses
           parms._autoencoder
               || parms._use_all_factor_levels, // use all FactorLevels for auto-encoder
           parms._standardize
               ? (parms._autoencoder
                   ? DataInfo.TransformType.NORMALIZE
                   : parms._sparse
                       ? DataInfo.TransformType.DESCALE
                       : DataInfo.TransformType.STANDARDIZE)
               : DataInfo.TransformType.NONE, // transform predictors
           !parms._standardize || train.lastVec().isCategorical()
               ? DataInfo.TransformType.NONE
               : identityLink
                   ? DataInfo.TransformType.STANDARDIZE
                   : DataInfo.TransformType
                       .NONE, // transform response for regression with identity link
           parms._missing_values_handling
               == DeepLearningParameters.MissingValuesHandling.Skip, // whether to skip missing
           false, // do not replace NAs in numeric cols with mean
           true, // always add a bucket for missing values
           parms._weights_column != null, // observation weights
           parms._offset_column != null,
           parms._fold_column != null);
   // Checks and adjustments:
   // 1) observation weights (adjust mean/sigmas for predictors and response)
   // 2) NAs (check that there's enough rows left)
   GLMTask.YMUTask ymt =
       new GLMTask.YMUTask(
               dinfo,
               nClasses,
               true,
               !parms._autoencoder && nClasses == 1,
               false,
               !parms._autoencoder)
           .doAll(dinfo._adaptedFrame);
   if (ymt._wsum == 0
       && parms._missing_values_handling == DeepLearningParameters.MissingValuesHandling.Skip)
     throw new H2OIllegalArgumentException(
         "No rows left in the dataset after filtering out rows with missing values. Ignore columns with many NAs or set missing_values_handling to 'MeanImputation'.");
   if (parms._weights_column != null && parms._offset_column != null) {
     Log.warn(
         "Combination of offset and weights can lead to slight differences because Rollupstats aren't weighted - need to re-calculate weighted mean/sigma of the response including offset terms.");
   }
   if (parms._weights_column != null
       && parms._offset_column == null /*FIXME: offset not yet implemented*/) {
     dinfo.updateWeightedSigmaAndMean(ymt._basicStats.sigma(), ymt._basicStats.mean());
     if (nClasses == 1)
       dinfo.updateWeightedSigmaAndMeanForResponse(
           ymt._basicStatsResponse.sigma(), ymt._basicStatsResponse.mean());
   }
   return dinfo;
 }
Пример #3
0
 public GLMOutput(GLM glm) {
   super(glm);
   _dinfo = glm._dinfo;
   if (!glm.hasWeightCol()) {
     _dinfo = (DataInfo) _dinfo.clone();
     _dinfo._adaptedFrame =
         new Frame(_dinfo._adaptedFrame.names().clone(), _dinfo._adaptedFrame.vecs().clone());
     _dinfo.dropWeights();
   }
   _scoringDinfo = _dinfo.scoringInfo();
   String[] cnames = glm._dinfo.coefNames();
   String[] names = _dinfo._adaptedFrame._names;
   String[][] domains = _dinfo._adaptedFrame.domains();
   int id = glm._generatedWeights == null ? -1 : ArrayUtils.find(names, glm._generatedWeights);
   if (id >= 0) {
     String[] ns = new String[names.length - 1];
     String[][] ds = new String[domains.length - 1][];
     System.arraycopy(names, 0, ns, 0, id);
     System.arraycopy(domains, 0, ds, 0, id);
     System.arraycopy(names, id + 1, ns, id, ns.length - id);
     System.arraycopy(domains, id + 1, ds, id, ds.length - id);
     names = ns;
     domains = ds;
   }
   _names = names;
   _domains = domains;
   _coefficient_names = Arrays.copyOf(cnames, cnames.length + 1);
   _coefficient_names[_coefficient_names.length - 1] = "Intercept";
   _binomial = glm._parms._family == Family.binomial;
   _nclasses = glm.nclasses();
   _multinomial = _nclasses > 2;
 }
Пример #4
0
 public double[][] getNormBetaMultinomial(int idx) {
   double[][] res = new double[nclasses()][];
   Submodel sm = _submodels[idx];
   int N = _dinfo.fullN() + 1;
   double[] beta = sm.beta;
   if (sm.idxs != null)
     beta = ArrayUtils.expandAndScatter(beta, nclasses() * (_dinfo.fullN() + 1), sm.idxs);
   for (int i = 0; i < res.length; ++i) res[i] = Arrays.copyOfRange(beta, i * N, (i + 1) * N);
   return res;
 }
Пример #5
0
 public void setSubmodelIdx(int l) {
   _best_lambda_idx = l;
   if (_multinomial) {
     _global_beta_multinomial = getNormBetaMultinomial(l);
     for (int i = 0; i < _global_beta_multinomial.length; ++i)
       _global_beta_multinomial[i] = _dinfo.denormalizeBeta(_global_beta_multinomial[i]);
   } else {
     if (_global_beta == null) _global_beta = MemoryManager.malloc8d(_coefficient_names.length);
     else Arrays.fill(_global_beta, 0);
     _submodels[l].getBeta(_global_beta);
     _global_beta = _dinfo.denormalizeBeta(_global_beta);
   }
 }
Пример #6
0
    protected void computeStatsFillModel(
        PCAModel pca, DataInfo dinfo, SingularValueDecomposition svd, Gram gram, long nobs) {
      // Save adapted frame info for scoring later
      pca._output._normSub = dinfo._normSub == null ? new double[dinfo._nums] : dinfo._normSub;
      if (dinfo._normMul == null) {
        pca._output._normMul = new double[dinfo._nums];
        Arrays.fill(pca._output._normMul, 1.0);
      } else pca._output._normMul = dinfo._normMul;
      pca._output._permutation = dinfo._permutation;
      pca._output._nnums = dinfo._nums;
      pca._output._ncats = dinfo._cats;
      pca._output._catOffsets = dinfo._catOffsets;

      double dfcorr = nobs / (nobs - 1.0);
      double[] sval = svd.getSingularValues();
      pca._output._std_deviation = new double[_parms._k]; // Only want first k standard deviations
      for (int i = 0; i < _parms._k; i++) {
        sval[i] =
            dfcorr
                * sval[
                    i]; // Degrees of freedom = n-1, where n = nobs = # row observations processed
        pca._output._std_deviation[i] = Math.sqrt(sval[i]);
      }

      double[][] eigvec = svd.getV().getArray();
      pca._output._eigenvectors_raw =
          new double[eigvec.length][_parms._k]; // Only want first k eigenvectors
      for (int i = 0; i < eigvec.length; i++)
        System.arraycopy(eigvec[i], 0, pca._output._eigenvectors_raw[i], 0, _parms._k);
      pca._output._total_variance =
          dfcorr * gram.diagSum(); // Since gram = X'X/n, but variance requires n-1 in denominator
      buildTables(pca, dinfo.coefNames());
    }
Пример #7
0
 public DataInfo validDinfo(Frame valid) {
   DataInfo res =
       new DataInfo(
           _adaptedFrame,
           null,
           1,
           _useAllFactorLevels,
           TransformType.NONE,
           TransformType.NONE,
           _skipMissing,
           _imputeMissing,
           false,
           _weights,
           _offset,
           _fold);
   res._adaptedFrame = new Frame(_adaptedFrame.names(), valid.vecs(_adaptedFrame.names()));
   res._valid = true;
   return res;
 }
Пример #8
0
    public GLMOutput(
        DataInfo dinfo,
        String[] column_names,
        String[][] domains,
        String[] coefficient_names,
        boolean binomial) {
      super(dinfo._weights, dinfo._offset, dinfo._fold);
      _dinfo = dinfo;
      _scoringDinfo = dinfo.scoringInfo();
      _names = column_names;
      _domains = domains;
      _coefficient_names = coefficient_names;
      _binomial = binomial;
      _nclasses = binomial ? 2 : 1;

      if (_binomial && domains[domains.length - 1] != null) {
        assert domains[domains.length - 1].length == 2
            : "Unexpected domains " + Arrays.toString(domains);
        binomialClassNames = domains[domains.length - 1];
      }
    }
Пример #9
0
 public DataInfo scoringInfo() {
   DataInfo res =
       new DataInfo(
           _adaptedFrame,
           null,
           1,
           _useAllFactorLevels,
           TransformType.NONE,
           TransformType.NONE,
           _skipMissing,
           _imputeMissing,
           !_skipMissing,
           _weights,
           _offset,
           _fold);
   res._adaptedFrame = null;
   res._weights = false;
   res._offset = false;
   res._fold = false;
   res._responses = 0;
   res._valid = true;
   res._interactions = _interactions;
   return res;
 }
Пример #10
0
    /**
     * Train a Deep Learning model, assumes that all members are populated If checkpoint == null,
     * then start training a new model, otherwise continue from a checkpoint
     */
    public final void buildModel() {
      DeepLearningModel cp = null;
      if (_parms._checkpoint == null) {
        cp =
            new DeepLearningModel(
                dest(),
                _parms,
                new DeepLearningModel.DeepLearningModelOutput(DeepLearning.this),
                _train,
                _valid,
                nclasses());
        cp.model_info().initializeMembers();
      } else {
        final DeepLearningModel previous = DKV.getGet(_parms._checkpoint);
        if (previous == null) throw new IllegalArgumentException("Checkpoint not found.");
        Log.info("Resuming from checkpoint.");
        _job.update(0, "Resuming from checkpoint");

        if (isClassifier() != previous._output.isClassifier())
          throw new H2OIllegalArgumentException(
              "Response type must be the same as for the checkpointed model.");
        if (isSupervised() != previous._output.isSupervised())
          throw new H2OIllegalArgumentException(
              "Model type must be the same as for the checkpointed model.");

        // check the user-given arguments for consistency
        DeepLearningParameters oldP =
            previous._parms; // sanitized parameters for checkpointed model
        DeepLearningParameters newP = _parms; // user-given parameters for restart

        DeepLearningParameters oldP2 = (DeepLearningParameters) oldP.clone();
        DeepLearningParameters newP2 = (DeepLearningParameters) newP.clone();
        DeepLearningParameters.Sanity.modifyParms(
            oldP, oldP2, nclasses()); // sanitize the user-given parameters
        DeepLearningParameters.Sanity.modifyParms(
            newP, newP2, nclasses()); // sanitize the user-given parameters
        DeepLearningParameters.Sanity.checkpoint(oldP2, newP2);

        DataInfo dinfo;
        try {
          // PUBDEV-2513: Adapt _train and _valid (in-place) to match the frames that were used for
          // the previous model
          // This can add or remove dummy columns (can happen if the dataset is sparse and datasets
          // have different non-const columns)
          for (String st : previous.adaptTestForTrain(_train, true, false)) Log.warn(st);
          for (String st : previous.adaptTestForTrain(_valid, true, false)) Log.warn(st);
          dinfo = makeDataInfo(_train, _valid, _parms, nclasses());
          DKV.put(dinfo);
          cp = new DeepLearningModel(dest(), _parms, previous, false, dinfo);
          cp.write_lock(_job);

          if (!Arrays.equals(cp._output._names, previous._output._names)) {
            throw new H2OIllegalArgumentException(
                "The columns of the training data must be the same as for the checkpointed model. Check ignored columns (or disable ignore_const_cols).");
          }
          if (!Arrays.deepEquals(cp._output._domains, previous._output._domains)) {
            throw new H2OIllegalArgumentException(
                "Categorical factor levels of the training data must be the same as for the checkpointed model.");
          }
          if (dinfo.fullN() != previous.model_info().data_info().fullN()) {
            throw new H2OIllegalArgumentException(
                "Total number of predictors is different than for the checkpointed model.");
          }
          if (_parms._epochs <= previous.epoch_counter) {
            throw new H2OIllegalArgumentException(
                "Total number of epochs must be larger than the number of epochs already trained for the checkpointed model ("
                    + previous.epoch_counter
                    + ").");
          }

          // these are the mutable parameters that are to be used by the model (stored in
          // model_info._parms)
          final DeepLearningParameters actualNewP =
              cp.model_info()
                  .get_params(); // actually used parameters for model building (defaults filled in,
                                 // etc.)
          assert (actualNewP != previous.model_info().get_params());
          assert (actualNewP != newP);
          assert (actualNewP != oldP);
          DeepLearningParameters.Sanity.update(actualNewP, newP, nclasses());

          Log.info(
              "Continuing training after "
                  + String.format("%.3f", previous.epoch_counter)
                  + " epochs from the checkpointed model.");
          cp.update(_job);
        } catch (H2OIllegalArgumentException ex) {
          if (cp != null) {
            cp.unlock(_job);
            cp.delete();
            cp = null;
          }
          throw ex;
        } finally {
          if (cp != null) cp.unlock(_job);
        }
      }
      trainModel(cp);

      // clean up, but don't delete weights and biases if user asked for export
      List<Key> keep = new ArrayList<>();
      try {
        if (_parms._export_weights_and_biases
            && cp._output.weights != null
            && cp._output.biases != null) {
          for (Key k : Arrays.asList(cp._output.weights)) {
            keep.add(k);
            for (Vec vk : ((Frame) DKV.getGet(k)).vecs()) {
              keep.add(vk._key);
            }
          }
          for (Key k : Arrays.asList(cp._output.biases)) {
            keep.add(k);
            for (Vec vk : ((Frame) DKV.getGet(k)).vecs()) {
              keep.add(vk._key);
            }
          }
        }
      } finally {
        Scope.exit(keep.toArray(new Key[keep.size()]));
      }
    }
Пример #11
0
  /**
   * Main constructor
   *
   * @param params Model parameters
   * @param dinfo Data Info
   * @param nClasses number of classes (1 for regression, 0 for autoencoder)
   * @param train User-given training data frame, prepared by AdaptTestTrain
   * @param valid User-specified validation data frame, prepared by AdaptTestTrain
   */
  public DeepLearningModelInfo(
      final DeepLearningParameters params,
      final DataInfo dinfo,
      int nClasses,
      Frame train,
      Frame valid) {
    _classification = nClasses > 1;
    _train = train;
    _valid = valid;
    data_info = dinfo;
    parameters =
        (DeepLearningParameters) params.clone(); // make a copy, don't change model's parameters
    DeepLearningParameters.Sanity.modifyParms(
        parameters, parameters, nClasses); // sanitize the model_info's parameters

    final int num_input = dinfo.fullN();
    final int num_output =
        get_params()._autoencoder
            ? num_input
            : (_classification ? train.lastVec().cardinality() : 1);
    if (!get_params()._autoencoder) assert (num_output == nClasses);

    _saw_missing_cats = dinfo._cats > 0 ? new boolean[data_info._cats] : null;
    assert (num_input > 0);
    assert (num_output > 0);
    if (has_momenta() && adaDelta())
      throw new IllegalArgumentException(
          "Cannot have non-zero momentum and adaptive rate at the same time.");
    final int layers = get_params()._hidden.length;
    // units (# neurons for each layer)
    units = new int[layers + 2];
    if (get_params()._max_categorical_features <= Integer.MAX_VALUE - dinfo._nums)
      units[0] = Math.min(dinfo._nums + get_params()._max_categorical_features, num_input);
    else units[0] = num_input;
    System.arraycopy(get_params()._hidden, 0, units, 1, layers);
    units[layers + 1] = num_output;

    boolean printLevels = units[0] > 1000L;
    boolean warn = units[0] > 100000L;
    if (printLevels) {
      final String[][] domains = dinfo._adaptedFrame.domains();
      int[] levels = new int[domains.length];
      for (int i = 0; i < levels.length; ++i) {
        levels[i] = domains[i] != null ? domains[i].length : 0;
      }
      Arrays.sort(levels);
      if (warn) {
        Log.warn(
            "===================================================================================================================================");
        Log.warn(
            num_input
                + " input features"
                + (dinfo._cats > 0 ? " (after categorical one-hot encoding)" : "")
                + ". Can be slow and require a lot of memory.");
      }
      if (levels[levels.length - 1] > 0) {
        int levelcutoff = levels[levels.length - 1 - Math.min(10, levels.length - 1)];
        int count = 0;
        for (int i = 0;
            i < dinfo._adaptedFrame.numCols() - (get_params()._autoencoder ? 0 : 1) && count < 10;
            ++i) {
          if (dinfo._adaptedFrame.domains()[i] != null
              && dinfo._adaptedFrame.domains()[i].length >= levelcutoff) {
            if (warn) {
              Log.warn(
                  "Categorical feature '"
                      + dinfo._adaptedFrame._names[i]
                      + "' has cardinality "
                      + dinfo._adaptedFrame.domains()[i].length
                      + ".");
            } else {
              Log.info(
                  "Categorical feature '"
                      + dinfo._adaptedFrame._names[i]
                      + "' has cardinality "
                      + dinfo._adaptedFrame.domains()[i].length
                      + ".");
            }
          }
          count++;
        }
      }
      if (warn) {
        Log.warn("Suggestions:");
        Log.warn(" *) Limit the size of the first hidden layer");
        if (dinfo._cats > 0) {
          Log.warn(
              " *) Limit the total number of one-hot encoded features with the parameter 'max_categorical_features'");
          Log.warn(
              " *) Run h2o.interaction(...,pairwise=F) on high-cardinality categorical columns to limit the factor count, see http://learn.h2o.ai");
        }
        Log.warn(
            "===================================================================================================================================");
      }
    }

    // weights (to connect layers)
    dense_row_weights = new Storage.DenseRowMatrix[layers + 1];
    dense_col_weights = new Storage.DenseColMatrix[layers + 1];

    // decide format of weight matrices row-major or col-major
    if (get_params()._col_major)
      dense_col_weights[0] = new Storage.DenseColMatrix(units[1], units[0]);
    else dense_row_weights[0] = new Storage.DenseRowMatrix(units[1], units[0]);
    for (int i = 1; i <= layers; ++i)
      dense_row_weights[i] = new Storage.DenseRowMatrix(units[i + 1] /*rows*/, units[i] /*cols*/);

    // biases (only for hidden layers and output layer)
    biases = new Storage.DenseVector[layers + 1];
    for (int i = 0; i <= layers; ++i) biases[i] = new Storage.DenseVector(units[i + 1]);
    // average activation (only for hidden layers)
    if (get_params()._autoencoder && get_params()._sparsity_beta > 0) {
      avg_activations = new Storage.DenseVector[layers];
      mean_a = new float[layers];
      for (int i = 0; i < layers; ++i) avg_activations[i] = new Storage.DenseVector(units[i + 1]);
    }
    allocateHelperArrays();
    // for diagnostics
    mean_rate = new float[units.length];
    rms_rate = new float[units.length];
    mean_bias = new float[units.length];
    rms_bias = new float[units.length];
    mean_weight = new float[units.length];
    rms_weight = new float[units.length];
  }
Пример #12
0
 public DataInfo filterExpandedColumns(int[] cols) {
   assert _predictor_transform != null;
   assert _response_transform != null;
   if (cols == null) return deep_clone();
   int hasIcpt = (cols.length > 0 && cols[cols.length - 1] == fullN()) ? 1 : 0;
   int i = 0, j = 0, ignoredCnt = 0;
   // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub,
   // double [] normMul, double [] normRespSub, double [] normRespMul){
   int[][] catLvls = new int[_cats][];
   int[] ignoredCols = MemoryManager.malloc4(_nums + _cats);
   // first do categoricals...
   if (_catOffsets != null) {
     int coff = _useAllFactorLevels ? 0 : 1;
     while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) {
       int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]);
       int k = 0;
       while (i < cols.length && cols[i] < _catOffsets[j + 1])
         levels[k++] = (cols[i++] - _catOffsets[j]) + coff;
       if (k > 0) catLvls[j] = Arrays.copyOf(levels, k);
       ++j;
     }
   }
   int[] catModes = _catModes;
   for (int k = 0; k < catLvls.length; ++k) if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
   if (ignoredCnt > 0) {
     int[][] cs = new int[_cats - ignoredCnt][];
     catModes = new int[_cats - ignoredCnt];
     int y = 0;
     for (int c = 0; c < catLvls.length; ++c)
       if (catLvls[c] != null) {
         catModes[y] = _catModes[c];
         cs[y++] = catLvls[c];
       }
     assert y == cs.length;
     catLvls = cs;
   }
   // now numerics
   int prev = j = 0;
   for (; i < cols.length; ++i) {
     for (int k = prev; k < (cols[i] - numStart()); ++k) {
       ignoredCols[ignoredCnt++] = k + _cats;
       ++j;
     }
     prev = ++j;
   }
   for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats;
   Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone());
   if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt));
   assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();
   double[] normSub = null;
   double[] normMul = null;
   int id = Arrays.binarySearch(cols, numStart());
   if (id < 0) id = -id - 1;
   int nnums = cols.length - id - hasIcpt;
   int off = numStart();
   if (_normSub != null) {
     normSub = new double[nnums];
     for (int k = id; k < (id + nnums); ++k) normSub[k - id] = _normSub[cols[k] - off];
   }
   if (_normMul != null) {
     normMul = new double[nnums];
     for (int k = id; k < (id + nnums); ++k) normMul[k - id] = _normMul[cols[k] - off];
   }
   // public DataInfo(Frame train, Frame valid, int nResponses, boolean useAllFactorLevels,
   // TransformType predictor_transform, TransformType response_transform, boolean skipMissing,
   // boolean imputeMissing, boolean missingBucket, boolean weight, boolean offset, boolean fold) {
   DataInfo dinfo = new DataInfo(this, f, normMul, normSub, catLvls, catModes);
   dinfo._activeCols = cols;
   return dinfo;
 }
Пример #13
0
 @Override
 protected void setupLocal() {
   DataInfo dinfo = DKV.get(_dinfoKey).get();
   _dinfo = _activeCols == null ? dinfo : dinfo.filterExpandedColumns(_activeCols);
 }
Пример #14
0
  /**
   * Extracts the values, applies regularization to numerics, adds appropriate offsets to
   * categoricals, and adapts response according to the CaseMode/CaseValue if set.
   */
  @Override
  public final void map(Chunk[] chunks, NewChunk[] outputs) {
    if (_jobKey != null && !Job.isRunning(_jobKey)) throw new JobCancelledException();
    final int nrows = chunks[0]._len;
    final long offset = chunks[0].start();
    boolean doWork = chunkInit();
    if (!doWork) return;
    final boolean obs_weights = _dinfo._weights && !_fr.vecs()[_dinfo.weightChunkId()].isConst();
    final double global_weight_sum =
        obs_weights ? _fr.vecs()[_dinfo.weightChunkId()].mean() * _fr.numRows() : 0;

    DataInfo.Row row = _dinfo.newDenseRow();
    double[] weight_map = null;
    double relative_chunk_weight = 1;
    // TODO: store node-local helper arrays in _dinfo -> avoid re-allocation and construction
    if (obs_weights) {
      weight_map = new double[nrows];
      double weight_sum = 0;
      for (int i = 0; i < nrows; ++i) {
        row = _dinfo.extractDenseRow(chunks, i, row);
        weight_sum += row.weight;
        weight_map[i] = weight_sum;
        assert (i == 0 || row.weight == 0 || weight_map[i] > weight_map[i - 1]);
      }
      if (weight_sum > 0) {
        ArrayUtils.div(weight_map, weight_sum); // normalize to 0...1
        relative_chunk_weight = global_weight_sum * nrows / _fr.numRows() / weight_sum;
      } else return; // nothing to do here - all rows have 0 weight
    }

    // Example:
    // _useFraction = 0.8 -> 1 repeat with fraction = 0.8
    // _useFraction = 1.0 -> 1 repeat with fraction = 1.0
    // _useFraction = 1.1 -> 2 repeats with fraction = 0.55
    // _useFraction = 2.1 -> 3 repeats with fraction = 0.7
    // _useFraction = 3.0 -> 3 repeats with fraction = 1.0
    final int repeats = (int) Math.ceil(_useFraction * relative_chunk_weight);
    final float fraction = (float) (_useFraction * relative_chunk_weight) / repeats;
    assert (fraction <= 1.0);

    final boolean sample = (fraction < 0.999 || obs_weights || _shuffle);
    final Random skip_rng =
        sample
            ? RandomUtils.getRNG(
                (0x8734093502429734L + _seed + offset) * (_iteration + 0x9823423497823423L))
            : null;

    long num_processed_rows = 0;
    for (int rep = 0; rep < repeats; ++rep) {
      for (int row_idx = 0; row_idx < nrows; ++row_idx) {
        int r = sample ? -1 : 0;
        // only train with a given number of training samples (fraction*nrows)
        if (sample && !obs_weights && skip_rng.nextDouble() > fraction) continue;
        if (obs_weights
            && num_processed_rows % 2
                == 0) { // every second row is randomly sampled -> that way we won't "forget" rare
          // rows
          // importance sampling based on inverse of cumulative distribution
          double key = skip_rng.nextDouble();
          r = Arrays.binarySearch(weight_map, 0, nrows, key);
          //          Log.info(Arrays.toString(weight_map));
          //          Log.info("key: " + key + " idx: " + (r >= 0 ? r : (-r-1)));
          if (r < 0) r = -r - 1;
          assert (r == 0 || weight_map[r] > weight_map[r - 1]);
        } else if (r == -1) {
          do {
            r = skip_rng.nextInt(nrows); // random sampling (with replacement)
          }
          // if we have weights, and we did the %2 skipping above, then we need to find an alternate
          // row with non-zero weight
          while (obs_weights
              && ((r == 0 && weight_map[0] == 0) || (r > 0 && weight_map[r] == weight_map[r - 1])));
        } else {
          assert (!obs_weights);
          r = row_idx; // linear scan - slightly faster
        }
        assert (r >= 0 && r <= nrows);

        row = _dinfo.extractDenseRow(chunks, r, row);
        if (!row.bad) {
          assert (row.weight
              > 0); // check that we never process a row that was held out via row.weight = 0
          long seed = offset + rep * nrows + r;
          if (outputs != null && outputs.length > 0) processRow(seed++, row, outputs);
          else processRow(seed++, row);
        }
        num_processed_rows++;
      }
    }
    assert (fraction != 1 || num_processed_rows == repeats * nrows);
    chunkDone(num_processed_rows);
  }
Пример #15
0
    // Main worker thread
    @Override
    protected void compute2() {
      PCAModel model = null;
      DataInfo dinfo = null;
      DataInfo xinfo = null;
      Frame x = null;

      try {
        init(true); // Initialize parameters
        _parms.read_lock_frames(PCA.this); // Fetch & read-lock input frames
        if (error_count() > 0)
          throw new IllegalArgumentException("Found validation errors: " + validationErrors());

        // The model to be built
        model = new PCAModel(dest(), _parms, new PCAModel.PCAOutput(PCA.this));
        model.delete_and_lock(_key);

        if (_parms._pca_method == PCAParameters.Method.GramSVD) {
          dinfo =
              new DataInfo(
                  Key.make(),
                  _train,
                  null,
                  0,
                  _parms._use_all_factor_levels,
                  _parms._transform,
                  DataInfo.TransformType.NONE,
                  /* skipMissing */ true, /* missingBucket */
                  false, /* weights */
                  false, /* offset */
                  false, /* intercept */
                  false);
          DKV.put(dinfo._key, dinfo);

          // Calculate and save Gram matrix of training data
          // NOTE: Gram computes A'A/n where n = nrow(A) = number of rows in training set (excluding
          // rows with NAs)
          GramTask gtsk = new Gram.GramTask(self(), dinfo).doAll(dinfo._adaptedFrame);
          Gram gram =
              gtsk._gram; // TODO: This ends up with all NaNs if training data has too many missing
          // values
          assert gram.fullN() == _ncolExp;

          // Compute SVD of Gram A'A/n using JAMA library
          // Note: Singular values ordered in weakly descending order by algorithm
          Matrix gramJ = new Matrix(gtsk._gram.getXX());
          SingularValueDecomposition svdJ = gramJ.svd();
          computeStatsFillModel(model, dinfo, svdJ, gram, gtsk._nobs);

        } else if (_parms._pca_method == PCAParameters.Method.Power) {
          SVDModel.SVDParameters parms = new SVDModel.SVDParameters();
          parms._train = _parms._train;
          parms._ignored_columns = _parms._ignored_columns;
          parms._ignore_const_cols = _parms._ignore_const_cols;
          parms._score_each_iteration = _parms._score_each_iteration;
          parms._use_all_factor_levels = _parms._use_all_factor_levels;
          parms._transform = _parms._transform;
          parms._nv = _parms._k;
          parms._max_iterations = _parms._max_iterations;
          parms._seed = _parms._seed;

          // Calculate standard deviation and projection as well
          parms._only_v = false;
          parms._u_name = _parms._loading_name;
          parms._keep_u = _parms._keep_loading;

          SVDModel svd = null;
          SVD job = null;
          try {
            job = new EmbeddedSVD(_key, _progressKey, parms);
            svd = job.trainModel().get();
            if (job.isCancelledOrCrashed()) PCA.this.cancel();
          } finally {
            if (job != null) job.remove();
            if (svd != null) svd.remove();
          }
          // Recover PCA results from SVD model
          computeStatsFillModel(model, svd);

        } else if (_parms._pca_method == PCAParameters.Method.GLRM) {
          GLRMModel.GLRMParameters parms = new GLRMModel.GLRMParameters();
          parms._train = _parms._train;
          parms._ignored_columns = _parms._ignored_columns;
          parms._ignore_const_cols = _parms._ignore_const_cols;
          parms._score_each_iteration = _parms._score_each_iteration;
          parms._transform = _parms._transform;
          parms._k = _parms._k;
          parms._max_iterations = _parms._max_iterations;
          parms._seed = _parms._seed;

          parms._recover_svd = true;
          parms._loss = GLRMModel.GLRMParameters.Loss.L2;
          parms._gamma_x = 0;
          parms._gamma_y = 0;

          GLRMModel glrm = null;
          GLRM job = null;
          try {
            job = new EmbeddedGLRM(_key, _progressKey, parms);
            glrm = job.trainModel().get();
            if (job.isCancelledOrCrashed()) PCA.this.cancel();
          } finally {
            if (job != null) job.remove();
            if (glrm != null) {
              glrm._parms._loading_key.get().delete();
              glrm.remove();
            }
          }
          // Recover PCA results from GLRM model
          computeStatsFillModel(model, glrm);
        }

        model.update(self());
        update(1);
        done();
      } catch (Throwable t) {
        Job thisJob = DKV.getGet(_key);
        if (thisJob._state == JobState.CANCELLED) {
          Log.info("Job cancelled by user.");
        } else {
          t.printStackTrace();
          failed(t);
          throw t;
        }
      } finally {
        _parms.read_unlock_frames(PCA.this);
        if (model != null) model.unlock(_key);
        if (dinfo != null) dinfo.remove();
        if (xinfo != null) xinfo.remove();
        if (x != null && !_parms._keep_loading) x.delete();
      }
      tryComplete();
    }
Пример #16
0
 public double[] getNormBeta() {
   return _submodels[_best_lambda_idx].getBeta(MemoryManager.malloc8d(_dinfo.fullN() + 1));
 }
Пример #17
0
 public DataInfo filterExpandedColumns(int[] cols) {
   assert _predictor_transform != null;
   assert _response_transform != null;
   if (cols == null) return this;
   int i = 0, j = 0, ignoredCnt = 0;
   // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub,
   // double [] normMul, double [] normRespSub, double [] normRespMul){
   int[][] catLvls = new int[_cats][];
   int[] ignoredCols = MemoryManager.malloc4(_nums + _cats);
   // first do categoricals...
   if (_catOffsets != null) {
     int coff = _useAllFactorLevels ? 0 : 1;
     while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) {
       int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]);
       int k = 0;
       while (i < cols.length && cols[i] < _catOffsets[j + 1])
         levels[k++] = cols[i++] - _catOffsets[j] + coff;
       if (k > 0) catLvls[j] = Arrays.copyOf(levels, k);
       ++j;
     }
   }
   for (int k = 0; k < catLvls.length; ++k) if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
   if (ignoredCnt > 0) {
     int[][] c = new int[_cats - ignoredCnt][];
     int y = 0;
     for (int[] catLvl : catLvls) if (catLvl != null) c[y++] = catLvl;
     assert y == c.length;
     catLvls = c;
   }
   // now numerics
   int prev = j = 0;
   for (; i < cols.length; ++i) {
     for (int k = prev; k < (cols[i] - numStart()); ++k) {
       ignoredCols[ignoredCnt++] = k + _cats;
       ++j;
     }
     prev = ++j;
   }
   for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats;
   Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone());
   if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt));
   assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();
   double[] normSub = null;
   double[] normMul = null;
   int id = Arrays.binarySearch(cols, numStart());
   if (id < 0) id = -id - 1;
   int nnums = cols.length - id;
   int off = numStart();
   if (_normSub != null) {
     normSub = new double[nnums];
     for (int k = id; k < cols.length; ++k) normSub[k - id] = _normSub[cols[k] - off];
   }
   if (_normMul != null) {
     normMul = new double[nnums];
     for (int k = id; k < cols.length; ++k) normMul[k - id] = _normMul[cols[k] - off];
   }
   DataInfo dinfo =
       new DataInfo(
           _key,
           f,
           normMul,
           normSub,
           catLvls,
           _responses,
           _predictor_transform,
           _response_transform,
           _skipMissing,
           _imputeMissing,
           _weights,
           _offset,
           _fold);
   // do not put activeData into K/V - active data is recreated on each node based on active
   // columns
   dinfo._activeCols = cols;
   return dinfo;
 }
Пример #18
0
    // Main worker thread
    @Override
    protected void compute2() {

      KMeansModel model = null;
      try {
        init(true);
        // Do lock even before checking the errors, since this block is finalized by unlock
        // (not the best solution, but the code is more readable)
        _parms.read_lock_frames(KMeans.this); // Fetch & read-lock input frames
        // Something goes wrong
        if (error_count() > 0)
          throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(KMeans.this);
        // The model to be built
        model = new KMeansModel(dest(), _parms, new KMeansModel.KMeansOutput(KMeans.this));
        model.delete_and_lock(_key);

        //
        final Vec vecs[] = _train.vecs();
        // mults & means for standardization
        final double[] means = _train.means(); // means are used to impute NAs
        final double[] mults = _parms._standardize ? _train.mults() : null;
        final int[] impute_cat = new int[vecs.length];
        for (int i = 0; i < vecs.length; i++)
          impute_cat[i] = vecs[i].isNumeric() ? -1 : DataInfo.imputeCat(vecs[i]);
        model._output._normSub = means;
        model._output._normMul = mults;
        // Initialize cluster centers and standardize if requested
        double[][] centers = initial_centers(model, vecs, means, mults, impute_cat);
        if (centers == null) return; // Stopped/cancelled during center-finding
        double[][] oldCenters = null;

        // ---
        // Run the main KMeans Clustering loop
        // Stop after enough iterations or average_change < TOLERANCE
        model._output._iterations =
            0; // Loop ends only when iterations > max_iterations with strict inequality
        while (!isDone(model, centers, oldCenters)) {
          Lloyds task =
              new Lloyds(centers, means, mults, impute_cat, _isCats, _parms._k, hasWeightCol())
                  .doAll(vecs);
          // Pick the max categorical level for cluster center
          max_cats(task._cMeans, task._cats, _isCats);

          // Handle the case where some centers go dry.  Rescue only 1 cluster
          // per iteration ('cause we only tracked the 1 worst row)
          if (cleanupBadClusters(task, vecs, centers, means, mults, impute_cat)) continue;

          // Compute model stats; update standardized cluster centers
          oldCenters = centers;
          centers = computeStatsFillModel(task, model, vecs, means, mults, impute_cat);

          model.update(_key); // Update model in K/V store
          update(1); // One unit of work
          if (model._parms._score_each_iteration) Log.info(model._output._model_summary);
        }

        Log.info(model._output._model_summary);
        //        Log.info(model._output._scoring_history);
        //
        // Log.info(((ModelMetricsClustering)model._output._training_metrics).createCentroidStatsTable().toString());

        // At the end: validation scoring (no need to gather scoring history)
        if (_valid != null) {
          model.score(_parms.valid()).delete(); // this appends a ModelMetrics on the validation set
          model._output._validation_metrics = ModelMetrics.getFromDKV(model, _parms.valid());
          model.update(_key); // Update model in K/V store
        }
        done(); // Job done!

      } catch (Throwable t) {
        Job thisJob = DKV.getGet(_key);
        if (thisJob._state == JobState.CANCELLED) {
          Log.info("Job cancelled by user.");
        } else {
          t.printStackTrace();
          failed(t);
          throw t;
        }
      } finally {
        updateModelOutput();
        if (model != null) model.unlock(_key);
        _parms.read_unlock_frames(KMeans.this);
      }
      tryComplete();
    }
Пример #19
0
    @Override
    protected void compute2() {
      CoxPHModel model = null;
      try {
        Scope.enter();
        _parms.read_lock_frames(CoxPH.this);
        init(true);

        applyScoringFrameSideEffects();

        // The model to be built
        model = new CoxPHModel(dest(), _parms, new CoxPHModel.CoxPHOutput(CoxPH.this));
        model.delete_and_lock(_key);

        applyTrainingFrameSideEffects();

        int nResponses = 1;
        boolean useAllFactorLevels = false;
        final DataInfo dinfo =
            new DataInfo(
                Key.make(),
                _modelBuilderTrain,
                null,
                nResponses,
                useAllFactorLevels,
                DataInfo.TransformType.DEMEAN,
                TransformType.NONE,
                true,
                false,
                false,
                false,
                false,
                false);
        initStats(model, dinfo);

        final int n_offsets =
            (model._parms.offset_columns == null) ? 0 : model._parms.offset_columns.length;
        final int n_coef = dinfo.fullN() - n_offsets;
        final double[] step = MemoryManager.malloc8d(n_coef);
        final double[] oldCoef = MemoryManager.malloc8d(n_coef);
        final double[] newCoef = MemoryManager.malloc8d(n_coef);
        Arrays.fill(step, Double.NaN);
        Arrays.fill(oldCoef, Double.NaN);
        for (int j = 0; j < n_coef; ++j) newCoef[j] = model._parms.init;
        double oldLoglik = -Double.MAX_VALUE;
        final int n_time = (int) (model._output.max_time - model._output.min_time + 1);
        final boolean has_start_column = (model._parms.start_column != null);
        final boolean has_weights_column = (model._parms.weights_column != null);
        for (int i = 0; i <= model._parms.iter_max; ++i) {
          model._output.iter = i;

          final CoxPHTask coxMR =
              new CoxPHTask(
                      self(),
                      dinfo,
                      newCoef,
                      model._output.min_time,
                      n_time,
                      n_offsets,
                      has_start_column,
                      has_weights_column)
                  .doAll(dinfo._adaptedFrame);

          final double newLoglik = calcLoglik(model, coxMR);
          if (newLoglik > oldLoglik) {
            if (i == 0) calcCounts(model, coxMR);

            calcModelStats(model, newCoef, newLoglik);
            calcCumhaz_0(model, coxMR);

            if (newLoglik == 0) model._output.lre = -Math.log10(Math.abs(oldLoglik - newLoglik));
            else model._output.lre = -Math.log10(Math.abs((oldLoglik - newLoglik) / newLoglik));
            if (model._output.lre >= model._parms.lre_min) break;

            Arrays.fill(step, 0);
            for (int j = 0; j < n_coef; ++j)
              for (int k = 0; k < n_coef; ++k)
                step[j] -= model._output.var_coef[j][k] * model._output.gradient[k];
            for (int j = 0; j < n_coef; ++j)
              if (Double.isNaN(step[j]) || Double.isInfinite(step[j])) break;

            oldLoglik = newLoglik;
            System.arraycopy(newCoef, 0, oldCoef, 0, oldCoef.length);
          } else {
            for (int j = 0; j < n_coef; ++j) step[j] /= 2;
          }

          for (int j = 0; j < n_coef; ++j) newCoef[j] = oldCoef[j] - step[j];
        }

        model.update(_key);
      } catch (Throwable t) {
        Job thisJob = DKV.getGet(_key);
        if (thisJob._state == JobState.CANCELLED) {
          Log.info("Job cancelled by user.");
        } else {
          t.printStackTrace();
          failed(t);
          throw t;
        }
      } finally {
        updateModelOutput();
        _parms.read_unlock_frames(CoxPH.this);
        Scope.exit();
        done(); // Job done!
      }
      tryComplete();
    }