Example #1
 @Test
 public void testDomains() {
   Frame frame = parse_test_file("smalldata/junit/weather.csv");
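   // Convert each selected numeric column to a categorical, swapping the new Vec into the frame.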
   for (String s : new String[] {"MaxWindSpeed", "RelHumid9am", "Cloud9am"}) {
     Vec v = frame.vec(s);
     Vec newV = v.toCategoricalVec();
     frame.remove(s);
     frame.add(s, newV);
     v.remove();
   }
   DKV.put(frame);
   AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
   parms._train = frame._key;
   parms._radius_scale = 10;
   AggregatorModel agg = new Aggregator(parms).trainModel().get();
   Frame output = agg._output._output_frame.get();
   Assert.assertTrue(output.numRows() < 0.5 * frame.numRows());
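   // Expect at least one categorical domain length to differ between input and output.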
   boolean same = true;
   for (int i = 0; i < frame.numCols(); ++i) {
     if (frame.vec(i).isCategorical()) {
       same = (frame.domains()[i].length == output.domains()[i].length);
       if (!same) break;
     }
   }
   frame.remove();
   output.remove();
   agg.remove();
   Assert.assertFalse(same);
 }
Example #2
  @Test
  public void testImpute() {
    Frame fr = null;
    try {
      // Impute fuel economy via the "mean" method, no grouping.
      String tree = "(h2o.impute hex 1 \"mean\" \"low\" [])";
      fr = chkTree(tree, "smalldata/junit/cars.csv");
      chkDim(fr, 8, 406);

      Assert.assertEquals(0, fr.vec(1).naCnt()); // No NAs anymore
      Assert.assertEquals(23.51, fr.vec(1).at(26), 1e-1); // Row 26 was an NA, now the overall mean economy
      fr.delete();

      // Impute fuel economy via the "mean" method, after grouping by year.  Update in place.
      tree = "(h2o.impute hex 1 \"mean\" \"low\" [7])";
      fr = chkTree(tree, "smalldata/junit/cars.csv");
      chkDim(fr, 8, 406);

      Assert.assertEquals(0, fr.vec(1).naCnt()); // No NAs anymore
      Assert.assertEquals(
          17.69, fr.vec(1).at(26), 1e-1); // Row 26 was an NA, now the 1970 group mean economy

    } finally {
      if (fr != null) fr.delete();
      Keyed.remove(Key.make("hex"));
    }
  }
Example #3
 private static void assertColFrameEquals(double[] expected, Frame actual) {
   assertEquals(1, actual.numCols());
   assertEquals(expected.length, actual.numRows());
   for (int i = 0; i < expected.length; i++) {
     assertEquals("Wrong sum in row " + i, expected[i], actual.vec(0).at(i), 1e-8);
   }
 }
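A minimal usage sketch for this helper (hypothetical values; frame/ard are the test-infrastructure builders also used in Example #9):

 // Hypothetical usage: verify a one-column frame row by row.
 Frame result = frame(ard(ard(1.0), ard(2.5), ard(4.0))); // 3 rows x 1 column
 assertColFrameEquals(new double[] {1.0, 2.5, 4.0}, result);
 result.delete(); // clean up the DKV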
Example #4
 private void setTransform(
     TransformType t, double[] normMul, double[] normSub, int vecStart, int n) {
   int idx = 0; // idx!=i when interactions are in play, otherwise, it's just 'i'
   for (int i = 0; i < n; ++i) {
     Vec v = _adaptedFrame.vec(vecStart + i);
     boolean isIWV = isInteractionVec(vecStart + i);
     switch (t) {
       case STANDARDIZE:
         normMul[idx] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
         if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
         normSub[idx] = v.mean();
         break;
       case NORMALIZE:
         normMul[idx] = (v.max() - v.min() > 0) ? 1.0 / (v.max() - v.min()) : 1.0;
         if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
         normSub[idx] = v.mean();
         break;
       case DEMEAN:
         normMul[idx] = 1;
         if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
         normSub[idx] = v.mean();
         break;
       case DESCALE:
         normMul[idx] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
         if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
         normSub[idx] = 0;
         break;
       default:
         throw H2O.unimpl();
     }
     assert !Double.isNaN(normMul[idx]);
     assert !Double.isNaN(normSub[idx]);
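      // An interaction vec expands into several numeric slots; advance idx past the whole block.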
     idx = isIWV ? (idx + nextNumericIdx(i)) : (idx + 1);
   }
 }
Example #5
 private void setTransform(
     TransformType t, double[] normMul, double[] normSub, int vecStart, int n) {
   for (int i = 0; i < n; ++i) {
     Vec v = _adaptedFrame.vec(vecStart + i);
     switch (t) {
       case STANDARDIZE:
         normMul[i] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
         normSub[i] = v.mean();
         break;
       case NORMALIZE:
         normMul[i] = (v.max() - v.min() > 0) ? 1.0 / (v.max() - v.min()) : 1.0;
         normSub[i] = v.mean();
         break;
       case DEMEAN:
         normMul[i] = 1;
         normSub[i] = v.mean();
         break;
       case DESCALE:
         normMul[i] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
         normSub[i] = 0;
         break;
       default:
         throw H2O.unimpl();
     }
     assert !Double.isNaN(normMul[i]);
     assert !Double.isNaN(normSub[i]);
   }
 }
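For reference, a self-contained sketch (assumed values, hypothetical class name) of how a normSub/normMul pair computed above is consumed; it mirrors the (d - normSub) * normMul application seen in Example #10:

 // TransformSketch.java -- illustrative only; the column statistics are assumptions.
 public class TransformSketch {
   public static void main(String[] args) {
     double mean = 10.0, sigma = 2.0; // assumed column statistics
     double normSub = mean; // STANDARDIZE: subtract the mean...
     double normMul = (sigma != 0) ? 1.0 / sigma : 1.0; // ...then divide by sigma (zero-sigma guard)
     double raw = 14.0; // assumed raw cell value
     double standardized = (raw - normSub) * normMul; // (14 - 10) / 2 == 2.0
     System.out.println(standardized); // prints 2.0
   }
 }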
Example #6
 private static void assertRowFrameEquals(double[] expected, Frame actual) {
   assertEquals(1, actual.numRows());
   assertEquals(expected.length, actual.numCols());
   for (int i = 0; i < expected.length; i++) {
     assertEquals("Wrong sum in column " + actual.name(i), expected[i], actual.vec(i).at(0), 1e-8);
   }
 }
Example #7
  /**
   * Project each archetype into the original feature space.
   *
   * @param frame Original training data with m rows and n columns
   * @param destination_key Frame Id for output
   * @param reverse_transform Whether to undo the training transform (re-scale and re-center) on
   *     the numeric columns of the projection
   * @return Frame containing k rows and n columns, where each row corresponds to an archetype
   */
  public Frame scoreArchetypes(Frame frame, Key destination_key, boolean reverse_transform) {
    final int ncols = _output._names.length;
    Frame adaptedFr = new Frame(frame);
    adaptTestForTrain(adaptedFr, true, false);
    assert ncols == adaptedFr.numCols();
    String[][] adaptedDomme = adaptedFr.domains();
    double[][] proj = new double[_parms._k][_output._nnums + _output._ncats];

    // Categorical columns
    for (int d = 0; d < _output._ncats; d++) {
      double[][] block = _output._archetypes_raw.getCatBlock(d);
      for (int k = 0; k < _parms._k; k++)
        proj[k][_output._permutation[d]] = _parms.mimpute(block[k], _output._lossFunc[d]);
    }

    // Numeric columns
    for (int d = _output._ncats; d < (_output._ncats + _output._nnums); d++) {
      int ds = d - _output._ncats;
      for (int k = 0; k < _parms._k; k++) {
        double num = _output._archetypes_raw.getNum(ds, k);
        proj[k][_output._permutation[d]] = _parms.impute(num, _output._lossFunc[d]);
        if (reverse_transform)
          proj[k][_output._permutation[d]] =
              proj[k][_output._permutation[d]] / _output._normMul[ds] + _output._normSub[ds];
      }
    }

    // Convert projection of archetypes into a frame with correct domains
    Frame f =
        ArrayUtils.frame(
            (null == destination_key ? Key.make() : destination_key), adaptedFr.names(), proj);
    for (int i = 0; i < ncols; i++) f.vec(i).setDomain(adaptedDomme[i]);
    return f;
  }
Example #8
 // private constructor called by filterExpandedColumns
 private DataInfo(
     Key<DataInfo> selfKey,
     Frame fr,
     double[] normMul,
     double[] normSub,
     int[][] catLevels,
     int responses,
     TransformType predictor_transform,
     TransformType response_transform,
     boolean skipMissing,
     boolean imputeMissing,
     boolean weight,
     boolean offset,
     boolean fold) {
   super(selfKey);
   _offset = offset;
   _weights = weight;
   _fold = fold;
   _valid = false;
   assert predictor_transform != null;
   assert response_transform != null;
   _predictor_transform = predictor_transform;
   _response_transform = response_transform;
   _skipMissing = skipMissing;
   _imputeMissing = imputeMissing;
   _adaptedFrame = fr;
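    // Cumulative offsets of each categorical block within the expanded (one-hot) column space.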
   _catOffsets = MemoryManager.malloc4(catLevels.length + 1);
   _catMissing = new int[catLevels.length];
   int s = 0;
   for (int i = 0; i < catLevels.length; ++i) {
     _catOffsets[i] = s;
     s += catLevels[i].length;
   }
   _catLvls = catLevels;
   _catOffsets[_catOffsets.length - 1] = s;
   _responses = responses;
   _cats = catLevels.length;
   _nums =
       fr.numCols() - _cats - responses - (_offset ? 1 : 0) - (_weights ? 1 : 0) - (_fold ? 1 : 0);
   _useAllFactorLevels = true;
   _catModes = new int[_cats];
   _numMeans = new double[_nums];
   _normMul = normMul;
   _normSub = normSub;
   for (int i = 0; i < _cats; i++) _catModes[i] = imputeCat(_adaptedFrame.vec(i));
   for (int i = 0; i < _nums; i++) _numMeans[i] = _adaptedFrame.vec(_cats + i).mean();
 }
Example #9
 @Test
 public void testQuantile() {
   Frame f = null;
   try {
     Frame fr =
         frame(
             ard(
                 ard(1.223292e-02),
                 ard(1.635312e-25),
                 ard(1.601522e-11),
                 ard(8.452298e-10),
                 ard(2.643733e-10),
                 ard(2.671520e-06),
                 ard(1.165381e-06),
                 ard(7.193265e-10),
                 ard(3.383532e-04),
                 ard(2.561221e-05)));
     double[] probs = new double[] {0.001, 0.005, .01, .02, .05, .10, .50, .8883, .90, .99};
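      // Build a Rapids AST string: (quantile %<frame-key> [probs] "interpolate").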
     String x =
         String.format("(quantile %%%s %s \"interpolate\")", fr._key, Arrays.toString(probs));
     Val val = Exec.exec(x);
     fr.delete();
     f = val.getFrame();
     Assert.assertEquals(2, f.numCols());
     // Expected values computed as golden values from R's quantile call
     double[] exp =
         ard(
             1.4413698000016206E-13,
             7.206849000001562E-13,
             1.4413698000001489E-12,
             2.882739600000134E-12,
             7.20684900000009E-12,
             1.4413698000000017E-11,
             5.831131148999999E-07,
             3.3669567275300000E-04,
             0.00152780988,
             0.011162408988);
     for (int i = 0; i < exp.length; i++)
       Assert.assertTrue(
           "expected " + exp[i] + " got " + f.vec(1).at(i),
           water.util.MathUtils.compare(exp[i], f.vec(1).at(i), 1e-6, 1e-6));
   } finally {
     if (f != null) f.delete();
   }
 }
Example #10
  public final Row extractDenseRow(double[] vals, Row row) {
    row.bad = false;
    row.rid = 0;
    row.cid = 0;
    if (row.weight == 0) return row;

    if (_skipMissing)
      for (double d : vals)
        if (Double.isNaN(d)) {
          row.bad = true;
          return row;
        }
    int nbins = 0;
    for (int i = 0; i < _cats; ++i) {
      int c = getCategoricalId(i, Double.isNaN(vals[i]) ? _catModes[i] : (int) vals[i]);
      if (c >= 0) row.binIds[nbins++] = c;
    }
    row.nBins = nbins;
    final int n = _nums;
    int numValsIdx = 0;
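    // Numeric section: an interaction vec fills a block of numVals slots; a plain numeric fills one.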
    for (int i = 0; i < n; ++i) {
      if (isInteractionVec(i)) {
        int offset;
        InteractionWrappedVec iwv = ((InteractionWrappedVec) _adaptedFrame.vec(_cats + i));
        int v1 = _adaptedFrame.find(iwv.v1());
        int v2 = _adaptedFrame.find(iwv.v2());
        if (v1 < _cats)
          offset = getCategoricalId(v1, Double.isNaN(vals[v1]) ? _catModes[v1] : (int) vals[v1]);
        else if (v2 < _cats)
          offset = getCategoricalId(v2, Double.isNaN(vals[v2]) ? _catModes[v2] : (int) vals[v2]);
        else offset = 0;
        row.numVals[numValsIdx + offset] = vals[_cats + i]; // essentially vals[v1] * vals[v2]
        numValsIdx += nextNumericIdx(i);
      } else {
        double d = vals[_cats + i]; // can be NA if skipMissing() == false
        if (Double.isNaN(d)) d = _numMeans[numValsIdx];
        if (_normMul != null && _normSub != null)
          d = (d - _normSub[numValsIdx]) * _normMul[numValsIdx];
        row.numVals[numValsIdx++] = d;
      }
    }
    int off = responseChunkId(0);
    for (int i = off; i < Math.min(vals.length, off + _responses); ++i) {
      int ri = i - off; // index into row.response (i itself indexes vals)
      try {
        row.response[ri] = vals[i];
      } catch (Throwable t) {
        throw new RuntimeException(t);
      }
      if (_normRespMul != null)
        row.response[ri] = (row.response[ri] - _normRespSub[ri]) * _normRespMul[ri];
      if (Double.isNaN(row.response[ri])) {
        row.bad = true;
        return row;
      }
    }
    return row;
  }
Example #11
  @Test
  public void testCategoricalProstate() throws InterruptedException, ExecutionException {
    GLRM job = null;
    GLRMModel model = null;
    Frame train = null;
    final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS

    try {
      Scope.enter();
      train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
      for (int i = 0; i < cats.length; i++)
        Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
      train.remove("ID").remove();
      DKV.put(train._key, train);

      GLRMParameters parms = new GLRMParameters();
      parms._train = train._key;
      parms._k = 8;
      parms._gamma_x = parms._gamma_y = 0.1;
      parms._regularization_x = GLRMModel.GLRMParameters.Regularizer.Quadratic;
      parms._regularization_y = GLRMModel.GLRMParameters.Regularizer.Quadratic;
      parms._init = GLRM.Initialization.PlusPlus;
      parms._transform = DataInfo.TransformType.STANDARDIZE;
      parms._recover_svd = false;
      parms._max_iterations = 200;

      try {
        job = new GLRM(parms);
        model = job.trainModel().get();
        Log.info(
            "Iteration "
                + model._output._iterations
                + ": Objective value = "
                + model._output._objective);
        model.score(train).delete();
        ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
        Log.info(
            "Numeric Sum of Squared Error = "
                + mm._numerr
                + "\tCategorical Misclassification Error = "
                + mm._caterr);
      } catch (Throwable t) {
        t.printStackTrace();
        throw new RuntimeException(t);
      } finally {
        if (job != null) job.remove(); // guard: new GLRM(parms) may have thrown before assignment
      }
    } catch (Throwable t) {
      t.printStackTrace();
      throw new RuntimeException(t);
    } finally {
      if (train != null) train.delete();
      if (model != null) model.delete();
      Scope.exit();
    }
  }
Example #12
 // private constructor called by filterExpandedColumns
 private DataInfo(
     DataInfo dinfo,
     Frame fr,
     double[] normMul,
     double[] normSub,
     int[][] catLevels,
     int[] catModes) {
   _fullCatOffsets = dinfo._catOffsets;
   if (!dinfo._useAllFactorLevels) {
     _fullCatOffsets = dinfo._catOffsets.clone();
     for (int i = 0; i < _fullCatOffsets.length; ++i)
       _fullCatOffsets[i] += i; // add for the skipped zeros.
   }
   _offset = dinfo._offset;
   _weights = dinfo._weights;
   _fold = dinfo._fold;
   _valid = false;
   _interactions = dinfo._interactions;
   _interactionVecs = dinfo._interactionVecs;
   assert dinfo._predictor_transform != null;
   assert dinfo._response_transform != null;
   _predictor_transform = dinfo._predictor_transform;
   _response_transform = dinfo._response_transform;
   _skipMissing = dinfo._skipMissing;
   _imputeMissing = dinfo._imputeMissing;
   _adaptedFrame = fr;
   _catOffsets = MemoryManager.malloc4(catLevels.length + 1);
   _catMissing = new boolean[catLevels.length];
   Arrays.fill(_catMissing, !(dinfo._imputeMissing || dinfo._skipMissing));
   int s = 0;
   for (int i = 0; i < catLevels.length; ++i) {
     _catOffsets[i] = s;
     s += catLevels[i].length;
   }
   _catLvls = catLevels;
   _catOffsets[_catOffsets.length - 1] = s;
   _responses = dinfo._responses;
   _cats = catLevels.length;
   _nums =
       fr.numCols()
           - _cats
           - dinfo._responses
           - (_offset ? 1 : 0)
           - (_weights ? 1 : 0)
           - (_fold ? 1 : 0);
   _numOffsets = _nums == 0 ? new int[0] : dinfo._numOffsets.clone();
   int diff = _numOffsets.length > 0 ? _numOffsets[0] - s : 0;
    for (int i = 0; i < _numOffsets.length; i++) // need to shift everyone down by the offset!
      _numOffsets[i] -= diff;
   _useAllFactorLevels = true; // dinfo._useAllFactorLevels;
   _numMeans = new double[_nums];
   _normMul = normMul;
   _normSub = normSub;
   _catModes = catModes;
   for (int i = 0; i < _nums; i++) _numMeans[i] = _adaptedFrame.vec(_cats + i).mean();
 }
Example #13
 public final int getCategoricalIdFromInteraction(int cid, int val) {
   InteractionWrappedVec v;
   if ((v = (InteractionWrappedVec) _adaptedFrame.vec(cid)).isCategorical())
     return getCategoricalId(cid, val);
   assert v.domains() != null
       : "No domain levels found for interactions! cid: " + cid + " val: " + val;
   if (val >= _numOffsets[cid + 1]) { // previously unseen interaction (aka new domain level)
     assert _valid
         : "interaction value out of bounds, got "
             + val
             + ", next cat starts at "
             + _numOffsets[cid + 1];
     val = v.mode();
   }
   return val < 0 ? -1 : val + _numOffsets[cid];
 }
Example #14
  /**
   * Applies a stacked autoencoder model to the given dataset and makes predictions.
   *
   * @param ctxt JavaSparkContext
   * @param deeplearningModel Stacked autoencoder model
   * @param test Test dataset as a JavaRDD of labeled points
   * @param mlModel MLModel carrying the feature names and response variable
   * @return JavaPairRDD of (prediction, actual label) pairs
   */
  public JavaPairRDD<Double, Double> test(
      JavaSparkContext ctxt,
      final DeepLearningModel deeplearningModel,
      JavaRDD<LabeledPoint> test,
      MLModel mlModel)
      throws MLModelBuilderException {

    Scope.enter();

    if (deeplearningModel == null) {
      throw new MLModelBuilderException("DeeplearningModel is Null");
    }

    int numberOfFeatures = mlModel.getFeatures().size();
    List<Feature> features = mlModel.getFeatures();
    String[] names = new String[numberOfFeatures + 1];
    for (int i = 0; i < numberOfFeatures; i++) {
      names[i] = features.get(i).getName();
    }
    names[numberOfFeatures] = mlModel.getResponseVariable();

    Frame testData = DeeplearningModelUtils.javaRDDToFrame(names, test);
    Frame testDataWithoutLabels = testData.subframe(0, testData.numCols() - 1);
    int numRows = (int) testDataWithoutLabels.numRows();
    Vec predictionsVector = deeplearningModel.score(testDataWithoutLabels).vec(0);
    double[] predictionValues = new double[numRows];
    for (int i = 0; i < numRows; i++) {
      predictionValues[i] = predictionsVector.at(i);
    }
    Vec labelsVector = testData.vec(testData.numCols() - 1);
    double[] labels = new double[numRows];
    for (int i = 0; i < numRows; i++) {
      labels[i] = labelsVector.at(i);
    }

    Scope.exit();

    ArrayList<Tuple2<Double, Double>> tupleList = new ArrayList<Tuple2<Double, Double>>();
    for (int i = 0; i < labels.length; i++) {
      tupleList.add(new Tuple2<Double, Double>(predictionValues[i], labels[i]));
    }

    return ctxt.parallelizePairs(tupleList);
  }
Example #15
 public int getInteractionOffset(Chunk[] chunks, int cid, int rid) {
   int v1 = -1, v2 = -1;
   if (_adaptedFrame == null) {
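      // No adapted frame available: locate the interaction's two parent vecs by scanning the chunks.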
     Vec vec1 = ((InteractionWrappedVec) chunks[cid].vec()).v1();
     Vec vec2 = ((InteractionWrappedVec) chunks[cid].vec()).v2();
     for (int i = 0; i < chunks.length; ++i) {
       if (v1 >= 0 && v2 >= 0) break; // found both vecs already
       if (v1 == -1 && chunks[i].vec() == vec1) v1 = i;
       if (v2 == -1 && chunks[i].vec() == vec2) v2 = i;
     }
   } else {
     InteractionWrappedVec iwv = ((InteractionWrappedVec) _adaptedFrame.vec(cid));
     v1 = _adaptedFrame.find(iwv.v1());
     v2 = _adaptedFrame.find(iwv.v2());
   }
   if (v1 < _cats) return (int) chunks[v1].at8(rid); // v1 is some categorical column
   else if (v2 < _cats) return (int) chunks[v2].at8(rid); // or v2 is some categorical column
   return 0; // or neither is categorical
 }
Example #16
 private void checkTree(String tree, boolean expectThrow) {
   // Frame r = frame(new double[][]{{-1},{1},{2},{3},{4},{5},{6},{254}});
   // Key ahex = Key.make("a.hex");
   // Frame fr = new Frame(ahex, null, new Vec[]{r.remove(0)});
   // r.delete();
   // DKV.put(ahex, fr);
   Frame fr = parse_test_file(Key.make("a.hex"), "smalldata/iris/iris_wheader.csv");
   fr.remove(4).remove();
   try {
     Val val = Exec.exec(tree);
     Assert.assertFalse(expectThrow);
     System.out.println(val.toString());
     if (val instanceof ValFrame) {
       Frame fr2 = ((ValFrame) val)._fr;
       System.out.println(fr2.vec(0));
       fr2.remove();
     }
   } catch (IllegalArgumentException iae) {
     if (!expectThrow) throw iae;
   } finally {
     fr.delete();
   }
 }
Example #17
 public Vec getOutputVec(int i) {
   return _adaptedFrame.vec(outputChunkId(i));
 }
Example #18
  /**
   * The train/valid Frame instances are sorted so that categorical columns come first (themselves
   * sorted by cardinality, greatest to least), with all numeric columns following. The response
   * column(s) are placed at the end.
   *
   * <p>Interactions: 1. Num-Num (Note: N(0,1) * N(0,1) ~ N(0,1)) 2. Num-Enum 3. Enum-Enum
   *
   * <p>Interactions are produced on the fly and are dense in all 3 cases. Consumers of DataInfo
   * should not have to care how these interactions are generated. Any heuristic using the fullN
   * value should continue to function the same.
   *
   * <p>Interactions are specified in two ways: A. As a list of pairs of column indices. B. As a
   * list of pairs of column indices with limited enums.
   */
  public DataInfo(
      Frame train,
      Frame valid,
      int nResponses,
      boolean useAllFactorLevels,
      TransformType predictor_transform,
      TransformType response_transform,
      boolean skipMissing,
      boolean imputeMissing,
      boolean missingBucket,
      boolean weight,
      boolean offset,
      boolean fold,
      Model.InteractionPair[] interactions) {
    super(Key.<DataInfo>make());
    _valid = valid != null;
    assert predictor_transform != null;
    assert response_transform != null;
    _offset = offset;
    _weights = weight;
    _fold = fold;
    assert !(skipMissing && imputeMissing) : "skipMissing and imputeMissing cannot both be true";
    _skipMissing = skipMissing;
    _imputeMissing = imputeMissing;
    _predictor_transform = predictor_transform;
    _response_transform = response_transform;
    _responses = nResponses;
    _useAllFactorLevels = useAllFactorLevels;
    _interactions = interactions;

    // create dummy InteractionWrappedVecs and shove them onto the front
    if (_interactions != null) {
      _interactionVecs = new int[_interactions.length];
      train =
          Model.makeInteractions(
                  train,
                  false,
                  _interactions,
                  _useAllFactorLevels,
                  _skipMissing,
                  predictor_transform == TransformType.STANDARDIZE)
              .add(train);
      if (valid != null)
        valid =
            Model.makeInteractions(
                    valid,
                    true,
                    _interactions,
                    _useAllFactorLevels,
                    _skipMissing,
                    predictor_transform == TransformType.STANDARDIZE)
                .add(valid); // FIXME: should be using the training subs/muls!
    }

    _permutation = new int[train.numCols()];
    final Vec[] tvecs = train.vecs();

    // Count categorical-vs-numerical
    final int n = tvecs.length - _responses - (offset ? 1 : 0) - (weight ? 1 : 0) - (fold ? 1 : 0);
    int[] nums = MemoryManager.malloc4(n);
    int[] cats = MemoryManager.malloc4(n);
    int nnums = 0, ncats = 0;
    for (int i = 0; i < n; ++i)
      if (tvecs[i].isCategorical()) cats[ncats++] = i;
      else nums[nnums++] = i;

    _nums = nnums;
    _cats = ncats;
    _catLvls = new int[ncats][];

    // sort the cats in the decreasing order according to their size
    for (int i = 0; i < ncats; ++i)
      for (int j = i + 1; j < ncats; ++j)
        if (tvecs[cats[i]].domain().length < tvecs[cats[j]].domain().length) {
          int x = cats[i];
          cats[i] = cats[j];
          cats[j] = x;
        }
    String[] names = new String[train.numCols()];
    Vec[] tvecs2 = new Vec[train.numCols()];

    // Compute the cardinality of each cat
    _catModes = new int[ncats];
    _catOffsets = MemoryManager.malloc4(ncats + 1);
    _catMissing = new boolean[ncats];
    int len = _catOffsets[0] = 0;
    int interactionIdx = 0; // simple index into the _interactionVecs array

    ArrayList<Integer> interactionIds;
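     // With no explicit interaction pairs, discover InteractionWrappedVecs already in the frame.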
    if (_interactions == null) {
      interactionIds = new ArrayList<>();
      for (int i = 0; i < tvecs.length; ++i)
        if (tvecs[i] instanceof InteractionWrappedVec) {
          interactionIds.add(i);
        }
      _interactionVecs = new int[interactionIds.size()];
      for (int i = 0; i < _interactionVecs.length; ++i) _interactionVecs[i] = interactionIds.get(i);
    }
    for (int i = 0; i < ncats; ++i) {
      names[i] = train._names[cats[i]];
      Vec v = (tvecs2[i] = tvecs[cats[i]]);
      _catMissing[i] = missingBucket; // needed for test time
      if (v instanceof InteractionWrappedVec) {
        if (_interactions != null) _interactions[interactionIdx].vecIdx = i;
        // i (and not cats[i]) because this is the index into _adaptedFrame
        _interactionVecs[interactionIdx++] = i;
        _catOffsets[i + 1] = (len += v.domain().length + (missingBucket ? 1 : 0));
      } else
        _catOffsets[i + 1] =
            (len +=
                v.domain().length
                    - (useAllFactorLevels ? 0 : 1)
                    + (missingBucket ? 1 : 0)); // missing values turn into a new factor level
      _catModes[i] =
          imputeMissing ? imputeCat(train.vec(cats[i])) : _catMissing[i] ? v.domain().length : -100;
      _permutation[i] = cats[i];
    }
    _numMeans = new double[nnums];
    _numOffsets = MemoryManager.malloc4(nnums + 1);
    _numOffsets[0] = len;
    boolean isIWV; // is InteractionWrappedVec?
    for (int i = 0; i < nnums; ++i) {
      names[i + ncats] = train._names[nums[i]];
      Vec v = train.vec(nums[i]);
      tvecs2[i + ncats] = v;
      isIWV = v instanceof InteractionWrappedVec;
      if (isIWV) {
        if (null != _interactions) _interactions[interactionIdx].vecIdx = i + ncats;
        _interactionVecs[interactionIdx++] = i + ncats;
      }
      _numOffsets[i + 1] = (len += (isIWV ? ((InteractionWrappedVec) v).expandedLength() : 1));
      _numMeans[i] = train.vec(nums[i]).mean();
      _permutation[i + ncats] = nums[i];
    }
    for (int i = names.length - nResponses - (weight ? 1 : 0) - (offset ? 1 : 0) - (fold ? 1 : 0);
        i < names.length;
        ++i) {
      names[i] = train._names[i];
      tvecs2[i] = train.vec(i);
    }
    _adaptedFrame = new Frame(names, tvecs2);
    train.restructure(names, tvecs2);
    if (valid != null) valid.restructure(names, valid.vecs(names));
    //    _adaptedFrame = train;

    setPredictorTransform(predictor_transform);
    if (_responses > 0) setResponseTransform(response_transform);
  }
Example #19
 public boolean isInteractionVec(int colid) {
   if (null == _interactions || null == _interactionVecs) return false;
   if (_adaptedFrame != null) return _adaptedFrame.vec(colid) instanceof InteractionWrappedVec;
   else return Arrays.binarySearch(_interactionVecs, colid) >= 0;
 }
Example #20
  @Test
  public void testLosses() throws InterruptedException, ExecutionException {
    long seed = 0xDECAF;
    Random rng = new Random(seed);
    Frame train = null;
    final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS
    final GLRMParameters.Regularizer[] regs =
        new GLRMParameters.Regularizer[] {
          GLRMParameters.Regularizer.Quadratic,
          GLRMParameters.Regularizer.L1,
          GLRMParameters.Regularizer.NonNegative,
          GLRMParameters.Regularizer.OneSparse,
          GLRMParameters.Regularizer.UnitOneSparse,
          GLRMParameters.Regularizer.Simplex
        };

    Scope.enter();
    try {
      train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
      for (int i = 0; i < cats.length; i++)
        Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
      train.remove("ID").remove();
      DKV.put(train._key, train);

      for (GLRMParameters.Loss loss :
          new GLRMParameters.Loss[] {
            GLRMParameters.Loss.Quadratic,
            GLRMParameters.Loss.Absolute,
            GLRMParameters.Loss.Huber,
            GLRMParameters.Loss.Poisson,
            GLRMParameters.Loss.Hinge,
            GLRMParameters.Loss.Logistic
          }) {
        for (GLRMParameters.Loss multiloss :
            new GLRMParameters.Loss[] {
              GLRMParameters.Loss.Categorical, GLRMParameters.Loss.Ordinal
            }) {
          GLRMModel model = null;
          try {
            Scope.enter();
            long myseed = rng.nextLong();
            Log.info("GLRM using seed = " + myseed);

            GLRMParameters parms = new GLRMParameters();
            parms._train = train._key;
            parms._transform = DataInfo.TransformType.NONE;
            parms._k = 5;
            parms._loss = loss;
            parms._multi_loss = multiloss;
            parms._init = GLRM.Initialization.SVD;
            parms._regularization_x = regs[rng.nextInt(regs.length)];
            parms._regularization_y = regs[rng.nextInt(regs.length)];
            parms._gamma_x = Math.abs(rng.nextDouble());
            parms._gamma_y = Math.abs(rng.nextDouble());
            parms._recover_svd = false;
            parms._seed = myseed;
            parms._verbose = false;
            parms._max_iterations = 500;

            GLRM job = new GLRM(parms);
            try {
              model = job.trainModel().get();
              Log.info(
                  "Iteration "
                      + model._output._iterations
                      + ": Objective value = "
                      + model._output._objective);
              model.score(train).delete();
              ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
              Log.info(
                  "Numeric Sum of Squared Error = "
                      + mm._numerr
                      + "\tCategorical Misclassification Error = "
                      + mm._caterr);
            } finally {
              job.remove();
            }
          } catch (Throwable t) {
            t.printStackTrace();
            throw new RuntimeException(t);
          } finally {
            if (model != null) model.delete();
            Scope.exit();
          }
        }
      }
    } finally {
      if (train != null) train.delete();
      Scope.exit();
    }
  }
Example #21
    @Override
    protected void compute2() {
      _model = null; // Resulting model!
      try {
        Scope.enter(); // Cleanup temp keys
        init(true); // Do any expensive tests & conversions now
        // Do lock even before checking the errors, since this block is finalized by unlock
        // (not the best solution, but the code is more readable)
        _parms.read_lock_frames(SharedTree.this); // Fetch & read-lock input frames
        if (error_count() > 0)
          throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(SharedTree.this);

        // New Model?  Or continuing from a checkpoint?
        if (_parms._checkpoint && DKV.get(_parms._model_id) != null) {
          _model = DKV.get(_dest).get();
          _model.write_lock(_key); // do not delete previous model; we are extending it
        } else { // New Model
          // Compute the zero-tree error - guessing only the class distribution.
          // MSE is stddev squared when guessing for regression.
          // For classification, guess the largest class.
          _model =
              makeModel(
                  _dest,
                  _parms,
                  initial_MSE(_response, _response),
                  _valid == null
                      ? Double.NaN
                      : initial_MSE(_response, _vresponse)); // Make a fresh model
          _model.delete_and_lock(_key); // and clear & write-lock it (smashing any prior)
          _model._output._init_f = _initialPrediction;
        }

        // Compute the response domain; makes for nicer printouts
        String[] domain = _response.domain();
        assert (_nclass > 1 && domain != null) || (_nclass == 1 && domain == null);
        if (_nclass == 1) domain = new String[] {"r"}; // For regression, give a name to class 0

        // Compute class distribution, used for initial guesses and to
        // upsample minority classes (if asked for).
        if (_nclass > 1) { // Classification?

          // Handle imbalanced classes by stratified over/under-sampling.
          // initWorkFrame sets the modeled class distribution, and
          // model.score() corrects the probabilities back using the
          // distribution ratios
          if (_model._output.isClassifier() && _parms._balance_classes) {

            float[] trainSamplingFactors =
                new float
                    [_train
                        .lastVec()
                        .domain()
                        .length]; // leave initialized to 0 -> will be filled up below
            if (_parms._class_sampling_factors != null) {
              if (_parms._class_sampling_factors.length != _train.lastVec().domain().length)
                throw new IllegalArgumentException(
                    "class_sampling_factors must have "
                        + _train.lastVec().domain().length
                        + " elements");
              trainSamplingFactors =
                  _parms._class_sampling_factors.clone(); // clone: don't modify the original
            }
            Frame stratified =
                water.util.MRUtils.sampleFrameStratified(
                    _train,
                    _train.lastVec(),
                    _train.vec(_model._output.weightsName()),
                    trainSamplingFactors,
                    (long) (_parms._max_after_balance_size * _train.numRows()),
                    _parms._seed,
                    true,
                    false);
            if (stratified != _train) {
              _train = stratified;
              _response = stratified.vec(_parms._response_column);
              _weights = stratified.vec(_parms._weights_column);
              // Recompute distribution since the input frame was modified
              MRUtils.ClassDist cdmt2 =
                  _weights != null
                      ? new MRUtils.ClassDist(_nclass).doAll(_response, _weights)
                      : new MRUtils.ClassDist(_nclass).doAll(_response);
              _model._output._distribution = cdmt2.dist();
              _model._output._modelClassDist = cdmt2.rel_dist();
            }
          }
          Log.info("Prior class distribution: " + Arrays.toString(_model._output._priorClassDist));
          Log.info("Model class distribution: " + Arrays.toString(_model._output._modelClassDist));
        }

        // Also add to the basic working Frame these sets:
        //   nclass Vecs of current forest results (sum across all trees)
        //   nclass Vecs of working/temp data
        //   nclass Vecs of NIDs, allowing 1 tree per class

        // Current forest values: results of summing the prior M trees
        for (int i = 0; i < _nclass; i++) _train.add("Tree_" + domain[i], _response.makeZero());

        // Initial work columns.  Set-before-use in the algos.
        for (int i = 0; i < _nclass; i++) _train.add("Work_" + domain[i], _response.makeZero());

        // One tree per class; each tree needs its own NIDs (node-ID) column. For empty classes
        // use a -1 NID signifying an empty regression tree.
        for (int i = 0; i < _nclass; i++)
          _train.add(
              "NIDs_" + domain[i],
              _response.makeCon(
                  _model._output._distribution == null
                      ? 0
                      : (_model._output._distribution[i] == 0 ? -1 : 0)));

        // Tag out rows missing the response column
        new ExcludeNAResponse().doAll(_train);

        // Variable importance: squared-error-improvement-per-variable-per-split
        _improvPerVar = new float[_ncols];

        // Sub-class tree-model-builder specific build code
        buildModel();
        done(); // Job done!
      } catch (Throwable t) {
        Job thisJob = DKV.getGet(_key);
        if (thisJob._state == JobState.CANCELLED) {
          Log.info("Job cancelled by user.");
        } else {
          t.printStackTrace();
          failed(t);
          throw t;
        }
      } finally {
        if (_model != null) _model.unlock(_key);
        _parms.read_unlock_frames(SharedTree.this);
        if (_model == null) Scope.exit();
        else {
          Scope.exit(
              _model._key,
              ModelMetrics.buildKey(_model, _parms.train()),
              ModelMetrics.buildKey(_model, _parms.valid()));
        }
      }
      tryComplete();
    }
Example #22
  public DataInfo(
      Frame train,
      Frame valid,
      int nResponses,
      boolean useAllFactorLevels,
      TransformType predictor_transform,
      TransformType response_transform,
      boolean skipMissing,
      boolean imputeMissing,
      boolean missingBucket,
      boolean weight,
      boolean offset,
      boolean fold) {
    super(Key.<DataInfo>make());
    _valid = false;
    assert predictor_transform != null;
    assert response_transform != null;
    _offset = offset;
    _weights = weight;
    _fold = fold;
    assert !(skipMissing && imputeMissing) : "skipMissing and imputeMissing cannot both be true";
    _skipMissing = skipMissing;
    _imputeMissing = imputeMissing;
    _predictor_transform = predictor_transform;
    _response_transform = response_transform;
    _responses = nResponses;
    _useAllFactorLevels = useAllFactorLevels;
    _permutation = new int[train.numCols()];
    final Vec[] tvecs = train.vecs();

    // Count categorical-vs-numerical
    final int n = tvecs.length - _responses - (offset ? 1 : 0) - (weight ? 1 : 0) - (fold ? 1 : 0);
    int[] nums = MemoryManager.malloc4(n);
    int[] cats = MemoryManager.malloc4(n);
    int nnums = 0, ncats = 0;
    for (int i = 0; i < n; ++i)
      if (tvecs[i].isCategorical()) cats[ncats++] = i;
      else nums[nnums++] = i;
    _nums = nnums;
    _cats = ncats;
    _catLvls = new int[_cats][];

    // sort the cats in the decreasing order according to their size
    for (int i = 0; i < ncats; ++i)
      for (int j = i + 1; j < ncats; ++j)
        if (tvecs[cats[i]].domain().length < tvecs[cats[j]].domain().length) {
          int x = cats[i];
          cats[i] = cats[j];
          cats[j] = x;
        }
    String[] names = new String[train.numCols()];
    Vec[] tvecs2 = new Vec[train.numCols()];

    // Compute the cardinality of each cat
    _catModes = new int[_cats];
    _catOffsets = MemoryManager.malloc4(ncats + 1);
    _catMissing = new int[ncats];
    int len = _catOffsets[0] = 0;
    for (int i = 0; i < ncats; ++i) {
      _catModes[i] = imputeCat(train.vec(cats[i]));
      _permutation[i] = cats[i];
      names[i] = train._names[cats[i]];
      Vec v = (tvecs2[i] = tvecs[cats[i]]);
      _catMissing[i] = missingBucket ? 1 : 0; // needed for test time
      _catOffsets[i + 1] =
          (len +=
              v.domain().length
                  - (useAllFactorLevels ? 0 : 1)
                  + (missingBucket ? 1 : 0)); // missing values turn into a new factor level
    }
    _numMeans = new double[_nums];
    for (int i = 0; i < _nums; ++i) {
      names[i + _cats] = train._names[nums[i]];
      tvecs2[i + _cats] = train.vec(nums[i]);
      _numMeans[i] = train.vec(nums[i]).mean();
      _permutation[i + _cats] = nums[i];
    }
    for (int i = names.length - nResponses - (weight ? 1 : 0) - (offset ? 1 : 0) - (fold ? 1 : 0);
        i < names.length;
        ++i) {
      names[i] = train._names[i];
      tvecs2[i] = train.vec(i);
    }
    _adaptedFrame = new Frame(names, tvecs2);
    train.restructure(names, tvecs2);
    if (valid != null) valid.restructure(names, valid.vecs(names));
    //    _adaptedFrame = train;

    setPredictorTransform(predictor_transform);
    if (_responses > 0) setResponseTransform(response_transform);
  }
Example #23
  @Test
  public void testSetColumnLossCats() throws InterruptedException, ExecutionException {
    GLRM job = null;
    GLRMModel model = null;
    Frame train = null;
    final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS

    Scope.enter();
    try {
      train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
      for (int i = 0; i < cats.length; i++)
        Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
      train.remove("ID").remove();
      DKV.put(train._key, train);

      GLRMParameters parms = new GLRMParameters();
      parms._train = train._key;
      parms._k = 12;
      parms._loss = GLRMParameters.Loss.Quadratic;
      parms._multi_loss = GLRMParameters.Loss.Categorical;
      parms._loss_by_col =
          new GLRMParameters.Loss[] {
            GLRMParameters.Loss.Ordinal, GLRMParameters.Loss.Poisson, GLRMParameters.Loss.Absolute
          };
      parms._loss_by_col_idx = new int[] {3 /* DPROS */, 1 /* AGE */, 6 /* VOL */};
      parms._init = GLRM.Initialization.PlusPlus;
      parms._min_step_size = 1e-5;
      parms._recover_svd = false;
      parms._max_iterations = 2000;

      try {
        job = new GLRM(parms);
        model = job.trainModel().get();
        Log.info(
            "Iteration "
                + model._output._iterations
                + ": Objective value = "
                + model._output._objective);
        GLRMTest.checkLossbyCol(parms, model);

        model.score(train).delete();
        ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
        Log.info(
            "Numeric Sum of Squared Error = "
                + mm._numerr
                + "\tCategorical Misclassification Error = "
                + mm._caterr);
      } catch (Throwable t) {
        t.printStackTrace();
        throw new RuntimeException(t);
      } finally {
        if (job != null) job.remove(); // guard: new GLRM(parms) may have thrown before assignment
      }

    } catch (Throwable t) {
      t.printStackTrace();
      throw new RuntimeException(t);
    } finally {
      if (train != null) train.delete();
      if (model != null) model.delete();
      Scope.exit();
    }
  }
Example #24
 public static void assertValues(Frame f, String[] expValues) {
   assertValues(f.vec(0), expValues);
 }
Example #25
 private void chkFr(Frame fr, int col, int row, String exp) {
   String[] dom = fr.vec(col).domain();
   Assert.assertEquals(exp, dom[(int) fr.vec(col).at8(row)]);
 }
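A hypothetical call, assuming fr holds a categorical column 2 whose row 0 stores the level "setosa":

 chkFr(fr, 2, 0, "setosa"); // asserts the domain label at (row 0, column 2)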
Example #26
 public Vec getOffsetVec() {
   return _adaptedFrame.vec(offsetChunkId());
 }
Example #27
    /**
     * Train a Deep Learning neural net model
     *
     * @param model Input model (e.g., from initModel(), or from a previous training run)
     * @return Trained model
     */
    public final DeepLearningModel trainModel(DeepLearningModel model) {
      Frame validScoreFrame = null;
      Frame train, trainScoreFrame;
      try {
        //      if (checkpoint == null && !quiet_mode) logStart(); //if checkpoint is given, some
        // Job's params might be uninitialized (but the restarted model's parameters are correct)
        if (model == null) {
          model = DKV.get(dest()).get();
        }
        Log.info(
            "Model category: "
                + (_parms._autoencoder
                    ? "Auto-Encoder"
                    : isClassifier() ? "Classification" : "Regression"));
        final long model_size = model.model_info().size();
        Log.info(
            "Number of model parameters (weights/biases): " + String.format("%,d", model_size));
        model.write_lock(_job);
        _job.update(0, "Setting up training data...");
        final DeepLearningParameters mp = model.model_info().get_params();

        // temporary frames of the same "name" as the orig _train/_valid (asking the parameter's
        // Key, not the actual frame)
        // Note: don't put into DKV or they would overwrite the _train/_valid frames!
        Frame tra_fr = new Frame(mp._train, _train.names(), _train.vecs());
        Frame val_fr = _valid != null ? new Frame(mp._valid, _valid.names(), _valid.vecs()) : null;

        train = tra_fr;
        if (model._output.isClassifier() && mp._balance_classes) {
          _job.update(0, "Balancing class distribution of training data...");
          float[] trainSamplingFactors =
              new float
                  [train
                      .lastVec()
                      .domain()
                      .length]; // leave initialized to 0 -> will be filled up below
          if (mp._class_sampling_factors != null) {
            if (mp._class_sampling_factors.length != train.lastVec().domain().length)
              throw new IllegalArgumentException(
                  "class_sampling_factors must have "
                      + train.lastVec().domain().length
                      + " elements");
            trainSamplingFactors =
                mp._class_sampling_factors.clone(); // clone: don't modify the original
          }
          train =
              sampleFrameStratified(
                  train,
                  train.lastVec(),
                  train.vec(model._output.weightsName()),
                  trainSamplingFactors,
                  (long) (mp._max_after_balance_size * train.numRows()),
                  mp._seed,
                  true,
                  false);
          Vec l = train.lastVec();
          Vec w = train.vec(model._output.weightsName());
          MRUtils.ClassDist cd = new MRUtils.ClassDist(l);
          model._output._modelClassDist =
              _weights != null ? cd.doAll(l, w).rel_dist() : cd.doAll(l).rel_dist();
        }
        model.training_rows = train.numRows();
        if (_weights != null && _weights.min() == 0 && _weights.max() == 1 && _weights.isInt()) {
          model.training_rows = Math.round(train.numRows() * _weights.mean());
          Log.warn(
              "Not counting "
                  + (train.numRows() - model.training_rows)
                  + " rows with weight=0 towards an epoch.");
        }
        Log.info("One epoch corresponds to " + model.training_rows + " training data rows.");
        trainScoreFrame =
            sampleFrame(
                train,
                mp._score_training_samples,
                mp._seed); // training scoring dataset is always sampled uniformly from the training
                           // dataset
        if (trainScoreFrame != train) Scope.track(trainScoreFrame);

        if (!_parms._quiet_mode)
          Log.info("Number of chunks of the training data: " + train.anyVec().nChunks());
        if (val_fr != null) {
          model.validation_rows = val_fr.numRows();
          // validation scoring dataset can be sampled in multiple ways from the given validation
          // dataset
          if (model._output.isClassifier()
              && mp._balance_classes
              && mp._score_validation_sampling
                  == DeepLearningParameters.ClassSamplingMethod.Stratified) {
            _job.update(0, "Sampling validation data (stratified)...");
            validScoreFrame =
                sampleFrameStratified(
                    val_fr,
                    val_fr.lastVec(),
                    val_fr.vec(model._output.weightsName()),
                    null,
                    mp._score_validation_samples > 0
                        ? mp._score_validation_samples
                        : val_fr.numRows(),
                    mp._seed + 1,
                    false /* no oversampling */,
                    false);
          } else {
            _job.update(0, "Sampling validation data...");
            validScoreFrame = sampleFrame(val_fr, mp._score_validation_samples, mp._seed + 1);
            if (validScoreFrame != val_fr) Scope.track(validScoreFrame);
          }
          if (!_parms._quiet_mode)
            Log.info(
                "Number of chunks of the validation data: " + validScoreFrame.anyVec().nChunks());
        }

        // Set train_samples_per_iteration size (cannot be done earlier since this depends on
        // whether stratified sampling is done)
        model.actual_train_samples_per_iteration =
            computeTrainSamplesPerIteration(mp, model.training_rows, model);
        // Determine whether shuffling is enforced
        if (mp._replicate_training_data
            && (model.actual_train_samples_per_iteration
                == model.training_rows * (mp._single_node_mode ? 1 : H2O.CLOUD.size()))
            && !mp._shuffle_training_data
            && H2O.CLOUD.size() > 1
            && !mp._reproducible) {
          if (!mp._quiet_mode)
            Log.info(
                "Enabling training data shuffling, because all nodes train on the full dataset (replicated training data).");
          mp._shuffle_training_data = true;
        }
        if (!mp._shuffle_training_data
            && model.actual_train_samples_per_iteration == model.training_rows
            && train.anyVec().nChunks() == 1) {
          if (!mp._quiet_mode)
            Log.info(
                "Enabling training data shuffling to avoid training rows in the same order over and over (no Hogwild since there's only 1 chunk).");
          mp._shuffle_training_data = true;
        }

        //        if (!mp._quiet_mode) Log.info("Initial model:\n" + model.model_info());
        long now = System.currentTimeMillis();
        model._timeLastIterationEnter = now;
        if (_parms._autoencoder) {
          _job.update(0, "Scoring null model of autoencoder...");
          if (!mp._quiet_mode) Log.info("Scoring the null model of the autoencoder.");
          model.doScoring(
              trainScoreFrame,
              validScoreFrame,
              _job._key,
              0,
              false); // get the null model reconstruction error
        }
        // put the initial version of the model into DKV
        model.update(_job);
        model.total_setup_time_ms += now - _job.start_time();
        Log.info("Total setup time: " + PrettyPrint.msecs(model.total_setup_time_ms, true));
        Log.info("Starting to train the Deep Learning model.");
        _job.update(0, "Training...");

        // main loop
        for (; ; ) {
          model.iterations++;
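          // Choose the training task: keep the current model (epochs == 0), train on replicated
          // data (single node or all nodes), or train distributed on partitioned data.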
          model.set_model_info(
              mp._epochs == 0
                  ? model.model_info()
                  : H2O.CLOUD.size() > 1 && mp._replicate_training_data
                      ? (mp._single_node_mode
                          ? new DeepLearningTask2(
                                  _job._key,
                                  train,
                                  model.model_info(),
                                  rowFraction(train, mp, model),
                                  model.iterations)
                              .doAll(Key.make(H2O.SELF))
                              .model_info()
                          : // replicated data + single node mode
                          new DeepLearningTask2(
                                  _job._key,
                                  train,
                                  model.model_info(),
                                  rowFraction(train, mp, model),
                                  model.iterations)
                              .doAllNodes()
                              .model_info())
                      : // replicated data + multi-node mode
                      new DeepLearningTask(
                              _job._key,
                              model.model_info(),
                              rowFraction(train, mp, model),
                              model.iterations)
                          .doAll(train)
                          .model_info()); // distributed data (always in multi-node mode)
          if (stop_requested() && !timeout()) break; // cancellation
          if (!model.doScoring(
              trainScoreFrame, validScoreFrame, _job._key, model.iterations, false))
            break; // finished training (or early stopping or convergence)
          if (timeout()) break; // stop after scoring
        }

        // replace the model with the best model so far (if it's better)
        if (!stop_requested()
            && _parms._overwrite_with_best_model
            && model.actual_best_model_key != null
            && _parms._nfolds == 0) {
          DeepLearningModel best_model = DKV.getGet(model.actual_best_model_key);
          if (best_model != null
              && best_model.loss() < model.loss()
              && Arrays.equals(best_model.model_info().units, model.model_info().units)) {
            if (!_parms._quiet_mode)
              Log.info("Setting the model to be the best model so far (based on scoring history).");
            DeepLearningModelInfo mi = best_model.model_info().deep_clone();
            // Don't cheat - count full amount of training samples, since that's the amount of
            // training it took to train (without finding anything better)
            mi.set_processed_global(model.model_info().get_processed_global());
            mi.set_processed_local(model.model_info().get_processed_local());
            model.set_model_info(mi);
            model.update(_job);
            model.doScoring(trainScoreFrame, validScoreFrame, _job._key, model.iterations, true);
            assert (best_model.loss() == model.loss());
          }
        }
        // store coefficient names for future use
        // possibly change
        model.model_info().data_info().coefNames();
        if (!_parms._quiet_mode) {
          Log.info(
              "==============================================================================================================================================================================");
          if (stop_requested()) {
            Log.info("Deep Learning model training was interrupted.");
          } else {
            Log.info("Finished training the Deep Learning model.");
            Log.info(model);
          }
          Log.info(
              "==============================================================================================================================================================================");
        }
      } finally {
        if (model != null) {
          model.deleteElasticAverageModels();
          model.unlock(_job);
          if (model.actual_best_model_key != null) {
            assert (model.actual_best_model_key != model._key);
            DKV.remove(model.actual_best_model_key);
          }
        }
      }
      return model;
    }
Example #28
  @Test
  public void testExpandCatsProstate() throws InterruptedException, ExecutionException {
    double[][] prostate =
        ard(
            ard(0, 71, 1, 0, 0, 4.8, 14.0, 7),
            ard(1, 70, 1, 1, 0, 8.4, 21.8, 5),
            ard(0, 73, 1, 3, 0, 10.0, 27.4, 6),
            ard(1, 68, 1, 0, 0, 6.7, 16.7, 6));
    double[][] pros_expandR =
        ard(
            ard(1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 71, 4.8, 14.0, 7),
            ard(0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 70, 8.4, 21.8, 5),
            ard(0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 73, 10.0, 27.4, 6),
            ard(1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 68, 6.7, 16.7, 6));
    String[] pros_cols =
        new String[] {"Capsule", "Age", "Race", "Dpros", "Dcaps", "PSA", "Vol", "Gleason"};
    String[][] pros_domains =
        new String[][] {
          new String[] {"No", "Yes"},
          null,
          new String[] {"Other", "White", "Black"},
          new String[] {"None", "UniLeft", "UniRight", "Bilobar"},
          new String[] {"No", "Yes"},
          null,
          null,
          null
        };
    final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS

    Frame fr = null;
    try {
      Scope.enter();
      fr = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
      for (int i = 0; i < cats.length; i++)
        Scope.track(fr.replace(cats[i], fr.vec(cats[i]).toCategoricalVec())._key);
      fr.remove("ID").remove();
      DKV.put(fr._key, fr);
      DataInfo dinfo =
          new DataInfo(
              Key.make(),
              fr,
              null,
              0,
              true,
              DataInfo.TransformType.NONE,
              DataInfo.TransformType.NONE,
              false,
              false,
              false, /* weights */
              false, /* offset */
              false, /* fold */
              false);

      Log.info("Original matrix:\n" + colFormat(pros_cols, "%8.7s") + ArrayUtils.pprint(prostate));
      double[][] pros_perm = ArrayUtils.permuteCols(prostate, dinfo._permutation);
      Log.info(
          "Permuted matrix:\n"
              + colFormat(pros_cols, "%8.7s", dinfo._permutation)
              + ArrayUtils.pprint(pros_perm));

      double[][] pros_exp = GLRM.expandCats(pros_perm, dinfo);
      Log.info(
          "Expanded matrix:\n"
              + colExpFormat(pros_cols, pros_domains, "%8.7s", dinfo._permutation)
              + ArrayUtils.pprint(pros_exp));
      Assert.assertArrayEquals(pros_expandR, pros_exp);
    } catch (Throwable t) {
      t.printStackTrace();
      throw new RuntimeException(t);
    } finally {
      if (fr != null) fr.delete();
      Scope.exit();
    }
  }
Example #29
 public void validate(GLM glm) {
   if (_compute_p_values && _solver != Solver.AUTO && _solver != Solver.IRLSM)
      glm.error(
          "_compute_p_values",
          "P values can only be computed with IRLSM solver, got solver = " + _solver);
   if (_compute_p_values && (_lambda == null || _lambda[0] > 0))
     glm.error(
         "_compute_p_values",
         "P values can only be computed with NO REGULARIZATION (lambda = 0)");
   if (_compute_p_values && _family == Family.multinomial)
     glm.error(
         "_compute_p_values", "P values are currently not supported for family=multinomial");
    if (_compute_p_values && _non_negative)
      glm.error(
          "_compute_p_values", "P values are currently not supported with non_negative = true");
   if (_weights_column != null
       && _offset_column != null
       && _weights_column.equals(_offset_column))
     glm.error("_offset_column", "Offset must be different from weights");
   if (_lambda_search)
     if (glm.nFoldCV())
       glm.error(
           "_lambda_search",
           "Lambda search is not currently supported in conjunction with N-fold cross-validation");
   if (_nlambdas == -1) _nlambdas = 100;
   else _exactLambdas = false;
   if (_obj_reg != -1 && _obj_reg <= 0)
     glm.error("obj_reg", "Must be positive or -1 for default");
    if (_prior != -1 && (_prior <= 0 || _prior >= 1))
      glm.error("_prior", "Prior must be in the exclusive range (0, 1)");
   if (_family != Family.tweedie) {
     glm.hide("_tweedie_variance_power", "Only applicable with Tweedie family");
     glm.hide("_tweedie_link_power", "Only applicable with Tweedie family");
   }
   if (_beta_constraints != null) {
     if (_family == Family.multinomial)
        glm.error(
            "beta_constraints", "beta constraints are not supported for family = multinomial");
     Frame f = _beta_constraints.get();
     if (f == null) glm.error("beta_constraints", "Missing frame for beta constraints");
     Vec v = f.vec("names");
     if (v == null)
       glm.error(
           "beta_constraints",
           "Beta constraints parameter must have names column with valid coefficient names");
     // todo: check the coefficient names
     v = f.vec("upper_bounds");
     if (v != null && !v.isNumeric())
       glm.error("beta_constraints", "upper_bounds must be numeric if present");
     v = f.vec("upper_bounds");
     v = f.vec("lower_bounds");
     if (v != null && !v.isNumeric())
       glm.error("beta_constraints", "lower_bounds must be numeric if present");
     v = f.vec("beta_given");
     if (v != null && !v.isNumeric())
       glm.error("beta_constraints", "beta_given must be numeric if present");
     v = f.vec("upper_bounds");
     v = f.vec("beta_start");
     if (v != null && !v.isNumeric())
       glm.error("beta_constraints", "beta_start must be numeric if present");
   }
   if (!_lambda_search) {
     glm.hide("_lambda_min_ratio", "only applies if lambda search is on.");
     glm.hide("_nlambdas", "only applies if lambda search is on.");
   }
   if (_link != Link.family_default) { // check we have compatible link
     switch (_family) {
       case gaussian:
         if (_link != Link.identity && _link != Link.log && _link != Link.inverse)
           throw new IllegalArgumentException(
               "Incompatible link function for selected family. Only identity, log and inverse links are allowed for family=gaussian.");
         break;
       case binomial:
         if (_link
             != Link
                 .logit) // fixme: R also allows log, but it's not clear when can be applied and
           // what should we do in case the predictions are outside of 0/1.
           throw new IllegalArgumentException(
               "Incompatible link function for selected family. Only logit is allowed for family=binomial. Got "
                   + _link);
         break;
       case poisson:
         if (_link != Link.log && _link != Link.identity)
           throw new IllegalArgumentException(
               "Incompatible link function for selected family. Only log and identity links are allowed for family=poisson.");
         break;
       case gamma:
         if (_link != Link.inverse && _link != Link.log && _link != Link.identity)
           throw new IllegalArgumentException(
               "Incompatible link function for selected family. Only inverse, log and identity links are allowed for family=gamma.");
         break;
       case tweedie:
         if (_link != Link.tweedie)
           throw new IllegalArgumentException(
               "Incompatible link function for selected family. Only tweedie link allowed for family=tweedie.");
         break;
       case multinomial:
         if (_link != Link.multinomial)
           throw new IllegalArgumentException(
               "Incompatible link function for selected family. Only multinomial link allowed for family=multinomial.");
         break;
       default:
         H2O.fail();
     }
   }
 }
Example #30
 private void chkFr(Frame fr, int col, int row, double exp, double tol) {
   if (Double.isNaN(exp)) Assert.assertTrue(fr.vec(col).isNA(row));
   else Assert.assertEquals(exp, fr.vec(col).at(row), tol);
 }