@Test
public void testDomains() {
  Frame frame = parse_test_file("smalldata/junit/weather.csv");
  // Convert a few numeric columns to categorical in place.
  for (String s : new String[] {"MaxWindSpeed", "RelHumid9am", "Cloud9am"}) {
    Vec v = frame.vec(s);
    Vec newV = v.toCategoricalVec();
    frame.remove(s);
    frame.add(s, newV);
    v.remove();
  }
  DKV.put(frame);
  AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
  parms._train = frame._key;
  parms._radius_scale = 10;
  AggregatorModel agg = new Aggregator(parms).trainModel().get();
  Frame output = agg._output._output_frame.get();
  Assert.assertTrue(output.numRows() < 0.5 * frame.numRows());
  boolean same = true;
  for (int i = 0; i < frame.numCols(); ++i) {
    if (frame.vec(i).isCategorical()) {
      same = (frame.domains()[i].length == output.domains()[i].length);
      if (!same) break;
    }
  }
  frame.remove();
  output.remove();
  agg.remove();
  // The aggregated output keeps only exemplar rows, so at least one categorical
  // column is expected to end up with a smaller domain; the domains must differ.
  Assert.assertFalse(same);
}
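// A hypothetical helper (not part of the original test suite) capturing the
// convert-in-place pattern used in testDomains() above: swap a numeric column
// for its categorical copy and delete the old Vec so it is not leaked.
private static void toCategoricalInPlace(Frame frame, String col) {
  Vec v = frame.vec(col);
  Vec newV = v.toCategoricalVec();
  frame.remove(col);
  frame.add(col, newV);
  v.remove();
}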
@Test
public void testImpute() {
  Frame fr = null;
  try {
    // Impute fuel economy via the "mean" method, with no group-by columns.
    String tree = "(h2o.impute hex 1 \"mean\" \"low\" [])";
    fr = chkTree(tree, "smalldata/junit/cars.csv");
    chkDim(fr, 8, 406);
    Assert.assertEquals(0, fr.vec(1).naCnt()); // No NAs anymore
    Assert.assertEquals(23.51, fr.vec(1).at(26), 1e-1); // Row 26 was an NA, now the overall mean economy
    fr.delete();

    // Impute fuel economy via the "mean" method, after grouping by year. Update in place.
    tree = "(h2o.impute hex 1 \"mean\" \"low\" [7])";
    fr = chkTree(tree, "smalldata/junit/cars.csv");
    chkDim(fr, 8, 406);
    Assert.assertEquals(0, fr.vec(1).naCnt()); // No NAs anymore
    Assert.assertEquals(17.69, fr.vec(1).at(26), 1e-1); // Row 26 was an NA, now the 1970 mean economy
  } finally {
    if (fr != null) fr.delete();
    Keyed.remove(Key.make("hex"));
  }
}
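// Sketch of the chkDim helper referenced above, under the assumption that it
// simply asserts frame dimensions (the real helper is defined elsewhere in
// this test class).
private static void chkDim(Frame fr, int cols, int rows) {
  Assert.assertEquals(cols, fr.numCols());
  Assert.assertEquals(rows, fr.numRows());
}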
private static void assertColFrameEquals(double[] expected, Frame actual) {
  assertEquals(1, actual.numCols());
  assertEquals(expected.length, actual.numRows());
  for (int i = 0; i < expected.length; i++) {
    assertEquals("Wrong sum in row " + i, expected[i], actual.vec(0).at(i), 1e-8);
  }
}
private void setTransform(
    TransformType t, double[] normMul, double[] normSub, int vecStart, int n) {
  int idx = 0; // idx != i when interactions are in play; otherwise it's just 'i'
  for (int i = 0; i < n; ++i) {
    Vec v = _adaptedFrame.vec(vecStart + i);
    boolean isIWV = isInteractionVec(vecStart + i);
    switch (t) {
      case STANDARDIZE:
        normMul[idx] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
        if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
        normSub[idx] = v.mean();
        break;
      case NORMALIZE:
        normMul[idx] = (v.max() - v.min() > 0) ? 1.0 / (v.max() - v.min()) : 1.0;
        if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
        normSub[idx] = v.mean();
        break;
      case DEMEAN:
        normMul[idx] = 1;
        if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
        normSub[idx] = v.mean();
        break;
      case DESCALE:
        normMul[idx] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
        if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
        normSub[idx] = 0;
        break;
      default:
        throw H2O.unimpl();
    }
    assert !Double.isNaN(normMul[idx]);
    assert !Double.isNaN(normSub[idx]);
    // Interaction vecs expand into several numeric columns; skip the whole block.
    idx = isIWV ? (idx + nextNumericIdx(i)) : (idx + 1);
  }
}
private void setTransform(
    TransformType t, double[] normMul, double[] normSub, int vecStart, int n) {
  for (int i = 0; i < n; ++i) {
    Vec v = _adaptedFrame.vec(vecStart + i);
    switch (t) {
      case STANDARDIZE:
        normMul[i] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
        normSub[i] = v.mean();
        break;
      case NORMALIZE:
        normMul[i] = (v.max() - v.min() > 0) ? 1.0 / (v.max() - v.min()) : 1.0;
        normSub[i] = v.mean();
        break;
      case DEMEAN:
        normMul[i] = 1;
        normSub[i] = v.mean();
        break;
      case DESCALE:
        normMul[i] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
        normSub[i] = 0;
        break;
      default:
        throw H2O.unimpl();
    }
    assert !Double.isNaN(normMul[i]);
    assert !Double.isNaN(normSub[i]);
  }
}
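// Minimal sketch of how the (normSub, normMul) pair produced by both
// setTransform variants above is consumed downstream (see extractDenseRow):
// a transformed value is (x - normSub[i]) * normMul[i]. The method name here
// is illustrative, not part of DataInfo's API.
static double applyTransform(double x, int i, double[] normSub, double[] normMul) {
  return (x - normSub[i]) * normMul[i]; // e.g. STANDARDIZE: (x - mean) / sigma
}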
private static void assertRowFrameEquals(double[] expected, Frame actual) {
  assertEquals(1, actual.numRows());
  assertEquals(expected.length, actual.numCols());
  for (int i = 0; i < expected.length; i++) {
    assertEquals("Wrong sum in column " + actual.name(i), expected[i], actual.vec(i).at(0), 1e-8);
  }
}
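// Illustrative use of the two helpers above (frame names and values are
// placeholders): a 1xN frame of column sums checks against
// assertRowFrameEquals, an Nx1 frame of row sums against assertColFrameEquals.
assertRowFrameEquals(new double[] {6.0, 15.0}, colSums);
assertColFrameEquals(new double[] {5.0, 7.0, 9.0}, rowSums);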
/**
 * Project each archetype into the original feature space.
 *
 * @param frame Original training data with m rows and n columns
 * @param destination_key Frame Id for output
 * @param reverse_transform If true, undo the training-time normalization on numeric columns
 * @return Frame containing k rows and n columns, where each row corresponds to an archetype
 */
public Frame scoreArchetypes(Frame frame, Key destination_key, boolean reverse_transform) {
  final int ncols = _output._names.length;
  Frame adaptedFr = new Frame(frame);
  adaptTestForTrain(adaptedFr, true, false);
  assert ncols == adaptedFr.numCols();
  String[][] adaptedDomme = adaptedFr.domains();
  double[][] proj = new double[_parms._k][_output._nnums + _output._ncats];

  // Categorical columns
  for (int d = 0; d < _output._ncats; d++) {
    double[][] block = _output._archetypes_raw.getCatBlock(d);
    for (int k = 0; k < _parms._k; k++)
      proj[k][_output._permutation[d]] = _parms.mimpute(block[k], _output._lossFunc[d]);
  }

  // Numeric columns
  for (int d = _output._ncats; d < (_output._ncats + _output._nnums); d++) {
    int ds = d - _output._ncats;
    for (int k = 0; k < _parms._k; k++) {
      double num = _output._archetypes_raw.getNum(ds, k);
      proj[k][_output._permutation[d]] = _parms.impute(num, _output._lossFunc[d]);
      if (reverse_transform)
        proj[k][_output._permutation[d]] =
            proj[k][_output._permutation[d]] / _output._normMul[ds] + _output._normSub[ds];
    }
  }

  // Convert the projection of archetypes into a frame with correct domains
  Frame f =
      ArrayUtils.frame(
          (null == destination_key ? Key.make() : destination_key), adaptedFr.names(), proj);
  for (int i = 0; i < ncols; i++) f.vec(i).setDomain(adaptedDomme[i]);
  return f;
}
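// Hypothetical usage sketch: recover the k archetypes of a trained GLRM model
// in the units of the original data ("model" and "train" are assumed to be in
// scope).
Frame archetypes = model.scoreArchetypes(train, Key.make("archetypes"), /* reverse_transform = */ true);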
// private constructor called by filterExpandedColumns
private DataInfo(
    Key<DataInfo> selfKey,
    Frame fr,
    double[] normMul,
    double[] normSub,
    int[][] catLevels,
    int responses,
    TransformType predictor_transform,
    TransformType response_transform,
    boolean skipMissing,
    boolean imputeMissing,
    boolean weight,
    boolean offset,
    boolean fold) {
  super(selfKey);
  _offset = offset;
  _weights = weight;
  _fold = fold;
  _valid = false;
  assert predictor_transform != null;
  assert response_transform != null;
  _predictor_transform = predictor_transform;
  _response_transform = response_transform;
  _skipMissing = skipMissing;
  _imputeMissing = imputeMissing;
  _adaptedFrame = fr;
  _catOffsets = MemoryManager.malloc4(catLevels.length + 1);
  _catMissing = new int[catLevels.length];
  int s = 0;
  for (int i = 0; i < catLevels.length; ++i) {
    _catOffsets[i] = s;
    s += catLevels[i].length;
  }
  _catLvls = catLevels;
  _catOffsets[_catOffsets.length - 1] = s;
  _responses = responses;
  _cats = catLevels.length;
  _nums =
      fr.numCols() - _cats - responses - (_offset ? 1 : 0) - (_weights ? 1 : 0) - (_fold ? 1 : 0);
  _useAllFactorLevels = true;
  _catModes = new int[_cats];
  _numMeans = new double[_nums];
  _normMul = normMul;
  _normSub = normSub;
  for (int i = 0; i < _cats; i++) _catModes[i] = imputeCat(_adaptedFrame.vec(i));
  for (int i = 0; i < _nums; i++) _numMeans[i] = _adaptedFrame.vec(_cats + i).mean();
}
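// Standalone sketch of the prefix-sum layout built above: catOffsets[i] is the
// first expanded-column index of categorical i, and catOffsets[catLevels.length]
// is the total width of the expanded categorical block.
static int[] catOffsets(int[][] catLevels) {
  int[] off = new int[catLevels.length + 1];
  for (int i = 0; i < catLevels.length; i++) off[i + 1] = off[i] + catLevels[i].length;
  return off;
}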
@Test
public void testQuantile() {
  Frame f = null;
  try {
    Frame fr =
        frame(
            ard(
                ard(1.223292e-02),
                ard(1.635312e-25),
                ard(1.601522e-11),
                ard(8.452298e-10),
                ard(2.643733e-10),
                ard(2.671520e-06),
                ard(1.165381e-06),
                ard(7.193265e-10),
                ard(3.383532e-04),
                ard(2.561221e-05)));
    double[] probs = new double[] {0.001, 0.005, .01, .02, .05, .10, .50, .8883, .90, .99};
    String x =
        String.format("(quantile %%%s %s \"interpolate\")", fr._key, Arrays.toString(probs));
    Val val = Exec.exec(x);
    fr.delete();
    f = val.getFrame();
    Assert.assertEquals(2, f.numCols());
    // Expected values computed as golden values from R's quantile call
    double[] exp =
        ard(
            1.4413698000016206E-13,
            7.206849000001562E-13,
            1.4413698000001489E-12,
            2.882739600000134E-12,
            7.20684900000009E-12,
            1.4413698000000017E-11,
            5.831131148999999E-07,
            3.3669567275300000E-04,
            0.00152780988,
            0.011162408988);
    for (int i = 0; i < exp.length; i++)
      Assert.assertTrue(
          "expected " + exp[i] + " got " + f.vec(1).at(i),
          water.util.MathUtils.compare(exp[i], f.vec(1).at(i), 1e-6, 1e-6));
  } finally {
    if (f != null) f.delete();
  }
}
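// Generic shape of the Rapids call exercised above ("someFrame" is a
// placeholder): the result frame carries the requested probabilities in
// column 0 and the interpolated quantile values in column 1.
String rapids = String.format("(quantile %%%s [0.25, 0.5, 0.75] \"interpolate\")", someFrame._key);
Frame quantiles = Exec.exec(rapids).getFrame();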
public final Row extractDenseRow(double[] vals, Row row) {
  row.bad = false;
  row.rid = 0;
  row.cid = 0;
  if (row.weight == 0) return row;

  if (_skipMissing)
    for (double d : vals)
      if (Double.isNaN(d)) {
        row.bad = true;
        return row;
      }

  int nbins = 0;
  for (int i = 0; i < _cats; ++i) {
    int c = getCategoricalId(i, Double.isNaN(vals[i]) ? _catModes[i] : (int) vals[i]);
    if (c >= 0) row.binIds[nbins++] = c;
  }
  row.nBins = nbins;
  final int n = _nums;
  int numValsIdx = 0;
  for (int i = 0; i < n; ++i) {
    if (isInteractionVec(i)) {
      int offset;
      InteractionWrappedVec iwv = ((InteractionWrappedVec) _adaptedFrame.vec(_cats + i));
      int v1 = _adaptedFrame.find(iwv.v1());
      int v2 = _adaptedFrame.find(iwv.v2());
      if (v1 < _cats)
        offset = getCategoricalId(v1, Double.isNaN(vals[v1]) ? _catModes[v1] : (int) vals[v1]);
      else if (v2 < _cats)
        // use v2's own mode when v2 is the categorical side
        offset = getCategoricalId(v2, Double.isNaN(vals[v2]) ? _catModes[v2] : (int) vals[v2]);
      else offset = 0;
      row.numVals[numValsIdx + offset] = vals[_cats + i]; // essentially: vals[v1] * vals[v2]
      numValsIdx += nextNumericIdx(i);
    } else {
      double d = vals[_cats + i]; // can be NA if skipMissing() == false
      if (Double.isNaN(d)) d = _numMeans[numValsIdx];
      if (_normMul != null && _normSub != null)
        d = (d - _normSub[numValsIdx]) * _normMul[numValsIdx];
      row.numVals[numValsIdx++] = d;
    }
  }
  int off = responseChunkId(0);
  for (int i = off; i < Math.min(vals.length, off + _responses); ++i) {
    try {
      row.response[i] = vals[responseChunkId(i)];
    } catch (Throwable t) {
      throw new RuntimeException(t);
    }
    if (_normRespMul != null)
      row.response[i] = (row.response[i] - _normRespSub[i]) * _normRespMul[i];
    if (Double.isNaN(row.response[i])) {
      row.bad = true;
      return row;
    }
  }
  return row;
}
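// Hedged usage sketch for extractDenseRow above: score one observation whose
// values are laid out [categoricals..., numerics..., response], matching the
// adapted frame. "dinfo" is an existing DataInfo; newDenseRow() is assumed to
// allocate a Row of matching shape, as in the surrounding API.
double[] vals = new double[dinfo._adaptedFrame.numCols()];
// ... fill vals for a single observation ...
DataInfo.Row row = dinfo.extractDenseRow(vals, dinfo.newDenseRow());
if (!row.bad) {
  // row.binIds[0..row.nBins) holds expanded categorical ids,
  // row.numVals holds the (possibly standardized) numeric block.
}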
@Test
public void testCategoricalProstate() throws InterruptedException, ExecutionException {
  GLRM job = null;
  GLRMModel model = null;
  Frame train = null;
  final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS
  try {
    Scope.enter();
    train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
    for (int i = 0; i < cats.length; i++)
      Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
    train.remove("ID").remove();
    DKV.put(train._key, train);

    GLRMParameters parms = new GLRMParameters();
    parms._train = train._key;
    parms._k = 8;
    parms._gamma_x = parms._gamma_y = 0.1;
    parms._regularization_x = GLRMModel.GLRMParameters.Regularizer.Quadratic;
    parms._regularization_y = GLRMModel.GLRMParameters.Regularizer.Quadratic;
    parms._init = GLRM.Initialization.PlusPlus;
    parms._transform = DataInfo.TransformType.STANDARDIZE;
    parms._recover_svd = false;
    parms._max_iterations = 200;

    try {
      job = new GLRM(parms);
      model = job.trainModel().get();
      Log.info(
          "Iteration " + model._output._iterations
              + ": Objective value = " + model._output._objective);
      model.score(train).delete();
      ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
      Log.info(
          "Numeric Sum of Squared Error = " + mm._numerr
              + "\tCategorical Misclassification Error = " + mm._caterr);
    } catch (Throwable t) {
      t.printStackTrace();
      throw new RuntimeException(t);
    } finally {
      job.remove();
    }
  } catch (Throwable t) {
    t.printStackTrace();
    throw new RuntimeException(t);
  } finally {
    if (train != null) train.delete();
    if (model != null) model.delete();
    Scope.exit();
  }
}
// private constructor called by filterExpandedColumns
private DataInfo(
    DataInfo dinfo,
    Frame fr,
    double[] normMul,
    double[] normSub,
    int[][] catLevels,
    int[] catModes) {
  _fullCatOffsets = dinfo._catOffsets;
  if (!dinfo._useAllFactorLevels) {
    _fullCatOffsets = dinfo._catOffsets.clone();
    for (int i = 0; i < _fullCatOffsets.length; ++i)
      _fullCatOffsets[i] += i; // add for the skipped zeros
  }
  _offset = dinfo._offset;
  _weights = dinfo._weights;
  _fold = dinfo._fold;
  _valid = false;
  _interactions = dinfo._interactions;
  _interactionVecs = dinfo._interactionVecs;
  assert dinfo._predictor_transform != null;
  assert dinfo._response_transform != null;
  _predictor_transform = dinfo._predictor_transform;
  _response_transform = dinfo._response_transform;
  _skipMissing = dinfo._skipMissing;
  _imputeMissing = dinfo._imputeMissing;
  _adaptedFrame = fr;
  _catOffsets = MemoryManager.malloc4(catLevels.length + 1);
  _catMissing = new boolean[catLevels.length];
  Arrays.fill(_catMissing, !(dinfo._imputeMissing || dinfo._skipMissing));
  int s = 0;
  for (int i = 0; i < catLevels.length; ++i) {
    _catOffsets[i] = s;
    s += catLevels[i].length;
  }
  _catLvls = catLevels;
  _catOffsets[_catOffsets.length - 1] = s;
  _responses = dinfo._responses;
  _cats = catLevels.length;
  _nums =
      fr.numCols() - _cats - dinfo._responses
          - (_offset ? 1 : 0) - (_weights ? 1 : 0) - (_fold ? 1 : 0);
  _numOffsets = _nums == 0 ? new int[0] : dinfo._numOffsets.clone();
  int diff = _numOffsets.length > 0 ? _numOffsets[0] - s : 0;
  for (int i = 0; i < _numOffsets.length; i++) // need to shift everyone down by the offset!
    _numOffsets[i] -= diff;
  _useAllFactorLevels = true; // dinfo._useAllFactorLevels;
  _numMeans = new double[_nums];
  _normMul = normMul;
  _normSub = normSub;
  _catModes = catModes;
  for (int i = 0; i < _nums; i++) _numMeans[i] = _adaptedFrame.vec(_cats + i).mean();
}
public final int getCategoricalIdFromInteraction(int cid, int val) {
  InteractionWrappedVec v;
  if ((v = (InteractionWrappedVec) _adaptedFrame.vec(cid)).isCategorical())
    return getCategoricalId(cid, val);
  assert v.domains() != null
      : "No domain levels found for interactions! cid: " + cid + " val: " + val;
  if (val >= _numOffsets[cid + 1]) { // previously unseen interaction (aka new domain level)
    assert _valid
        : "interaction value out of bounds, got " + val
            + ", next cat starts at " + _numOffsets[cid + 1];
    val = v.mode();
  }
  return val < 0 ? -1 : val + _numOffsets[cid];
}
/**
 * Applies a stacked autoencoders model to a given dataset and makes predictions.
 *
 * @param ctxt JavaSparkContext
 * @param deeplearningModel Stacked autoencoders model
 * @param test Testing dataset as a JavaRDD of labeled points
 * @return JavaPairRDD of (prediction, actual label) pairs
 */
public JavaPairRDD<Double, Double> test(
    JavaSparkContext ctxt,
    final DeepLearningModel deeplearningModel,
    JavaRDD<LabeledPoint> test,
    MLModel mlModel)
    throws MLModelBuilderException {
  Scope.enter();
  if (deeplearningModel == null) {
    throw new MLModelBuilderException("DeeplearningModel is Null");
  }
  int numberOfFeatures = mlModel.getFeatures().size();
  List<Feature> features = mlModel.getFeatures();
  String[] names = new String[numberOfFeatures + 1];
  for (int i = 0; i < numberOfFeatures; i++) {
    names[i] = features.get(i).getName();
  }
  names[numberOfFeatures] = mlModel.getResponseVariable();

  Frame testData = DeeplearningModelUtils.javaRDDToFrame(names, test);
  Frame testDataWithoutLabels = testData.subframe(0, testData.numCols() - 1);
  int numRows = (int) testDataWithoutLabels.numRows();
  Vec predictionsVector = deeplearningModel.score(testDataWithoutLabels).vec(0);
  double[] predictionValues = new double[numRows];
  for (int i = 0; i < numRows; i++) {
    predictionValues[i] = predictionsVector.at(i);
  }
  Vec labelsVector = testData.vec(testData.numCols() - 1);
  double[] labels = new double[numRows];
  for (int i = 0; i < numRows; i++) {
    labels[i] = labelsVector.at(i);
  }
  Scope.exit();

  ArrayList<Tuple2<Double, Double>> tupleList = new ArrayList<Tuple2<Double, Double>>();
  for (int i = 0; i < labels.length; i++) {
    tupleList.add(new Tuple2<Double, Double>(predictionValues[i], labels[i]));
  }
  return ctxt.parallelizePairs(tupleList);
}
public int getInteractionOffset(Chunk[] chunks, int cid, int rid) {
  int v1 = -1, v2 = -1;
  if (_adaptedFrame == null) {
    Vec vec1 = ((InteractionWrappedVec) chunks[cid].vec()).v1();
    Vec vec2 = ((InteractionWrappedVec) chunks[cid].vec()).v2();
    for (int i = 0; i < chunks.length; ++i) {
      if (v1 >= 0 && v2 >= 0) break; // found both vecs already
      if (v1 == -1 && chunks[i].vec() == vec1) v1 = i;
      if (v2 == -1 && chunks[i].vec() == vec2) v2 = i;
    }
  } else {
    InteractionWrappedVec iwv = ((InteractionWrappedVec) _adaptedFrame.vec(cid));
    v1 = _adaptedFrame.find(iwv.v1());
    v2 = _adaptedFrame.find(iwv.v2());
  }
  if (v1 < _cats) return (int) chunks[v1].at8(rid); // v1 is some categorical column
  else if (v2 < _cats) return (int) chunks[v2].at8(rid); // or v2 is some categorical column
  return 0; // or neither is categorical
}
private void checkTree(String tree, boolean expectThrow) {
  // Frame r = frame(new double[][]{{-1},{1},{2},{3},{4},{5},{6},{254}});
  // Key ahex = Key.make("a.hex");
  // Frame fr = new Frame(ahex, null, new Vec[]{r.remove(0)});
  // r.delete();
  // DKV.put(ahex, fr);
  Frame fr = parse_test_file(Key.make("a.hex"), "smalldata/iris/iris_wheader.csv");
  fr.remove(4).remove();
  try {
    Val val = Exec.exec(tree);
    Assert.assertFalse(expectThrow);
    System.out.println(val.toString());
    if (val instanceof ValFrame) {
      Frame fr2 = ((ValFrame) val)._fr;
      System.out.println(fr2.vec(0));
      fr2.remove();
    }
  } catch (IllegalArgumentException iae) {
    if (!expectThrow) throw iae;
  } finally {
    fr.delete();
  }
}
public Vec getOutputVec(int i) { return _adaptedFrame.vec(outputChunkId(i)); }
/**
 * The train/valid Frame instances are sorted by categorical (themselves sorted by cardinality
 * greatest to least) with all numerical columns following. The response column(s) are placed at
 * the end.
 *
 * <p>Interactions: 1. Num-Num (Note: N(0,1) * N(0,1) ~ N(0,1)) 2. Num-Enum 3. Enum-Enum
 *
 * <p>Interactions are produced on the fly and are dense (in all 3 cases). Consumers of DataInfo
 * should not have to care how these interactions are generated. Any heuristic using the fullN
 * value should continue functioning the same.
 *
 * <p>Interactions are specified in two ways: A. As a list of pairs of column indices. B. As a
 * list of pairs of column indices with limited enums.
 */
public DataInfo(
    Frame train,
    Frame valid,
    int nResponses,
    boolean useAllFactorLevels,
    TransformType predictor_transform,
    TransformType response_transform,
    boolean skipMissing,
    boolean imputeMissing,
    boolean missingBucket,
    boolean weight,
    boolean offset,
    boolean fold,
    Model.InteractionPair[] interactions) {
  super(Key.<DataInfo>make());
  _valid = valid != null;
  assert predictor_transform != null;
  assert response_transform != null;
  _offset = offset;
  _weights = weight;
  _fold = fold;
  assert !(skipMissing && imputeMissing) : "skipMissing and imputeMissing cannot both be true";
  _skipMissing = skipMissing;
  _imputeMissing = imputeMissing;
  _predictor_transform = predictor_transform;
  _response_transform = response_transform;
  _responses = nResponses;
  _useAllFactorLevels = useAllFactorLevels;
  _interactions = interactions;

  // create dummy InteractionWrappedVecs and shove them onto the front
  if (_interactions != null) {
    _interactionVecs = new int[_interactions.length];
    train =
        Model.makeInteractions(
                train,
                false,
                _interactions,
                _useAllFactorLevels,
                _skipMissing,
                predictor_transform == TransformType.STANDARDIZE)
            .add(train);
    if (valid != null)
      valid =
          Model.makeInteractions(
                  valid,
                  true,
                  _interactions,
                  _useAllFactorLevels,
                  _skipMissing,
                  predictor_transform == TransformType.STANDARDIZE)
              .add(valid); // FIXME: should be using the training subs/muls!
  }

  _permutation = new int[train.numCols()];
  final Vec[] tvecs = train.vecs();

  // Count categorical-vs-numerical
  final int n = tvecs.length - _responses - (offset ? 1 : 0) - (weight ? 1 : 0) - (fold ? 1 : 0);
  int[] nums = MemoryManager.malloc4(n);
  int[] cats = MemoryManager.malloc4(n);
  int nnums = 0, ncats = 0;
  for (int i = 0; i < n; ++i)
    if (tvecs[i].isCategorical()) cats[ncats++] = i;
    else nums[nnums++] = i;
  _nums = nnums;
  _cats = ncats;
  _catLvls = new int[ncats][];

  // sort the cats in decreasing order of their cardinality
  for (int i = 0; i < ncats; ++i)
    for (int j = i + 1; j < ncats; ++j)
      if (tvecs[cats[i]].domain().length < tvecs[cats[j]].domain().length) {
        int x = cats[i];
        cats[i] = cats[j];
        cats[j] = x;
      }

  String[] names = new String[train.numCols()];
  Vec[] tvecs2 = new Vec[train.numCols()];

  // Compute the cardinality of each cat
  _catModes = new int[ncats];
  _catOffsets = MemoryManager.malloc4(ncats + 1);
  _catMissing = new boolean[ncats];
  int len = _catOffsets[0] = 0;
  int interactionIdx = 0; // simple index into the _interactionVecs array
  ArrayList<Integer> interactionIds;
  if (_interactions == null) {
    interactionIds = new ArrayList<>();
    for (int i = 0; i < tvecs.length; ++i)
      if (tvecs[i] instanceof InteractionWrappedVec) {
        interactionIds.add(i);
      }
    _interactionVecs = new int[interactionIds.size()];
    for (int i = 0; i < _interactionVecs.length; ++i) _interactionVecs[i] = interactionIds.get(i);
  }
  for (int i = 0; i < ncats; ++i) {
    names[i] = train._names[cats[i]];
    Vec v = (tvecs2[i] = tvecs[cats[i]]);
    _catMissing[i] = missingBucket; // needed for test time
    if (v instanceof InteractionWrappedVec) {
      if (_interactions != null) _interactions[interactionIdx].vecIdx = i;
      _interactionVecs[interactionIdx++] =
          i; // i (and not cats[i]) because this is the index in _adaptedFrame
      _catOffsets[i + 1] = (len += v.domain().length + (missingBucket ? 1 : 0));
    } else
      _catOffsets[i + 1] =
          (len +=
              v.domain().length
                  - (useAllFactorLevels ? 0 : 1)
                  + (missingBucket ? 1 : 0)); // missing values turn into a new factor level
    _catModes[i] =
        imputeMissing ? imputeCat(train.vec(cats[i])) : _catMissing[i] ? v.domain().length : -100;
    _permutation[i] = cats[i];
  }
  _numMeans = new double[nnums];
  _numOffsets = MemoryManager.malloc4(nnums + 1);
  _numOffsets[0] = len;
  boolean isIWV; // is InteractionWrappedVec?
  for (int i = 0; i < nnums; ++i) {
    names[i + ncats] = train._names[nums[i]];
    Vec v = train.vec(nums[i]);
    tvecs2[i + ncats] = v;
    isIWV = v instanceof InteractionWrappedVec;
    if (isIWV) {
      if (null != _interactions) _interactions[interactionIdx].vecIdx = i + ncats;
      _interactionVecs[interactionIdx++] = i + ncats;
    }
    _numOffsets[i + 1] = (len += (isIWV ? ((InteractionWrappedVec) v).expandedLength() : 1));
    _numMeans[i] = train.vec(nums[i]).mean();
    _permutation[i + ncats] = nums[i];
  }
  for (int i = names.length - nResponses - (weight ? 1 : 0) - (offset ? 1 : 0) - (fold ? 1 : 0);
      i < names.length;
      ++i) {
    names[i] = train._names[i];
    tvecs2[i] = train.vec(i);
  }
  _adaptedFrame = new Frame(names, tvecs2);
  train.restructure(names, tvecs2);
  if (valid != null) valid.restructure(names, valid.vecs(names)); // _adaptedFrame = train;
  setPredictorTransform(predictor_transform);
  if (_responses > 0) setResponseTransform(response_transform);
}
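// Illustrative alternative (not used by the constructor above) to its nested
// selection sort: order categorical column indices by descending domain size
// with a comparator. Boxing overhead is ignored for clarity.
static Integer[] sortByDescendingCardinality(final Vec[] tvecs, int[] cats, int ncats) {
  Integer[] order = new Integer[ncats];
  for (int i = 0; i < ncats; i++) order[i] = cats[i];
  Arrays.sort(order, (a, b) -> tvecs[b].domain().length - tvecs[a].domain().length);
  return order;
}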
public boolean isInteractionVec(int colid) {
  if (null == _interactions || null == _interactionVecs) return false;
  if (_adaptedFrame != null) return _adaptedFrame.vec(colid) instanceof InteractionWrappedVec;
  else return Arrays.binarySearch(_interactionVecs, colid) >= 0;
}
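// Note (an assumption, not stated in the original): the binarySearch fallback
// above is only correct if _interactionVecs is sorted ascending; the
// constructors fill it in increasing column order, which preserves that.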
@Test
public void testLosses() throws InterruptedException, ExecutionException {
  long seed = 0xDECAF;
  Random rng = new Random(seed);
  Frame train = null;
  final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS
  final GLRMParameters.Regularizer[] regs =
      new GLRMParameters.Regularizer[] {
        GLRMParameters.Regularizer.Quadratic,
        GLRMParameters.Regularizer.L1,
        GLRMParameters.Regularizer.NonNegative,
        GLRMParameters.Regularizer.OneSparse,
        GLRMParameters.Regularizer.UnitOneSparse,
        GLRMParameters.Regularizer.Simplex
      };

  Scope.enter();
  try {
    train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
    for (int i = 0; i < cats.length; i++)
      Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
    train.remove("ID").remove();
    DKV.put(train._key, train);

    for (GLRMParameters.Loss loss :
        new GLRMParameters.Loss[] {
          GLRMParameters.Loss.Quadratic,
          GLRMParameters.Loss.Absolute,
          GLRMParameters.Loss.Huber,
          GLRMParameters.Loss.Poisson,
          GLRMParameters.Loss.Hinge,
          GLRMParameters.Loss.Logistic
        }) {
      for (GLRMParameters.Loss multiloss :
          new GLRMParameters.Loss[] {
            GLRMParameters.Loss.Categorical, GLRMParameters.Loss.Ordinal
          }) {
        GLRMModel model = null;
        try {
          Scope.enter();
          long myseed = rng.nextLong();
          Log.info("GLRM using seed = " + myseed);

          GLRMParameters parms = new GLRMParameters();
          parms._train = train._key;
          parms._transform = DataInfo.TransformType.NONE;
          parms._k = 5;
          parms._loss = loss;
          parms._multi_loss = multiloss;
          parms._init = GLRM.Initialization.SVD;
          parms._regularization_x = regs[rng.nextInt(regs.length)];
          parms._regularization_y = regs[rng.nextInt(regs.length)];
          parms._gamma_x = Math.abs(rng.nextDouble());
          parms._gamma_y = Math.abs(rng.nextDouble());
          parms._recover_svd = false;
          parms._seed = myseed;
          parms._verbose = false;
          parms._max_iterations = 500;

          GLRM job = new GLRM(parms);
          try {
            model = job.trainModel().get();
            Log.info(
                "Iteration " + model._output._iterations
                    + ": Objective value = " + model._output._objective);
            model.score(train).delete();
            ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
            Log.info(
                "Numeric Sum of Squared Error = " + mm._numerr
                    + "\tCategorical Misclassification Error = " + mm._caterr);
          } finally {
            job.remove();
          }
        } catch (Throwable t) {
          t.printStackTrace();
          throw new RuntimeException(t);
        } finally {
          if (model != null) model.delete();
          Scope.exit();
        }
      }
    }
  } finally {
    if (train != null) train.delete();
    Scope.exit();
  }
}
@Override
protected void compute2() {
  _model = null; // Resulting model!
  try {
    Scope.enter(); // Cleanup temp keys
    init(true); // Do any expensive tests & conversions now
    // Do lock even before checking the errors, since this block is finalized by unlock
    // (not the best solution, but the code is more readable)
    _parms.read_lock_frames(SharedTree.this); // Fetch & read-lock input frames
    if (error_count() > 0)
      throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(SharedTree.this);

    // New Model? Or continuing from a checkpoint?
    if (_parms._checkpoint && DKV.get(_parms._model_id) != null) {
      _model = DKV.get(_dest).get();
      _model.write_lock(_key); // do not delete previous model; we are extending it
    } else { // New Model
      // Compute the zero-tree error - guessing only the class distribution.
      // MSE is stddev squared when guessing for regression.
      // For classification, guess the largest class.
      _model =
          makeModel(
              _dest,
              _parms,
              initial_MSE(_response, _response),
              _valid == null ? Double.NaN : initial_MSE(_response, _vresponse)); // Make a fresh model
      _model.delete_and_lock(_key); // and clear & write-lock it (smashing any prior)
      _model._output._init_f = _initialPrediction;
    }

    // Compute the response domain; makes for nicer printouts
    String[] domain = _response.domain();
    assert (_nclass > 1 && domain != null) || (_nclass == 1 && domain == null);
    if (_nclass == 1) domain = new String[] {"r"}; // For regression, give a name to class 0

    // Compute class distribution, used for initial guesses and to
    // upsample minority classes (if asked for).
    if (_nclass > 1) { // Classification?
      // Handle imbalanced classes by stratified over/under-sampling.
      // initWorkFrame sets the modeled class distribution, and
      // model.score() corrects the probabilities back using the
      // distribution ratios
      if (_model._output.isClassifier() && _parms._balance_classes) {
        float[] trainSamplingFactors =
            new float[_train.lastVec().domain().length]; // leave initialized to 0 -> will be filled up below
        if (_parms._class_sampling_factors != null) {
          if (_parms._class_sampling_factors.length != _train.lastVec().domain().length)
            throw new IllegalArgumentException(
                "class_sampling_factors must have "
                    + _train.lastVec().domain().length
                    + " elements");
          trainSamplingFactors =
              _parms._class_sampling_factors.clone(); // clone: don't modify the original
        }
        Frame stratified =
            water.util.MRUtils.sampleFrameStratified(
                _train,
                _train.lastVec(),
                _train.vec(_model._output.weightsName()),
                trainSamplingFactors,
                (long) (_parms._max_after_balance_size * _train.numRows()),
                _parms._seed,
                true,
                false);
        if (stratified != _train) {
          _train = stratified;
          _response = stratified.vec(_parms._response_column);
          _weights = stratified.vec(_parms._weights_column);
          // Recompute distribution since the input frame was modified
          MRUtils.ClassDist cdmt2 =
              _weights != null
                  ? new MRUtils.ClassDist(_nclass).doAll(_response, _weights)
                  : new MRUtils.ClassDist(_nclass).doAll(_response);
          _model._output._distribution = cdmt2.dist();
          _model._output._modelClassDist = cdmt2.rel_dist();
        }
      }
      Log.info("Prior class distribution: " + Arrays.toString(_model._output._priorClassDist));
      Log.info("Model class distribution: " + Arrays.toString(_model._output._modelClassDist));
    }

    // Also add to the basic working Frame these sets:
    //   nclass Vecs of current forest results (sum across all trees)
    //   nclass Vecs of working/temp data
    //   nclass Vecs of NIDs, allowing 1 tree per class

    // Current forest values: results of summing the prior M trees
    for (int i = 0; i < _nclass; i++) _train.add("Tree_" + domain[i], _response.makeZero());

    // Initial work columns. Set-before-use in the algos.
    for (int i = 0; i < _nclass; i++) _train.add("Work_" + domain[i], _response.makeZero());

    // One Tree per class, each tree needs a NIDs column. For empty classes use a -1
    // NID signifying an empty regression tree.
    for (int i = 0; i < _nclass; i++)
      _train.add(
          "NIDs_" + domain[i],
          _response.makeCon(
              _model._output._distribution == null
                  ? 0
                  : (_model._output._distribution[i] == 0 ? -1 : 0)));

    // Tag out rows missing the response column
    new ExcludeNAResponse().doAll(_train);

    // Variable importance: squared-error-improvement-per-variable-per-split
    _improvPerVar = new float[_ncols];

    // Sub-class tree-model-builder specific build code
    buildModel();
    done(); // Job done!
  } catch (Throwable t) {
    Job thisJob = DKV.getGet(_key);
    if (thisJob._state == JobState.CANCELLED) {
      Log.info("Job cancelled by user.");
    } else {
      t.printStackTrace();
      failed(t);
      throw t;
    }
  } finally {
    if (_model != null) _model.unlock(_key);
    _parms.read_unlock_frames(SharedTree.this);
    if (_model == null) Scope.exit();
    else {
      Scope.exit(
          _model._key,
          ModelMetrics.buildKey(_model, _parms.train()),
          ModelMetrics.buildKey(_model, _parms.valid()));
    }
  }
  tryComplete();
}
public DataInfo(
    Frame train,
    Frame valid,
    int nResponses,
    boolean useAllFactorLevels,
    TransformType predictor_transform,
    TransformType response_transform,
    boolean skipMissing,
    boolean imputeMissing,
    boolean missingBucket,
    boolean weight,
    boolean offset,
    boolean fold) {
  super(Key.<DataInfo>make());
  _valid = false;
  assert predictor_transform != null;
  assert response_transform != null;
  _offset = offset;
  _weights = weight;
  _fold = fold;
  assert !(skipMissing && imputeMissing) : "skipMissing and imputeMissing cannot both be true";
  _skipMissing = skipMissing;
  _imputeMissing = imputeMissing;
  _predictor_transform = predictor_transform;
  _response_transform = response_transform;
  _responses = nResponses;
  _useAllFactorLevels = useAllFactorLevels;
  _permutation = new int[train.numCols()];
  final Vec[] tvecs = train.vecs();

  // Count categorical-vs-numerical
  final int n = tvecs.length - _responses - (offset ? 1 : 0) - (weight ? 1 : 0) - (fold ? 1 : 0);
  int[] nums = MemoryManager.malloc4(n);
  int[] cats = MemoryManager.malloc4(n);
  int nnums = 0, ncats = 0;
  for (int i = 0; i < n; ++i)
    if (tvecs[i].isCategorical()) cats[ncats++] = i;
    else nums[nnums++] = i;
  _nums = nnums;
  _cats = ncats;
  _catLvls = new int[_cats][];

  // sort the cats in decreasing order of their cardinality
  for (int i = 0; i < ncats; ++i)
    for (int j = i + 1; j < ncats; ++j)
      if (tvecs[cats[i]].domain().length < tvecs[cats[j]].domain().length) {
        int x = cats[i];
        cats[i] = cats[j];
        cats[j] = x;
      }

  String[] names = new String[train.numCols()];
  Vec[] tvecs2 = new Vec[train.numCols()];

  // Compute the cardinality of each cat
  _catModes = new int[_cats];
  _catOffsets = MemoryManager.malloc4(ncats + 1);
  _catMissing = new int[ncats];
  int len = _catOffsets[0] = 0;
  for (int i = 0; i < ncats; ++i) {
    _catModes[i] = imputeCat(train.vec(cats[i]));
    _permutation[i] = cats[i];
    names[i] = train._names[cats[i]];
    Vec v = (tvecs2[i] = tvecs[cats[i]]);
    _catMissing[i] = missingBucket ? 1 : 0; // needed for test time
    _catOffsets[i + 1] =
        (len +=
            v.domain().length
                - (useAllFactorLevels ? 0 : 1)
                + (missingBucket ? 1 : 0)); // missing values turn into a new factor level
  }
  _numMeans = new double[_nums];
  for (int i = 0; i < _nums; ++i) {
    names[i + _cats] = train._names[nums[i]];
    tvecs2[i + _cats] = train.vec(nums[i]);
    _numMeans[i] = train.vec(nums[i]).mean();
    _permutation[i + _cats] = nums[i];
  }
  for (int i = names.length - nResponses - (weight ? 1 : 0) - (offset ? 1 : 0) - (fold ? 1 : 0);
      i < names.length;
      ++i) {
    names[i] = train._names[i];
    tvecs2[i] = train.vec(i);
  }
  _adaptedFrame = new Frame(names, tvecs2);
  train.restructure(names, tvecs2);
  if (valid != null) valid.restructure(names, valid.vecs(names)); // _adaptedFrame = train;
  setPredictorTransform(predictor_transform);
  if (_responses > 0) setResponseTransform(response_transform);
}
@Test
public void testSetColumnLossCats() throws InterruptedException, ExecutionException {
  GLRM job = null;
  GLRMModel model = null;
  Frame train = null;
  final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS

  Scope.enter();
  try {
    train = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
    for (int i = 0; i < cats.length; i++)
      Scope.track(train.replace(cats[i], train.vec(cats[i]).toCategoricalVec())._key);
    train.remove("ID").remove();
    DKV.put(train._key, train);

    GLRMParameters parms = new GLRMParameters();
    parms._train = train._key;
    parms._k = 12;
    parms._loss = GLRMParameters.Loss.Quadratic;
    parms._multi_loss = GLRMParameters.Loss.Categorical;
    parms._loss_by_col =
        new GLRMParameters.Loss[] {
          GLRMParameters.Loss.Ordinal, GLRMParameters.Loss.Poisson, GLRMParameters.Loss.Absolute
        };
    parms._loss_by_col_idx = new int[] {3 /* DPROS */, 1 /* AGE */, 6 /* VOL */};
    parms._init = GLRM.Initialization.PlusPlus;
    parms._min_step_size = 1e-5;
    parms._recover_svd = false;
    parms._max_iterations = 2000;

    try {
      job = new GLRM(parms);
      model = job.trainModel().get();
      Log.info(
          "Iteration " + model._output._iterations
              + ": Objective value = " + model._output._objective);
      GLRMTest.checkLossbyCol(parms, model);
      model.score(train).delete();
      ModelMetricsGLRM mm = (ModelMetricsGLRM) ModelMetrics.getFromDKV(model, train);
      Log.info(
          "Numeric Sum of Squared Error = " + mm._numerr
              + "\tCategorical Misclassification Error = " + mm._caterr);
    } catch (Throwable t) {
      t.printStackTrace();
      throw new RuntimeException(t);
    } finally {
      job.remove();
    }
  } catch (Throwable t) {
    t.printStackTrace();
    throw new RuntimeException(t);
  } finally {
    if (train != null) train.delete();
    if (model != null) model.delete();
    Scope.exit();
  }
}
public static void assertValues(Frame f, String[] expValues) { assertValues(f.vec(0), expValues); }
private void chkFr(Frame fr, int col, int row, String exp) {
  String[] dom = fr.vec(col).domain();
  Assert.assertEquals(exp, dom[(int) fr.vec(col).at8(row)]);
}
public Vec getOffsetVec() { return _adaptedFrame.vec(offsetChunkId()); }
/**
 * Train a Deep Learning neural net model
 *
 * @param model Input model (e.g., from initModel(), or from a previous training run)
 * @return Trained model
 */
public final DeepLearningModel trainModel(DeepLearningModel model) {
  Frame validScoreFrame = null;
  Frame train, trainScoreFrame;
  try {
    // if (checkpoint == null && !quiet_mode) logStart(); // if checkpoint is given, some Job's
    // params might be uninitialized (but the restarted model's parameters are correct)
    if (model == null) {
      model = DKV.get(dest()).get();
    }
    Log.info(
        "Model category: "
            + (_parms._autoencoder ? "Auto-Encoder" : isClassifier() ? "Classification" : "Regression"));
    final long model_size = model.model_info().size();
    Log.info("Number of model parameters (weights/biases): " + String.format("%,d", model_size));
    model.write_lock(_job);
    _job.update(0, "Setting up training data...");
    final DeepLearningParameters mp = model.model_info().get_params();

    // temporary frames of the same "name" as the orig _train/_valid (asking the parameter's
    // Key, not the actual frame)
    // Note: don't put into DKV or they would overwrite the _train/_valid frames!
    Frame tra_fr = new Frame(mp._train, _train.names(), _train.vecs());
    Frame val_fr = _valid != null ? new Frame(mp._valid, _valid.names(), _valid.vecs()) : null;

    train = tra_fr;
    if (model._output.isClassifier() && mp._balance_classes) {
      _job.update(0, "Balancing class distribution of training data...");
      float[] trainSamplingFactors =
          new float[train.lastVec().domain().length]; // leave initialized to 0 -> will be filled up below
      if (mp._class_sampling_factors != null) {
        if (mp._class_sampling_factors.length != train.lastVec().domain().length)
          throw new IllegalArgumentException(
              "class_sampling_factors must have "
                  + train.lastVec().domain().length
                  + " elements");
        trainSamplingFactors =
            mp._class_sampling_factors.clone(); // clone: don't modify the original
      }
      train =
          sampleFrameStratified(
              train,
              train.lastVec(),
              train.vec(model._output.weightsName()),
              trainSamplingFactors,
              (long) (mp._max_after_balance_size * train.numRows()),
              mp._seed,
              true,
              false);
      Vec l = train.lastVec();
      Vec w = train.vec(model._output.weightsName());
      MRUtils.ClassDist cd = new MRUtils.ClassDist(l);
      model._output._modelClassDist =
          _weights != null ? cd.doAll(l, w).rel_dist() : cd.doAll(l).rel_dist();
    }
    model.training_rows = train.numRows();
    if (_weights != null && _weights.min() == 0 && _weights.max() == 1 && _weights.isInt()) {
      model.training_rows = Math.round(train.numRows() * _weights.mean());
      Log.warn(
          "Not counting "
              + (train.numRows() - model.training_rows)
              + " rows with weight=0 towards an epoch.");
    }
    Log.info("One epoch corresponds to " + model.training_rows + " training data rows.");
    // training scoring dataset is always sampled uniformly from the training dataset
    trainScoreFrame = sampleFrame(train, mp._score_training_samples, mp._seed);
    if (trainScoreFrame != train) Scope.track(trainScoreFrame);

    if (!_parms._quiet_mode)
      Log.info("Number of chunks of the training data: " + train.anyVec().nChunks());
    if (val_fr != null) {
      model.validation_rows = val_fr.numRows();
      // validation scoring dataset can be sampled in multiple ways from the given validation dataset
      if (model._output.isClassifier()
          && mp._balance_classes
          && mp._score_validation_sampling == DeepLearningParameters.ClassSamplingMethod.Stratified) {
        _job.update(0, "Sampling validation data (stratified)...");
        validScoreFrame =
            sampleFrameStratified(
                val_fr,
                val_fr.lastVec(),
                val_fr.vec(model._output.weightsName()),
                null,
                mp._score_validation_samples > 0 ? mp._score_validation_samples : val_fr.numRows(),
                mp._seed + 1,
                false /* no oversampling */,
                false);
      } else {
        _job.update(0, "Sampling validation data...");
        validScoreFrame = sampleFrame(val_fr, mp._score_validation_samples, mp._seed + 1);
        if (validScoreFrame != val_fr) Scope.track(validScoreFrame);
      }
      if (!_parms._quiet_mode)
        Log.info("Number of chunks of the validation data: " + validScoreFrame.anyVec().nChunks());
    }

    // Set train_samples_per_iteration size (cannot be done earlier since this depends on
    // whether stratified sampling is done)
    model.actual_train_samples_per_iteration =
        computeTrainSamplesPerIteration(mp, model.training_rows, model);
    // Determine whether shuffling is enforced
    if (mp._replicate_training_data
        && (model.actual_train_samples_per_iteration
            == model.training_rows * (mp._single_node_mode ? 1 : H2O.CLOUD.size()))
        && !mp._shuffle_training_data
        && H2O.CLOUD.size() > 1
        && !mp._reproducible) {
      if (!mp._quiet_mode)
        Log.info(
            "Enabling training data shuffling, because all nodes train on the full dataset (replicated training data).");
      mp._shuffle_training_data = true;
    }
    if (!mp._shuffle_training_data
        && model.actual_train_samples_per_iteration == model.training_rows
        && train.anyVec().nChunks() == 1) {
      if (!mp._quiet_mode)
        Log.info(
            "Enabling training data shuffling to avoid training rows in the same order over and over (no Hogwild since there's only 1 chunk).");
      mp._shuffle_training_data = true;
    }

    // if (!mp._quiet_mode) Log.info("Initial model:\n" + model.model_info());
    long now = System.currentTimeMillis();
    model._timeLastIterationEnter = now;
    if (_parms._autoencoder) {
      _job.update(0, "Scoring null model of autoencoder...");
      if (!mp._quiet_mode) Log.info("Scoring the null model of the autoencoder.");
      // get the null model reconstruction error
      model.doScoring(trainScoreFrame, validScoreFrame, _job._key, 0, false);
    }
    // put the initial version of the model into DKV
    model.update(_job);
    model.total_setup_time_ms += now - _job.start_time();
    Log.info("Total setup time: " + PrettyPrint.msecs(model.total_setup_time_ms, true));
    Log.info("Starting to train the Deep Learning model.");
    _job.update(0, "Training...");

    // main loop
    for (; ; ) {
      model.iterations++;
      model.set_model_info(
          mp._epochs == 0
              ? model.model_info()
              : H2O.CLOUD.size() > 1 && mp._replicate_training_data
                  ? (mp._single_node_mode
                      ? new DeepLearningTask2(
                              _job._key,
                              train,
                              model.model_info(),
                              rowFraction(train, mp, model),
                              model.iterations)
                          .doAll(Key.make(H2O.SELF))
                          .model_info() // replicated data + single node mode
                      : new DeepLearningTask2(
                              _job._key,
                              train,
                              model.model_info(),
                              rowFraction(train, mp, model),
                              model.iterations)
                          .doAllNodes()
                          .model_info()) // replicated data + multi-node mode
                  : new DeepLearningTask(
                          _job._key,
                          model.model_info(),
                          rowFraction(train, mp, model),
                          model.iterations)
                      .doAll(train)
                      .model_info()); // distributed data (always in multi-node mode)
      if (stop_requested() && !timeout()) break; // cancellation
      if (!model.doScoring(trainScoreFrame, validScoreFrame, _job._key, model.iterations, false))
        break; // finished training (or early stopping or convergence)
      if (timeout()) break; // stop after scoring
    }

    // replace the model with the best model so far (if it's better)
    if (!stop_requested()
        && _parms._overwrite_with_best_model
        && model.actual_best_model_key != null
        && _parms._nfolds == 0) {
      DeepLearningModel best_model = DKV.getGet(model.actual_best_model_key);
      if (best_model != null
          && best_model.loss() < model.loss()
          && Arrays.equals(best_model.model_info().units, model.model_info().units)) {
        if (!_parms._quiet_mode)
          Log.info("Setting the model to be the best model so far (based on scoring history).");
        DeepLearningModelInfo mi = best_model.model_info().deep_clone();
        // Don't cheat - count full amount of training samples, since that's the amount of
        // training it took to train (without finding anything better)
        mi.set_processed_global(model.model_info().get_processed_global());
        mi.set_processed_local(model.model_info().get_processed_local());
        model.set_model_info(mi);
        model.update(_job);
        model.doScoring(trainScoreFrame, validScoreFrame, _job._key, model.iterations, true);
        assert (best_model.loss() == model.loss());
      }
    }
    // store coefficient names for future use
    // possibly change
    model.model_info().data_info().coefNames();

    if (!_parms._quiet_mode) {
      Log.info(
          "==============================================================================================================================================================================");
      if (stop_requested()) {
        Log.info("Deep Learning model training was interrupted.");
      } else {
        Log.info("Finished training the Deep Learning model.");
        Log.info(model);
      }
      Log.info(
          "==============================================================================================================================================================================");
    }
  } finally {
    if (model != null) {
      model.deleteElasticAverageModels();
      model.unlock(_job);
      if (model.actual_best_model_key != null) {
        assert (model.actual_best_model_key != model._key);
        DKV.remove(model.actual_best_model_key);
      }
    }
  }
  return model;
}
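// Hypothetical driver sketch for trainModel above: configure a job, then block
// for the trained model ("train" is an in-scope Frame; values are placeholders).
DeepLearningParameters p = new DeepLearningParameters();
p._train = train._key;
p._epochs = 10;
DeepLearningModel dlModel = new DeepLearning(p).trainModel().get();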
@Test
public void testExpandCatsProstate() throws InterruptedException, ExecutionException {
  double[][] prostate =
      ard(
          ard(0, 71, 1, 0, 0, 4.8, 14.0, 7),
          ard(1, 70, 1, 1, 0, 8.4, 21.8, 5),
          ard(0, 73, 1, 3, 0, 10.0, 27.4, 6),
          ard(1, 68, 1, 0, 0, 6.7, 16.7, 6));
  double[][] pros_expandR =
      ard(
          ard(1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 71, 4.8, 14.0, 7),
          ard(0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 70, 8.4, 21.8, 5),
          ard(0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 73, 10.0, 27.4, 6),
          ard(1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 68, 6.7, 16.7, 6));
  String[] pros_cols =
      new String[] {"Capsule", "Age", "Race", "Dpros", "Dcaps", "PSA", "Vol", "Gleason"};
  String[][] pros_domains =
      new String[][] {
        new String[] {"No", "Yes"},
        null,
        new String[] {"Other", "White", "Black"},
        new String[] {"None", "UniLeft", "UniRight", "Bilobar"},
        new String[] {"No", "Yes"},
        null,
        null,
        null
      };
  final int[] cats = new int[] {1, 3, 4, 5}; // Categoricals: CAPSULE, RACE, DPROS, DCAPS

  Frame fr = null;
  try {
    Scope.enter();
    fr = parse_test_file(Key.make("prostate.hex"), "smalldata/logreg/prostate.csv");
    for (int i = 0; i < cats.length; i++)
      Scope.track(fr.replace(cats[i], fr.vec(cats[i]).toCategoricalVec())._key);
    fr.remove("ID").remove();
    DKV.put(fr._key, fr);
    DataInfo dinfo =
        new DataInfo(
            Key.make(),
            fr,
            null,
            0,
            true,
            DataInfo.TransformType.NONE,
            DataInfo.TransformType.NONE,
            false,
            false,
            false,
            /* weights */ false,
            /* offset */ false,
            /* fold */ false);

    Log.info("Original matrix:\n" + colFormat(pros_cols, "%8.7s") + ArrayUtils.pprint(prostate));
    double[][] pros_perm = ArrayUtils.permuteCols(prostate, dinfo._permutation);
    Log.info(
        "Permuted matrix:\n"
            + colFormat(pros_cols, "%8.7s", dinfo._permutation)
            + ArrayUtils.pprint(pros_perm));
    double[][] pros_exp = GLRM.expandCats(pros_perm, dinfo);
    Log.info(
        "Expanded matrix:\n"
            + colExpFormat(pros_cols, pros_domains, "%8.7s", dinfo._permutation)
            + ArrayUtils.pprint(pros_exp));
    Assert.assertArrayEquals(pros_expandR, pros_exp);
  } catch (Throwable t) {
    t.printStackTrace();
    throw new RuntimeException(t);
  } finally {
    if (fr != null) fr.delete();
    Scope.exit();
  }
}
public void validate(GLM glm) {
  if (_compute_p_values && _solver != Solver.AUTO && _solver != Solver.IRLSM)
    glm.error(
        "_compute_p_values",
        "P values can only be computed with IRLSM solver, got solver = " + _solver);
  if (_compute_p_values && (_lambda == null || _lambda[0] > 0))
    glm.error(
        "_compute_p_values", "P values can only be computed with NO REGULARIZATION (lambda = 0)");
  if (_compute_p_values && _family == Family.multinomial)
    glm.error("_compute_p_values", "P values are currently not supported for family=multinomial");
  if (_compute_p_values && _non_negative)
    glm.error("_compute_p_values", "P values are currently not supported with non-negative constraints");
  if (_weights_column != null
      && _offset_column != null
      && _weights_column.equals(_offset_column))
    glm.error("_offset_column", "Offset must be different from weights");
  if (_lambda_search)
    if (glm.nFoldCV())
      glm.error(
          "_lambda_search",
          "Lambda search is not currently supported in conjunction with N-fold cross-validation");
  if (_nlambdas == -1) _nlambdas = 100;
  else _exactLambdas = false;
  if (_obj_reg != -1 && _obj_reg <= 0) glm.error("obj_reg", "Must be positive or -1 for default");
  if (_prior != -1 && (_prior <= 0 || _prior >= 1))
    glm.error("_prior", "Prior must be in (exclusive) range (0,1)");
  if (_family != Family.tweedie) {
    glm.hide("_tweedie_variance_power", "Only applicable with Tweedie family");
    glm.hide("_tweedie_link_power", "Only applicable with Tweedie family");
  }
  if (_beta_constraints != null) {
    if (_family == Family.multinomial)
      glm.error("beta_constraints", "beta constraints are not supported for family = multinomial");
    Frame f = _beta_constraints.get();
    if (f == null) glm.error("beta_constraints", "Missing frame for beta constraints");
    Vec v = f.vec("names");
    if (v == null)
      glm.error(
          "beta_constraints",
          "Beta constraints parameter must have names column with valid coefficient names");
    // todo: check the coefficient names
    v = f.vec("upper_bounds");
    if (v != null && !v.isNumeric())
      glm.error("beta_constraints", "upper_bounds must be numeric if present");
    v = f.vec("lower_bounds");
    if (v != null && !v.isNumeric())
      glm.error("beta_constraints", "lower_bounds must be numeric if present");
    v = f.vec("beta_given");
    if (v != null && !v.isNumeric())
      glm.error("beta_constraints", "beta_given must be numeric if present");
    v = f.vec("beta_start");
    if (v != null && !v.isNumeric())
      glm.error("beta_constraints", "beta_start must be numeric if present");
  }
  if (!_lambda_search) {
    glm.hide("_lambda_min_ratio", "only applies if lambda search is on.");
    glm.hide("_nlambdas", "only applies if lambda search is on.");
  }
  if (_link != Link.family_default) { // check we have a compatible link
    switch (_family) {
      case gaussian:
        if (_link != Link.identity && _link != Link.log && _link != Link.inverse)
          throw new IllegalArgumentException(
              "Incompatible link function for selected family. Only identity, log and inverse links are allowed for family=gaussian.");
        break;
      case binomial:
        // fixme: R also allows log, but it's not clear when it can be applied and
        // what we should do in case the predictions are outside of 0/1.
        if (_link != Link.logit)
          throw new IllegalArgumentException(
              "Incompatible link function for selected family. Only logit is allowed for family=binomial. Got " + _link);
        break;
      case poisson:
        if (_link != Link.log && _link != Link.identity)
          throw new IllegalArgumentException(
              "Incompatible link function for selected family. Only log and identity links are allowed for family=poisson.");
        break;
      case gamma:
        if (_link != Link.inverse && _link != Link.log && _link != Link.identity)
          throw new IllegalArgumentException(
              "Incompatible link function for selected family. Only inverse, log and identity links are allowed for family=gamma.");
        break;
      case tweedie:
        if (_link != Link.tweedie)
          throw new IllegalArgumentException(
              "Incompatible link function for selected family. Only tweedie link allowed for family=tweedie.");
        break;
      case multinomial:
        if (_link != Link.multinomial)
          throw new IllegalArgumentException(
              "Incompatible link function for selected family. Only multinomial link allowed for family=multinomial.");
        break;
      default:
        H2O.fail();
    }
  }
}
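// Illustrative parameter combination that the link/family check above rejects
// ("glm" is an in-scope GLM builder; values are placeholders):
GLMParameters bad = new GLMParameters();
bad._family = Family.poisson;
bad._link = Link.inverse;
bad.validate(glm); // throws: only log and identity links are allowed for family=poisson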
private void chkFr(Frame fr, int col, int row, double exp, double tol) {
  if (Double.isNaN(exp)) Assert.assertTrue(fr.vec(col).isNA(row));
  else Assert.assertEquals(exp, fr.vec(col).at(row), tol);
}