@Override
Val apply(Env env, Env.StackHelp stk, AST asts[]) {
  Val v = stk.track(asts[1].exec(env));
  if (v instanceof ValRow) {
    ValRow vv = (ValRow) v;
    return vv.slice(asts[2].columns(vv._names));
  }
  Frame fr = v.getFrame();
  int[] cols = asts[2].columns(fr.names());

  Frame fr2 = new Frame();
  if (cols.length == 0) { // Empty inclusion list?
  } else if (cols[0] >= 0) { // Positive (inclusion) list
    if (cols[cols.length - 1] >= fr.numCols()) // Valid indices run 0..numCols()-1
      throw new IllegalArgumentException(
          "Column must be an integer from 0 to " + (fr.numCols() - 1));
    for (int col : cols) fr2.add(fr.names()[col], fr.vecs()[col]);
  } else { // Negative (exclusion) list
    fr2 = new Frame(fr); // All of them at first
    Arrays.sort(cols); // This loop depends on the values in sorted order
    for (int col : cols)
      if (0 <= -col - 1 && -col - 1 < fr.numCols()) fr2.remove(-col - 1); // Remove named column
  }
  return new ValFrame(fr2);
}
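// Worked example (illustrative, derived from the code above): on a 4-column
// frame, the exclusion list {-1, -3} means "all but columns 0 and 2". After
// Arrays.sort the list is {-3, -1}, so the loop removes index -(-3)-1 = 2
// first and -(-1)-1 = 0 second; removing the higher index first keeps the
// lower index valid after the remaining columns shift left.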
@Override
public ValFrame apply(Env env, Env.StackHelp stk, AstRoot asts[]) {
  Frame f = stk.track(asts[1].exec(env)).getFrame();
  AstRoot axisAR = asts[2];
  for (Vec v : f.vecs()) {
    if (v.isCategorical() || v.isString() || v.isUUID())
      throw new IllegalArgumentException(
          "Cumulative functions not applicable to enum, string, or UUID values");
  }
  double axis = axisAR.exec(env).getNum();
  if (axis != 1.0 && axis != 0.0) throw new IllegalArgumentException("Axis must be 0 or 1");
  if (f.numCols() == 1) {
    if (axis == 0.0) {
      AstCumu.CumuTask t = new AstCumu.CumuTask(f.anyVec().nChunks(), init());
      t.doAll(new byte[] {Vec.T_NUM}, f.anyVec());
      final double[] chkCumu = t._chkCumu;
      Vec cumuVec = t.outputFrame().anyVec();
      new MRTask() {
        @Override
        public void map(Chunk c) {
          if (c.cidx() != 0) {
            double d = chkCumu[c.cidx() - 1];
            for (int i = 0; i < c._len; ++i) c.set(i, op(c.atd(i), d));
          }
        }
      }.doAll(cumuVec);
      return new ValFrame(new Frame(cumuVec));
    } else {
      return new ValFrame(new Frame(f));
    }
  } else {
    if (axis == 0.0) { // down the column implementation
      AstCumu.CumuTaskWholeFrame t =
          new AstCumu.CumuTaskWholeFrame(f.anyVec().nChunks(), init(), f.numCols());
      Frame fr2 = t.doAll(f.numCols(), Vec.T_NUM, f).outputFrame(null, f.names(), null);
      final double[][] chkCumu = t._chkCumu;
      new MRTask() {
        @Override
        public void map(Chunk cs[]) {
          if (cs[0].cidx() != 0) {
            for (int i = 0; i < cs.length; i++) {
              double d = chkCumu[i][cs[i].cidx() - 1];
              for (int j = 0; j < cs[i]._len; ++j) cs[i].set(j, op(cs[i].atd(j), d));
            }
          }
        }
      }.doAll(fr2);
      return new ValFrame(new Frame(fr2));
    } else {
      AstCumu.CumuTaskAxis1 t = new AstCumu.CumuTaskAxis1(init());
      Frame fr2 = t.doAll(f.numCols(), Vec.T_NUM, f).outputFrame(null, f.names(), null);
      return new ValFrame(new Frame(fr2));
    }
  }
}
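// Worked example (illustrative) of the chunk-offset pass above, for a
// cumulative sum over [1, 2 | 3, 4] stored as two chunks: the first pass
// produces per-chunk cumulatives [1, 3 | 3, 7] and, assuming _chkCumu holds
// the running totals of all preceding chunks, _chkCumu = {3, 10}. The second
// MRTask then applies op(x, 3) to every entry of chunk 1, yielding the global
// cumulative sum [1, 3, 6, 10]; chunk 0 is already correct and is skipped.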
@Test
public void testDomains() {
  Frame frame = parse_test_file("smalldata/junit/weather.csv");
  for (String s : new String[] {"MaxWindSpeed", "RelHumid9am", "Cloud9am"}) {
    Vec v = frame.vec(s);
    Vec newV = v.toCategoricalVec();
    frame.remove(s);
    frame.add(s, newV);
    v.remove();
  }
  DKV.put(frame);
  AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
  parms._train = frame._key;
  parms._radius_scale = 10;
  AggregatorModel agg = new Aggregator(parms).trainModel().get();
  Frame output = agg._output._output_frame.get();
  Assert.assertTrue(output.numRows() < 0.5 * frame.numRows());
  boolean same = true;
  for (int i = 0; i < frame.numCols(); ++i) {
    if (frame.vec(i).isCategorical()) {
      same = (frame.domains()[i].length == output.domains()[i].length);
      if (!same) break;
    }
  }
  frame.remove();
  output.remove();
  agg.remove();
  Assert.assertFalse(same);
}
private static void assertRowFrameEquals(double[] expected, Frame actual) {
  assertEquals(1, actual.numRows());
  assertEquals(expected.length, actual.numCols());
  for (int i = 0; i < expected.length; i++) {
    assertEquals("Wrong sum in column " + actual.name(i), expected[i], actual.vec(i).at(0), 1e-8);
  }
}
/**
 * Project each archetype into original feature space
 *
 * @param frame Original training data with m rows and n columns
 * @param destination_key Frame Id for output
 * @param reverse_transform Whether to un-standardize the numeric columns back to their
 *     original scale
 * @return Frame containing k rows and n columns, where each row corresponds to an archetype
 */
public Frame scoreArchetypes(Frame frame, Key destination_key, boolean reverse_transform) {
  final int ncols = _output._names.length;
  Frame adaptedFr = new Frame(frame);
  adaptTestForTrain(adaptedFr, true, false);
  assert ncols == adaptedFr.numCols();
  String[][] adaptedDomme = adaptedFr.domains();
  double[][] proj = new double[_parms._k][_output._nnums + _output._ncats];

  // Categorical columns
  for (int d = 0; d < _output._ncats; d++) {
    double[][] block = _output._archetypes_raw.getCatBlock(d);
    for (int k = 0; k < _parms._k; k++)
      proj[k][_output._permutation[d]] = _parms.mimpute(block[k], _output._lossFunc[d]);
  }

  // Numeric columns
  for (int d = _output._ncats; d < (_output._ncats + _output._nnums); d++) {
    int ds = d - _output._ncats;
    for (int k = 0; k < _parms._k; k++) {
      double num = _output._archetypes_raw.getNum(ds, k);
      proj[k][_output._permutation[d]] = _parms.impute(num, _output._lossFunc[d]);
      if (reverse_transform)
        proj[k][_output._permutation[d]] =
            proj[k][_output._permutation[d]] / _output._normMul[ds] + _output._normSub[ds];
    }
  }

  // Convert projection of archetypes into a frame with correct domains
  Frame f =
      ArrayUtils.frame(
          (null == destination_key ? Key.make() : destination_key), adaptedFr.names(), proj);
  for (int i = 0; i < ncols; i++) f.vec(i).setDomain(adaptedDomme[i]);
  return f;
}
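// Note on reverse_transform (illustrative): if a numeric column was
// standardized as x' = (x - normSub) * normMul, the division-and-add above
// undoes it via x = x' / normMul + normSub. For example, with normSub = 10
// and normMul = 0.5, an archetype value of 2.0 maps back to 2.0 / 0.5 + 10 = 14.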
private static void assertColFrameEquals(double[] expected, Frame actual) {
  assertEquals(1, actual.numCols());
  assertEquals(expected.length, actual.numRows());
  for (int i = 0; i < expected.length; i++) {
    assertEquals("Wrong sum in row " + i, expected[i], actual.vec(0).at(i), 1e-8);
  }
}
@Override
protected Frame predictScoreImpl(Frame orig, Frame adaptedFr, String destination_key) {
  Frame adaptFrm = new Frame(adaptedFr);
  for (int i = 0; i < _parms._k; i++)
    adaptFrm.add("PC" + String.valueOf(i + 1), adaptFrm.anyVec().makeZero());
  new MRTask() {
    @Override
    public void map(Chunk chks[]) {
      double tmp[] = new double[_output._names.length];
      double preds[] = new double[_parms._k];
      for (int row = 0; row < chks[0]._len; row++) {
        double p[] = score0(chks, row, tmp, preds);
        for (int c = 0; c < preds.length; c++) chks[_output._names.length + c].set(row, p[c]);
      }
    }
  }.doAll(adaptFrm);

  // Return the projection into principal component space
  int x = _output._names.length, y = adaptFrm.numCols();
  Frame f = adaptFrm.extractFrame(x, y); // this will call vec_impl() and we cannot call the delete() below just yet
  f =
      new Frame(
          (null == destination_key ? Key.make() : Key.make(destination_key)), f.names(), f.vecs());
  DKV.put(f);
  makeMetricBuilder(null).makeModelMetrics(this, orig);
  return f;
}
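// Column-window arithmetic (from the code above): the original input columns
// occupy indices 0 .. _output._names.length-1 and the k appended "PC" columns
// occupy indices x .. y-1, so extractFrame(x, y) peels off exactly the
// projection columns while leaving the adapted input frame intact.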
/**
 * Sample rows from a frame. Can be unlucky for small sampling fractions - will continue calling
 * itself until at least 1 row is returned.
 *
 * @param fr Input frame
 * @param rows Approximate number of rows to sample (across all chunks)
 * @param seed Seed for RNG
 * @return Sampled frame
 */
public static Frame sampleFrame(Frame fr, final long rows, final long seed) {
  if (fr == null) return null;
  final float fraction = rows > 0 ? (float) rows / fr.numRows() : 1.f;
  if (fraction >= 1.f) return fr;
  Frame r =
      new MRTask2() {
        @Override
        public void map(Chunk[] cs, NewChunk[] ncs) {
          final Random rng = getDeterRNG(seed + cs[0].cidx());
          int count = 0;
          for (int r = 0; r < cs[0]._len; r++)
            if (rng.nextFloat() < fraction || (count == 0 && r == cs[0]._len - 1)) {
              count++;
              for (int i = 0; i < ncs.length; i++) {
                ncs[i].addNum(cs[i].at0(r));
              }
            }
        }
      }.doAll(fr.numCols(), fr).outputFrame(fr.names(), fr.domains());
  if (r.numRows() == 0) {
    Log.warn(
        "You asked for " + rows + " rows (out of " + fr.numRows()
            + "), but you got none (seed=" + seed + ").");
    Log.warn("Let's try again. You've gotta ask yourself a question: \"Do I feel lucky?\"");
    return sampleFrame(fr, rows, seed + 1);
  }
  return r;
}
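// Hedged usage sketch (illustrative; assumes an initialized H2O cloud, an
// already-parsed Frame, and the sampleFrame helper defined above):
public static Frame sampleTenth(Frame fr) {
  // Roughly 10% of the rows; a fixed seed makes the draw reproducible, and
  // the retry logic above guarantees at least one row comes back.
  return sampleFrame(fr, fr.numRows() / 10, 42L);
}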
/**
 * Annotate the number of columns and rows of the training data set in the job parameter JSON
 *
 * @return JsonObject annotated with num_cols and num_rows of the training data set
 */
@Override
protected JsonObject toJSON() {
  JsonObject jo = super.toJSON();
  if (source != null) {
    jo.getAsJsonObject("source").addProperty("num_cols", source.numCols());
    jo.getAsJsonObject("source").addProperty("num_rows", source.numRows());
  }
  return jo;
}
/**
 * Annotate the number of columns and rows of the validation data set in the job parameter JSON
 *
 * @return JsonObject annotated with num_cols and num_rows of the validation data set
 */
@Override
protected JsonObject toJSON() {
  JsonObject jo = super.toJSON();
  if (validation != null) {
    jo.getAsJsonObject("validation").addProperty("num_cols", validation.numCols());
    jo.getAsJsonObject("validation").addProperty("num_rows", validation.numRows());
  }
  return jo;
}
@Test
public void testColumnwisesumOnEmptyFrame() {
  Frame fr = register(new Frame(Key.<Frame>make()));
  Val val = Rapids.exec("(sumaxis " + fr._key + " 0 0)");
  assertTrue(val instanceof ValFrame);
  Frame res = register(val.getFrame());
  assertEquals(0, res.numCols());
  assertEquals(0, res.numRows());
}
/**
 * This method applies a stacked autoencoders model to a given dataset and makes predictions
 *
 * @param ctxt JavaSparkContext
 * @param deeplearningModel Stacked Autoencoders model
 * @param test Testing dataset as a JavaRDD of labeled points
 * @param mlModel MLModel carrying the feature names and response variable
 * @return JavaPairRDD of (prediction, actual label) pairs
 */
public JavaPairRDD<Double, Double> test(
    JavaSparkContext ctxt,
    final DeepLearningModel deeplearningModel,
    JavaRDD<LabeledPoint> test,
    MLModel mlModel)
    throws MLModelBuilderException {
  Scope.enter();
  if (deeplearningModel == null) {
    throw new MLModelBuilderException("DeeplearningModel is Null");
  }
  int numberOfFeatures = mlModel.getFeatures().size();
  List<Feature> features = mlModel.getFeatures();
  String[] names = new String[numberOfFeatures + 1];
  for (int i = 0; i < numberOfFeatures; i++) {
    names[i] = features.get(i).getName();
  }
  names[numberOfFeatures] = mlModel.getResponseVariable();
  Frame testData = DeeplearningModelUtils.javaRDDToFrame(names, test);
  Frame testDataWithoutLabels = testData.subframe(0, testData.numCols() - 1);
  int numRows = (int) testDataWithoutLabels.numRows();
  Vec predictionsVector = deeplearningModel.score(testDataWithoutLabels).vec(0);
  double[] predictionValues = new double[numRows];
  for (int i = 0; i < numRows; i++) {
    predictionValues[i] = predictionsVector.at(i);
  }
  Vec labelsVector = testData.vec(testData.numCols() - 1);
  double[] labels = new double[numRows];
  for (int i = 0; i < numRows; i++) {
    labels[i] = labelsVector.at(i);
  }
  Scope.exit();
  ArrayList<Tuple2<Double, Double>> tupleList = new ArrayList<Tuple2<Double, Double>>();
  for (int i = 0; i < labels.length; i++) {
    tupleList.add(new Tuple2<Double, Double>(predictionValues[i], labels[i]));
  }
  return ctxt.parallelizePairs(tupleList);
}
// private constructor called by filterExpandedColumns
private DataInfo(
    DataInfo dinfo,
    Frame fr,
    double[] normMul,
    double[] normSub,
    int[][] catLevels,
    int[] catModes) {
  _fullCatOffsets = dinfo._catOffsets;
  if (!dinfo._useAllFactorLevels) {
    _fullCatOffsets = dinfo._catOffsets.clone();
    for (int i = 0; i < _fullCatOffsets.length; ++i)
      _fullCatOffsets[i] += i; // add for the skipped zeros.
  }
  _offset = dinfo._offset;
  _weights = dinfo._weights;
  _fold = dinfo._fold;
  _valid = false;
  _interactions = dinfo._interactions;
  _interactionVecs = dinfo._interactionVecs;
  assert dinfo._predictor_transform != null;
  assert dinfo._response_transform != null;
  _predictor_transform = dinfo._predictor_transform;
  _response_transform = dinfo._response_transform;
  _skipMissing = dinfo._skipMissing;
  _imputeMissing = dinfo._imputeMissing;
  _adaptedFrame = fr;
  _catOffsets = MemoryManager.malloc4(catLevels.length + 1);
  _catMissing = new boolean[catLevels.length];
  Arrays.fill(_catMissing, !(dinfo._imputeMissing || dinfo._skipMissing));
  int s = 0;
  for (int i = 0; i < catLevels.length; ++i) {
    _catOffsets[i] = s;
    s += catLevels[i].length;
  }
  _catLvls = catLevels;
  _catOffsets[_catOffsets.length - 1] = s;
  _responses = dinfo._responses;
  _cats = catLevels.length;
  _nums =
      fr.numCols() - _cats - dinfo._responses
          - (_offset ? 1 : 0) - (_weights ? 1 : 0) - (_fold ? 1 : 0);
  _numOffsets = _nums == 0 ? new int[0] : dinfo._numOffsets.clone();
  int diff = _numOffsets.length > 0 ? _numOffsets[0] - s : 0;
  for (int i = 0; i < _numOffsets.length; i++) // need to shift everyone down by the offset!
    _numOffsets[i] -= diff;
  _useAllFactorLevels = true; // dinfo._useAllFactorLevels;
  _numMeans = new double[_nums];
  _normMul = normMul;
  _normSub = normSub;
  _catModes = catModes;
  for (int i = 0; i < _nums; i++) _numMeans[i] = _adaptedFrame.vec(_cats + i).mean();
}
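// Worked example (illustrative) of the _numOffsets shift above: if the parent
// DataInfo had numeric offsets {7, 8, 9} but the filtered categorical levels
// now end at s = 4, then diff = 7 - 4 = 3 and the loop rebases the offsets to
// {4, 5, 6}, so numeric columns again start right after the categorical block.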
public void setResponseTransform(TransformType t) {
  _response_transform = t;
  if (t == TransformType.NONE) {
    _normRespMul = null;
    _normRespSub = null;
  } else {
    _normRespMul = MemoryManager.malloc8d(_responses);
    _normRespSub = MemoryManager.malloc8d(_responses);
    setTransform(t, _normRespMul, _normRespSub, _adaptedFrame.numCols() - _responses, _responses);
  }
}
@Override
Val apply(Env env, Env.StackHelp stk, AST asts[]) {
  Frame fr = stk.track(asts[1].exec(env)).getFrame();
  if (fr.numCols() == 1 && fr.numRows() == 1) {
    if (fr.anyVec().isNumeric() || fr.anyVec().isBad()) return new ValNum(fr.anyVec().at(0));
    else if (fr.anyVec().isString())
      return new ValStr(fr.anyVec().atStr(new BufferedString(), 0).toString());
    return new ValStr(fr.domains()[0][(int) fr.anyVec().at8(0)]);
  }
  return new ValFrame(fr); // did not flatten
}
@Override
ValFrame apply(Env env, Env.StackHelp stk, AST asts[]) {
  Frame fr = stk.track(asts[1].exec(env)).getFrame();
  double frac = asts[2].exec(env).getNum();
  double nrow = fr.numRows() * frac;
  Vec vecs[] = fr.vecs();
  long[] idxs = new long[fr.numCols()];
  int j = 0;
  for (int i = 0; i < idxs.length; i++) if (vecs[i].naCnt() < nrow) idxs[j++] = i;
  Vec vec = Vec.makeVec(Arrays.copyOf(idxs, j), null, Vec.VectorGroup.VG_LEN1.addVec());
  return new ValFrame(new Frame(vec));
}
// Make vector templates for all output frame vectors
private Vec[][] makeTemplates(Frame dataset, float[] ratios) {
  Vec anyVec = dataset.anyVec();
  final long[][] espcPerSplit = computeEspcPerSplit(anyVec._espc, anyVec.length(), ratios);
  final int num = dataset.numCols(); // number of columns in input frame
  final int nsplits = espcPerSplit.length; // number of splits
  final String[][] domains = dataset.domains(); // domains
  Vec[][] t = new Vec[nsplits][/*num*/]; // resulting vectors for all
  for (int i = 0; i < nsplits; i++) {
    // vectors for i-th split
    t[i] = new Vec(Vec.newKey(), espcPerSplit[i /*-th split*/]).makeZeros(num, domains);
  }
  return t;
}
public static Frame expandDataset(Frame fr, Key destkey) {
  ArrayList<Vec> nvecs = new ArrayList<Vec>();
  ArrayList<Vec> evecs = new ArrayList<Vec>();
  ArrayList<String> eNames = new ArrayList<String>();
  ArrayList<String> nNames = new ArrayList<String>();
  int[] offsets = new int[fr.numCols() + 1];
  Vec[] vecs = fr.vecs();
  int c = 0;
  for (int i = 0; i < fr.numCols(); i++) {
    if (vecs[i].isEnum()) { // expand each enum column into one indicator column per level
      offsets[evecs.size()] = c;
      evecs.add(vecs[i]);
      String name = fr._names[i];
      c += vecs[i]._domain.length;
      for (String s : vecs[i]._domain) eNames.add(name + "." + s);
    } else { // pass non-enum columns through unchanged
      nvecs.add(vecs[i]);
      nNames.add(fr._names[i]);
    }
  }
  offsets[evecs.size()] = c;
  if (evecs.isEmpty()) return fr;
  offsets = Arrays.copyOf(offsets, evecs.size() + 1);

  OneHot ss = new OneHot();
  ss._offsets = offsets;
  int l = offsets[evecs.size()];
  ss.doAll(l, evecs.toArray(new Vec[evecs.size()]));
  Frame fr2 = ss.outputFrame(destkey, eNames.toArray(new String[eNames.size()]), new String[l][]);
  fr2.add(
      new Frame(nNames.toArray(new String[nNames.size()]), nvecs.toArray(new Vec[nvecs.size()])),
      false);
  return fr2;
}
// GLRM scoring is data imputation based on feature domains using reconstructed XY (see Udell
// (2015), Section 5.3)
private Frame reconstruct(
    Frame orig,
    Frame adaptedFr,
    Key destination_key,
    boolean save_imputed,
    boolean reverse_transform) {
  final int ncols = _output._names.length;
  assert ncols == adaptedFr.numCols();
  String prefix = "reconstr_";

  // Need [A,X,P] where A = adaptedFr, X = loading frame, P = imputed frame
  // Note: A is adapted to original training frame, P has columns shuffled so cats come before
  // nums!
  Frame fullFrm = new Frame(adaptedFr);
  Frame loadingFrm = DKV.get(_output._representation_key).get();
  fullFrm.add(loadingFrm);
  String[][] adaptedDomme = adaptedFr.domains();
  for (int i = 0; i < ncols; i++) {
    Vec v = fullFrm.anyVec().makeZero();
    v.setDomain(adaptedDomme[i]);
    fullFrm.add(prefix + _output._names[i], v);
  }
  GLRMScore gs = new GLRMScore(ncols, _parms._k, save_imputed, reverse_transform).doAll(fullFrm);

  // Return the imputed training frame
  int x = ncols + _parms._k, y = fullFrm.numCols();
  Frame f = fullFrm.extractFrame(x, y); // this will call vec_impl() and we cannot call the delete() below just yet
  f = new Frame((null == destination_key ? Key.make() : destination_key), f.names(), f.vecs());
  DKV.put(f);
  gs._mb.makeModelMetrics(GLRMModel.this, orig, null, null); // save error metrics based on imputed data
  return f;
}
// Scalar covariance for 1 row
private ValNum scalar(Frame frx, Frame fry, Mode mode) {
  if (frx.numCols() != fry.numCols())
    throw new IllegalArgumentException(
        "Single rows must have the same number of columns, found "
            + frx.numCols() + " and " + fry.numCols());
  Vec vecxs[] = frx.vecs();
  Vec vecys[] = fry.vecs();
  double xmean = 0, ymean = 0, ncols = frx.numCols(), NACount = 0, xval, yval, ss = 0;
  for (int r = 0; r < ncols; r++) {
    xval = vecxs[r].at(0);
    yval = vecys[r].at(0);
    if (Double.isNaN(xval) || Double.isNaN(yval)) NACount++;
    else {
      xmean += xval;
      ymean += yval;
    }
  }
  xmean /= (ncols - NACount);
  ymean /= (ncols - NACount);
  if (NACount != 0) {
    if (mode.equals(Mode.AllObs))
      throw new IllegalArgumentException("Mode is 'all.obs' but NAs are present");
    if (mode.equals(Mode.Everything)) return new ValNum(Double.NaN);
  }
  for (int r = 0; r < ncols; r++) {
    xval = vecxs[r].at(0);
    yval = vecys[r].at(0);
    if (!(Double.isNaN(xval) || Double.isNaN(yval))) ss += (xval - xmean) * (yval - ymean);
  }
  return new ValNum(ss / (ncols - NACount - 1));
}
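// Worked arithmetic (illustrative): for single rows x = (1, 2, 3) and
// y = (2, 4, 6) with no NAs, xmean = 2, ymean = 4, and
// ss = (1-2)(2-4) + (2-2)(4-4) + (3-2)(6-4) = 4, so the returned covariance
// is ss / (3 - 0 - 1) = 2 — the usual n-1 denominator, with NA pairs
// excluded from both the means and the count.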
// private constructor called by filterExpandedColumns
private DataInfo(
    Key<DataInfo> selfKey,
    Frame fr,
    double[] normMul,
    double[] normSub,
    int[][] catLevels,
    int responses,
    TransformType predictor_transform,
    TransformType response_transform,
    boolean skipMissing,
    boolean imputeMissing,
    boolean weight,
    boolean offset,
    boolean fold) {
  super(selfKey);
  _offset = offset;
  _weights = weight;
  _fold = fold;
  _valid = false;
  assert predictor_transform != null;
  assert response_transform != null;
  _predictor_transform = predictor_transform;
  _response_transform = response_transform;
  _skipMissing = skipMissing;
  _imputeMissing = imputeMissing;
  _adaptedFrame = fr;
  _catOffsets = MemoryManager.malloc4(catLevels.length + 1);
  _catMissing = new int[catLevels.length];
  int s = 0;
  for (int i = 0; i < catLevels.length; ++i) {
    _catOffsets[i] = s;
    s += catLevels[i].length;
  }
  _catLvls = catLevels;
  _catOffsets[_catOffsets.length - 1] = s;
  _responses = responses;
  _cats = catLevels.length;
  _nums =
      fr.numCols() - _cats - responses - (_offset ? 1 : 0) - (_weights ? 1 : 0) - (_fold ? 1 : 0);
  _useAllFactorLevels = true;
  _catModes = new int[_cats];
  _numMeans = new double[_nums];
  _normMul = normMul;
  _normSub = normSub;
  for (int i = 0; i < _cats; i++) _catModes[i] = imputeCat(_adaptedFrame.vec(i));
  for (int i = 0; i < _nums; i++) _numMeans[i] = _adaptedFrame.vec(_cats + i).mean();
}
public static Frame shuffleFramePerChunk(Key outputFrameKey, Frame fr, final long seed) {
  Frame r =
      new MRTask2() {
        @Override
        public void map(Chunk[] cs, NewChunk[] ncs) {
          long[] idx = new long[cs[0]._len];
          for (int r = 0; r < idx.length; ++r) idx[r] = r;
          Utils.shuffleArray(idx, seed);
          for (int r = 0; r < idx.length; ++r) {
            for (int i = 0; i < ncs.length; i++) {
              ncs[i].addNum(cs[i].at0((int) idx[r]));
            }
          }
        }
      }.doAll(fr.numCols(), fr).outputFrame(outputFrameKey, fr.names(), fr.domains());
  return r;
}
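// Note (from the code above): each map call builds an identity index array
// for its own chunk, shuffles it, and emits the chunk's rows in that order,
// so rows are permuted within — never across — chunk boundaries; a global
// shuffle would need an additional cross-chunk pass.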
@Test
public void testQuantile() {
  Frame f = null;
  try {
    Frame fr =
        frame(
            ard(
                ard(1.223292e-02),
                ard(1.635312e-25),
                ard(1.601522e-11),
                ard(8.452298e-10),
                ard(2.643733e-10),
                ard(2.671520e-06),
                ard(1.165381e-06),
                ard(7.193265e-10),
                ard(3.383532e-04),
                ard(2.561221e-05)));
    double[] probs = new double[] {0.001, 0.005, .01, .02, .05, .10, .50, .8883, .90, .99};
    String x =
        String.format("(quantile %%%s %s \"interpolate\")", fr._key, Arrays.toString(probs));
    Val val = Exec.exec(x);
    fr.delete();
    f = val.getFrame();
    Assert.assertEquals(2, f.numCols());
    // Expected values computed as golden values from R's quantile call
    double[] exp =
        ard(
            1.4413698000016206E-13,
            7.206849000001562E-13,
            1.4413698000001489E-12,
            2.882739600000134E-12,
            7.20684900000009E-12,
            1.4413698000000017E-11,
            5.831131148999999E-07,
            3.3669567275300000E-04,
            0.00152780988,
            0.011162408988);
    for (int i = 0; i < exp.length; i++)
      Assert.assertTrue(
          "expected " + exp[i] + " got " + f.vec(1).at(i),
          water.util.MathUtils.compare(exp[i], f.vec(1).at(i), 1e-6, 1e-6));
  } finally {
    if (f != null) f.delete();
  }
}
// ==========================================================================
public void basicGBM(String fname, String hexname, PrepData prep) {
  File file = TestUtil.find_test_file(fname);
  if (file == null) return; // Silently abort test if the file is missing
  Key fkey = NFSFileVec.make(file);
  Key dest = Key.make(hexname);
  GBM gbm = null;
  Frame fr = null;
  try {
    gbm = new GBM();
    gbm.source = fr = ParseDataset2.parse(dest, new Key[] {fkey});
    UKV.remove(fkey);
    int idx = prep.prep(fr);
    if (idx < 0) {
      gbm.classification = false;
      idx = ~idx;
    }
    String rname = fr._names[idx];
    gbm.response = fr.vecs()[idx];
    fr.remove(idx); // Move response to the end
    fr.add(rname, gbm.response);
    gbm.ntrees = 4;
    gbm.max_depth = 4;
    gbm.min_rows = 1;
    gbm.nbins = 50;
    gbm.cols = new int[fr.numCols()];
    for (int i = 0; i < gbm.cols.length; i++) gbm.cols[i] = i;
    gbm.learn_rate = .2f;
    gbm.invoke();
    fr = gbm.score(gbm.source);
    GBM.GBMModel gbmmodel = UKV.get(gbm.dest());
    // System.out.println(gbmmodel.toJava());
  } finally {
    UKV.remove(dest); // Remove original hex frame key
    if (gbm != null) {
      UKV.remove(gbm.dest()); // Remove the model
      UKV.remove(gbm.response._key);
      gbm.remove(); // Remove GBM Job
      if (fr != null) fr.remove();
    }
  }
}
private void applyTrainingFrameSideEffects() {
  int numCols = _modelBuilderTrain.numCols();
  String responseVecName = _modelBuilderTrain.names()[numCols - 1];
  Vec responseVec = _modelBuilderTrain.remove(numCols - 1);
  final boolean use_weights_column = (_parms.weights_column != null);
  final boolean use_start_column = (_parms.start_column != null);

  if (use_weights_column) {
    Vec weightsVec = _parms.weights_column;
    int idxInRawFrame = _train.find(weightsVec);
    if (idxInRawFrame < 0) {
      throw new RuntimeException("CoxPHDriver failed to find weightVec");
    }
    String weightsVecName = _parms.train().names()[idxInRawFrame];
    _modelBuilderTrain.add(weightsVecName, weightsVec);
  }

  if (use_start_column) {
    Vec startVec = _parms.start_column;
    int idxInRawFrame = _train.find(startVec);
    if (idxInRawFrame < 0) {
      throw new RuntimeException("CoxPHDriver failed to find startVec");
    }
    String startVecName = _parms.train().names()[idxInRawFrame];
    _modelBuilderTrain.add(startVecName, startVec);
  }

  { // The stop column is added unconditionally
    Vec stopVec = _parms.stop_column;
    int idxInRawFrame = _train.find(stopVec);
    if (idxInRawFrame < 0) {
      throw new RuntimeException("CoxPHDriver failed to find stopVec");
    }
    String stopVecName = _parms.train().names()[idxInRawFrame];
    _modelBuilderTrain.add(stopVecName, stopVec);
  }

  _modelBuilderTrain.add(responseVecName, responseVec);
}
@Override
Val apply(Env env, Env.StackHelp stk, AST asts[]) {
  QuantileModel.QuantileParameters parms = new QuantileModel.QuantileParameters();
  Frame fr = stk.track(asts[1].exec(env)).getFrame();
  Frame fr_wkey = new Frame(fr); // Force a bogus Key for Quantiles ModelBuilder
  DKV.put(fr_wkey);
  parms._train = fr_wkey._key;
  parms._probs = ((ASTNumList) asts[2]).expand();
  for (double d : parms._probs)
    if (d < 0 || d > 1)
      throw new IllegalArgumentException("Probability must be between 0 and 1: " + d);
  String inter = asts[3].exec(env).getStr();
  parms._combine_method = QuantileModel.CombineMethod.valueOf(inter.toUpperCase());
  parms._weights_column = asts[4].str().equals("_") ? null : asts[4].str();

  // Compute Quantiles
  QuantileModel q = new Quantile(parms).trainModel().get();

  // Remove bogus Key
  DKV.remove(fr_wkey._key);

  // Reshape all outputs as a Frame, with probs in col 0 and the
  // quantiles in cols 1 thru fr.numCols() - except the optional weights vec
  int ncols = fr.numCols();
  if (parms._weights_column != null) ncols--;
  Vec[] vecs = new Vec[1 /*1 more for the probs themselves*/ + ncols];
  String[] names = new String[vecs.length];
  vecs[0] = Vec.makeCon(null, parms._probs);
  names[0] = "Probs";
  int w = 0;
  for (int i = 0; i < vecs.length - 1; ++i) {
    if (fr._names[i].equals(parms._weights_column)) w = 1;
    assert (w == 0 || w == 1);
    vecs[i + 1] = Vec.makeCon(null, q._output._quantiles[i]);
    names[i + 1] = fr._names[w + i] + "Quantiles";
  }
  q.delete();
  return new ValFrame(new Frame(names, vecs));
}
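// Output shape (illustrative, from the reshaping code above): for a 3-column
// input frame with no weights column and probs = {0.25, 0.5}, the result is a
// 2-row frame with columns {"Probs", "<col1>Quantiles", "<col2>Quantiles",
// "<col3>Quantiles"}; a weights column, if given, is skipped via the w offset.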
public ModelMetricsGLRM scoreMetricsOnly(Frame frame) {
  final int ncols = _output._names.length;

  // Need [A,X] where A = adapted test frame, X = loading frame
  // Note: A is adapted to original training frame
  Frame adaptedFr = new Frame(frame);
  adaptTestForTrain(adaptedFr, true, false);
  assert ncols == adaptedFr.numCols();

  // Append loading frame X for calculating XY
  Frame fullFrm = new Frame(adaptedFr);
  Frame loadingFrm = DKV.get(_output._representation_key).get();
  fullFrm.add(loadingFrm);

  GLRMScore gs = new GLRMScore(ncols, _parms._k, false).doAll(fullFrm);
  ModelMetrics mm =
      gs._mb.makeModelMetrics(GLRMModel.this, adaptedFr, null, null); // save error metrics based on imputed data
  return (ModelMetricsGLRM) mm;
}
/**
 * Perform A'x operation with a DRM and an in-core Vector to create a new DRM.
 *
 * @param drmA DRM representing matrix A.
 * @param x in-core Mahout Vector.
 * @return new DRM containing A'x.
 */
public static H2ODrm exec(H2ODrm drmA, Vector x) {
  Frame A = drmA.frame;
  final H2OBCast<Vector> bx = new H2OBCast<Vector>(x);

  // A'x is computed into atx[] with an MRTask on A (with
  // x available as a Broadcast)
  //
  // x.size() == A.numRows()
  // atx.length == chks.length == A.numCols()
  class MRTaskAtx extends MRTask<MRTaskAtx> {
    double atx[];

    public void map(Chunk chks[]) {
      int chunkSize = chks[0].len();
      Vector x = bx.value();
      long start = chks[0].start();
      atx = new double[chks.length];
      for (int r = 0; r < chunkSize; r++) {
        double d = x.getQuick((int) start + r);
        for (int c = 0; c < chks.length; c++) {
          atx[c] += (chks[c].at0(r) * d);
        }
      }
    }

    public void reduce(MRTaskAtx other) {
      ArrayUtils.add(atx, other.atx);
    }
  }

  // Take the result in .atx[], and convert into a Frame
  // using existing helper functions (creating a Matrix
  // along the way for the Helper)
  Vector v = new DenseVector(new MRTaskAtx().doAll(A).atx);
  Matrix m = new DenseMatrix(A.numCols(), 1);
  m.assignColumn(0, v);
  return H2OHelper.drmFromMatrix(m, -1, -1);
}
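// Worked example (illustrative): for A = [[1, 2], [3, 4]] and x = (1, 1),
// each map call accumulates x[r] * A[r][c] into atx[c] for its rows, and
// reduce sums the per-chunk partials, giving A'x = (1*1 + 3*1, 2*1 + 4*1)
// = (4, 6) before the result is wrapped into a one-column DRM.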
/**
 * Initialize the ModelBuilder, validating all arguments and preparing the training frame. This
 * call is expected to be overridden in the subclasses and each subclass will start with
 * "super.init();".
 *
 * <p>Validate K, max_iterations and the number of rows.
 */
@Override
public void init(boolean expensive) {
  super.init(expensive);
  if (_parms._max_iterations < 0 || _parms._max_iterations > 1e6)
    error("_max_iterations", " max_iterations must be between 0 and 1e6");
  if (_train == null) return;
  if (_parms._init == Initialization.User && _parms._user_points == null)
    error("_user_y", "Must specify initial cluster centers");
  if (null != _parms._user_points) { // Check dimensions of user-specified centers
    Frame user_points = _parms._user_points.get();
    if (user_points.numCols() != _train.numCols() - numSpecialCols()) {
      error(
          "_user_y",
          "The user-specified points must have the same number of columns ("
              + (_train.numCols() - numSpecialCols())
              + ") as the training observations");
    } else if (user_points.numRows() != _parms._k)
      error(
          "_user_y",
          "The number of rows in the user-specified points is not equal to k = " + _parms._k);
  }
  if (expensive && error_count() == 0) checkMemoryFootPrint();
}
/**
 * Compute the L2 norm for each row of the frame
 *
 * @param fr Input frame
 * @param scale Per-column scale factors applied before squaring
 * @return Vec containing the L2 norm of each row; the Vec lives in the K-V store
 */
public static Vec getL2(final Frame fr, final double[] scale) {
  // add workspace vec at end
  final int idx = fr.numCols();
  assert (scale.length == idx) : "Mismatch for number of columns";
  fr.add("L2", fr.anyVec().makeZero());
  Vec res;
  try {
    new MRTask2() {
      @Override
      public void map(Chunk[] cs) {
        for (int r = 0; r < cs[0]._len; r++) {
          double norm2 = 0;
          for (int i = 0; i < idx; i++) norm2 += Math.pow(cs[i].at0(r) * scale[i], 2);
          cs[idx].set0(r, Math.sqrt(norm2));
        }
      }
    }.doAll(fr);
  } finally {
    res = fr.remove(idx);
  }
  res.rollupStats();
  return res;
}
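// Hedged usage sketch (illustrative; assumes a numeric Frame is in scope and
// reuses getL2 as defined above). With unit scaling the result is the plain
// row-wise Euclidean norm:
public static Vec rowNorms(Frame fr) {
  double[] unit = new double[fr.numCols()];
  java.util.Arrays.fill(unit, 1.0); // no per-column rescaling
  return getL2(fr, unit);
}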