/** * Delete v1, v2 after potential modifying operations during processing: enums and/or train/test * adaptation. */ private void simpleCMTest( Frame v1, Frame v2, String[] actualDomain, String[] predictedDomain, String[] expectedDomain, double[][] expectedCM, boolean debug, boolean toEnum) { Scope.enter(); try { ConfusionMatrix cm = buildCM(v1.vecs()[0].toEnum(), v2.vecs()[0].toEnum()); // -- DEBUG -- if (debug) { System.err.println("actual : " + Arrays.toString(actualDomain)); System.err.println("predicted : " + Arrays.toString(predictedDomain)); System.err.println("CM domain : " + Arrays.toString(cm._domain)); System.err.println("expected CM domain: " + Arrays.toString(expectedDomain) + "\n"); for (int i = 0; i < cm._cm.length; i++) System.err.println(Arrays.toString(cm._cm[i])); System.err.println(""); System.err.println(cm.toASCII()); } // -- -- -- assertCMEqual(expectedDomain, expectedCM, cm); } finally { if (v1 != null) v1.delete(); if (v2 != null) v2.delete(); Scope.exit(); } }
DataAdapter( Frame fr, SpeeDRFModel model, int[] modelDataMap, int rows, long unique, long seed, int binLimit, double[] classWt) { // assert model._dataKey == fr._key; _seed = seed + (unique << 16); // This is important to preserve sampling selection!!! /* Maximum arity for a column (not a hard limit) */ _numRows = rows; _numClasses = model.regression ? 1 : model.classes(); _regression = model.regression; _c = new Col[model.fr.numCols()]; for (int i = 0; i < _c.length; i++) { assert fr._names[modelDataMap[i]].equals(model.fr._names[i]); Vec v = fr.vecs()[i]; if (isByteCol(v, rows, i == _c.length - 1, _regression)) // we do not bin for small values _c[i] = new Col(fr._names[i], rows, i == _c.length - 1); else _c[i] = new Col(fr._names[i], rows, i == _c.length - 1, binLimit, !(v.isEnum() || v.isInt())); } boolean trivial = true; if (classWt != null) for (double f : classWt) if (f != 1.0) trivial = false; _classWt = trivial ? null : classWt; }
@Override public void compute2() { _in.read_lock(_jobKey); // simply create a bogus new vector (don't even put it into KV) with appropriate number of lines // per chunk and then use it as a source to do multiple makeZero calls // to create empty vecs and than call RebalanceTask on each one of them. // RebalanceTask will fetch the appropriate src chunks and fetch the data from them. int rpc = (int) (_in.numRows() / _nchunks); int rem = (int) (_in.numRows() % _nchunks); long[] espc = new long[_nchunks + 1]; Arrays.fill(espc, rpc); for (int i = 0; i < rem; ++i) ++espc[i]; long sum = 0; for (int i = 0; i < espc.length; ++i) { long s = espc[i]; espc[i] = sum; sum += s; } assert espc[espc.length - 1] == _in.numRows() : "unexpected number of rows, expected " + _in.numRows() + ", got " + espc[espc.length - 1]; final Vec[] srcVecs = _in.vecs(); _out = new Frame( _okey, _in.names(), new Vec(Vec.newKey(), espc).makeZeros(srcVecs.length, _in.domains())); _out.delete_and_lock(_jobKey); new RebalanceTask(this, srcVecs).asyncExec(_out); }
@Override protected Frame predictScoreImpl(Frame orig, Frame adaptedFr, String destination_key) { Frame adaptFrm = new Frame(adaptedFr); for (int i = 0; i < _parms._k; i++) adaptFrm.add("PC" + String.valueOf(i + 1), adaptFrm.anyVec().makeZero()); new MRTask() { @Override public void map(Chunk chks[]) { double tmp[] = new double[_output._names.length]; double preds[] = new double[_parms._k]; for (int row = 0; row < chks[0]._len; row++) { double p[] = score0(chks, row, tmp, preds); for (int c = 0; c < preds.length; c++) chks[_output._names.length + c].set(row, p[c]); } } }.doAll(adaptFrm); // Return the projection into principal component space int x = _output._names.length, y = adaptFrm.numCols(); Frame f = adaptFrm.extractFrame( x, y); // this will call vec_impl() and we cannot call the delete() below just yet f = new Frame( (null == destination_key ? Key.make() : Key.make(destination_key)), f.names(), f.vecs()); DKV.put(f); makeMetricBuilder(null).makeModelMetrics(this, orig); return f; }
/**
 * Copies the rows named by the index chunk {@code ix[0]} out of the {@code _cols} columns of
 * {@code _base} and appends them to the output chunks.
 *
 * <p>Each value read from {@code ix[0]} has 1 subtracted before use. A resulting row below zero
 * is skipped; a row at or beyond the length of {@code _base} produces a row of NaNs.
 *
 * @param ix input chunks; only {@code ix[0]} (the row selector) is read
 * @param ncs output chunks, one per selected column
 */
@Override
public void map(Chunk[] ix, NewChunk[] ncs) {
  final Vec[] vecs = new Vec[_cols.length];
  final Vec anyv = _base.anyVec();
  final long nrow = anyv.length();
  // Seed the chunk cache from the first selector value (used as-is here, without the -1 applied
  // below); an out-of-range value falls back to row 0.
  long r = ix[0].at80(0);
  int last_ci = anyv.elem2ChunkIdx(r < nrow ? r : 0); // memoize the last chunk index
  long last_c0 = anyv._espc[last_ci]; // ... last chunk start
  long last_c1 = anyv._espc[last_ci + 1]; // ... last chunk end
  Chunk[] last_cs = new Chunk[vecs.length]; // ... last chunks
  for (int c = 0; c < _cols.length; c++) {
    vecs[c] = _base.vecs()[_cols[c]];
    last_cs[c] = vecs[c].elem2BV(last_ci);
  }
  for (int i = 0; i < ix[0]._len; i++) { // select one row
    r = ix[0].at80(i) - 1; // next row to select
    if (r < 0) continue; // nothing is emitted for this selector entry
    if (r >= nrow) {
      // Past the end of the base frame: emit a row of NaNs
      for (int c = 0; c < vecs.length; c++) ncs[c].addNum(Double.NaN);
    } else {
      // Re-fetch the cached chunks only when the row falls outside the memoized range
      if (r < last_c0 || r >= last_c1) {
        last_ci = anyv.elem2ChunkIdx(r);
        last_c0 = anyv._espc[last_ci];
        last_c1 = anyv._espc[last_ci + 1];
        for (int c = 0; c < vecs.length; c++) last_cs[c] = vecs[c].elem2BV(last_ci);
      }
      for (int c = 0; c < vecs.length; c++) ncs[c].addNum(last_cs[c].at(r));
    }
  }
}
static Frame exec_str(String str, String id) { Val val = Exec.exec(str); switch (val.type()) { case Val.FRM: Frame fr = val.getFrame(); Key k = Key.make(id); // Smart delete any prior top-level result Iced i = DKV.getGet(k); if (i instanceof Lockable) ((Lockable) i).delete(); else if (i instanceof Keyed) ((Keyed) i).remove(); else if (i != null) throw new IllegalArgumentException("Attempting to overright an unexpected key"); DKV.put(fr = new Frame(k, fr._names, fr.vecs())); System.out.println(fr); checkSaneFrame(); return fr; case Val.NUM: System.out.println("num= " + val.getNum()); assert id == null; checkSaneFrame(); return null; case Val.STR: System.out.println("str= " + val.getStr()); assert id == null; checkSaneFrame(); return null; default: throw water.H2O.fail(); } }
DRFTree(Frame fr, int ncols, char nbins, char nclass, int min_rows, int mtrys, long seed) { super(fr._names, ncols, nbins, nclass, min_rows, seed); _mtrys = mtrys; _rand = createRNG(seed); _seeds = new long[fr.vecs()[0].nChunks()]; for (int i = 0; i < _seeds.length; i++) _seeds[i] = _rand.nextLong(); }
@Override Val apply(Env env, Env.StackHelp stk, AST asts[]) { Val v = stk.track(asts[1].exec(env)); if (v instanceof ValRow) { ValRow vv = (ValRow) v; return vv.slice(asts[2].columns(vv._names)); } Frame fr = v.getFrame(); int[] cols = asts[2].columns(fr.names()); Frame fr2 = new Frame(); if (cols.length == 0) { // Empty inclusion list? } else if (cols[0] >= 0) { // Positive (inclusion) list if (cols[cols.length - 1] > fr.numCols()) throw new IllegalArgumentException( "Column must be an integer from 0 to " + (fr.numCols() - 1)); for (int col : cols) fr2.add(fr.names()[col], fr.vecs()[col]); } else { // Negative (exclusion) list fr2 = new Frame(fr); // All of them at first Arrays.sort(cols); // This loop depends on the values in sorted order for (int col : cols) if (0 <= -col - 1 && -col - 1 < fr.numCols()) fr2.remove(-col - 1); // Remove named column } return new ValFrame(fr2); }
/**
 * Validates the cross-validation settings and prepares the training and validation state: the
 * cross-validation model keys, response name, training vecs, column names and, for
 * classification, the response domains and confusion-matrix domain mappings.
 *
 * @throws UnsupportedOperationException if both a validation frame and a non-zero fold count
 *     are given, or if the fold count is negative
 * @throws IllegalArgumentException if the validation frame lacks the response column
 */
@Override
protected void init() {
  // A validation frame and cross-validation folds are mutually exclusive
  if (validation != null && n_folds != 0)
    throw new UnsupportedOperationException(
        "Cannot specify a validation dataset and non-zero number of cross-validation folds.");
  if (n_folds < 0)
    throw new UnsupportedOperationException(
        "The number of cross-validation folds must be >= 0.");
  super.init();
  // One destination key per fold, derived from this job's destination key
  xval_models = new Key[n_folds];
  for (int i = 0; i < xval_models.length; ++i)
    xval_models[i] = Key.make(dest().toString() + "_xval" + i);

  // Locate the response among the source vecs (by identity); rIndex stays 0 when it is not found
  int rIndex = 0;
  for (int i = 0; i < source.vecs().length; i++)
    if (source.vecs()[i] == response) {
      rIndex = i;
      break;
    }
  _responseName = source._names != null && rIndex >= 0 ? source._names[rIndex] : "response";

  _train = selectVecs(source);
  _names = new String[cols.length];
  for (int i = 0; i < cols.length; i++) _names[i] = source._names[cols[i]];

  // Compute source response domain
  if (classification) _sourceResponseDomain = getVectorDomain(response);
  // Is validation specified?
  if (validation != null) {
    // Extract a validation response
    int idx = validation.find(source.names()[rIndex]);
    if (idx == -1)
      throw new IllegalArgumentException(
          "Validation set does not have a response column called " + _responseName);
    _validResponse = validation.vecs()[idx];
    // Compute output confusion matrix domain for classification:
    // - if validation dataset is specified then CM domain is union of train and validation
    //   response domains
    //   else it is only domain of response column.
    if (classification) {
      _validResponseDomain = getVectorDomain(_validResponse);
      if (_validResponseDomain != null) {
        _cmDomain = Utils.domainUnion(_sourceResponseDomain, _validResponseDomain);
        // Mappings are only built when the two domains actually differ
        if (!Arrays.deepEquals(_sourceResponseDomain, _validResponseDomain)) {
          _fromModel2CM =
              Model.getDomainMapping(
                  _cmDomain,
                  _sourceResponseDomain,
                  false); // transformation from model produced response ~> cmDomain
          _fromValid2CM =
              Model.getDomainMapping(
                  _cmDomain,
                  _validResponseDomain,
                  false); // transformation from validation response domain ~> cmDomain
        }
      } else _cmDomain = _sourceResponseDomain;
    } /* end of if classification */
  } else if (classification) _cmDomain = _sourceResponseDomain;
}
@Override public ValFrame apply(Env env, Env.StackHelp stk, AstRoot asts[]) { Frame f = stk.track(asts[1].exec(env)).getFrame(); AstRoot axisAR = asts[2]; for (Vec v : f.vecs()) { if (v.isCategorical() || v.isString() || v.isUUID()) throw new IllegalArgumentException( "Cumulative functions not applicable to enum, string, or UUID values"); } double axis = axisAR.exec(env).getNum(); if (axis != 1.0 && axis != 0.0) throw new IllegalArgumentException("Axis must be 0 or 1"); if (f.numCols() == 1) { if (axis == 0.0) { AstCumu.CumuTask t = new AstCumu.CumuTask(f.anyVec().nChunks(), init()); t.doAll(new byte[] {Vec.T_NUM}, f.anyVec()); final double[] chkCumu = t._chkCumu; Vec cumuVec = t.outputFrame().anyVec(); new MRTask() { @Override public void map(Chunk c) { if (c.cidx() != 0) { double d = chkCumu[c.cidx() - 1]; for (int i = 0; i < c._len; ++i) c.set(i, op(c.atd(i), d)); } } }.doAll(cumuVec); return new ValFrame(new Frame(cumuVec)); } else { return new ValFrame(new Frame(f)); } } else { if (axis == 0.0) { // down the column implementation AstCumu.CumuTaskWholeFrame t = new AstCumu.CumuTaskWholeFrame(f.anyVec().nChunks(), init(), f.numCols()); Frame fr2 = t.doAll(f.numCols(), Vec.T_NUM, f).outputFrame(null, f.names(), null); final double[][] chkCumu = t._chkCumu; new MRTask() { @Override public void map(Chunk cs[]) { if (cs[0].cidx() != 0) { for (int i = 0; i < cs.length; i++) { double d = chkCumu[i][cs[i].cidx() - 1]; for (int j = 0; j < cs[i]._len; ++j) cs[i].set(j, op(cs[i].atd(j), d)); } } } }.doAll(fr2); return new ValFrame(new Frame(fr2)); } else { AstCumu.CumuTaskAxis1 t = new AstCumu.CumuTaskAxis1(init()); Frame fr2 = t.doAll(f.numCols(), Vec.T_NUM, f).outputFrame(null, f.names(), null); return new ValFrame(new Frame(fr2)); } } }
/**
 * Builds a new frame holding only the columns listed in {@code cols}, in that order.
 *
 * @param frame frame to pick the columns and their names from
 * @return a key-less frame over the selected names and vecs
 */
protected final Frame selectFrame(Frame frame) {
  final int count = cols.length;
  final Vec[] pickedVecs = new Vec[count];
  final String[] pickedNames = new String[count];
  int out = 0;
  for (int src : cols) {
    pickedVecs[out] = frame.vecs()[src];
    pickedNames[out] = frame.names()[src];
    out++;
  }
  return new Frame(pickedNames, pickedVecs);
}
/**
 * Global redistribution of a Frame (balancing of chunks), done by the calling process
 * (all-to-one + one-to-all).
 *
 * <p>The work is only done when the frame has more rows than {@code splits} and either has
 * fewer chunks than {@code splits} or {@code shuffle} is requested; otherwise the input frame
 * is returned untouched. Rows are assigned to output chunks in contiguous blocks of
 * {@code ceil(numRows / splits)} rows. With {@code shuffle} set, the block-to-chunk assignment
 * is permuted using {@code seed}; rows keep their relative order inside a block.
 *
 * @param fr Input frame
 * @param splits number of output chunks to create per column
 * @param seed RNG seed used to permute the output chunk indices when shuffling
 * @param local not read by this method
 * @param shuffle whether to permute which output chunk each block of rows lands in
 * @return a new key-less frame over the rebalanced vecs, or {@code fr} itself when no
 *     rebalancing was needed
 */
public static Frame shuffleAndBalance(
    final Frame fr, int splits, long seed, final boolean local, final boolean shuffle) {
  if ((fr.vecs()[0].nChunks() < splits || shuffle) && fr.numRows() > splits) {
    Vec[] vecs = fr.vecs().clone();
    Log.info("Load balancing dataset, splitting it into up to " + splits + " chunks.");
    // idx[i] is the output chunk that block i is redirected to when shuffling
    long[] idx = null;
    if (shuffle) {
      idx = new long[splits];
      for (int r = 0; r < idx.length; ++r) idx[r] = r;
      Utils.shuffleArray(idx, seed);
    }
    Key keys[] = new Vec.VectorGroup().addVecs(vecs.length);
    final long rows_per_new_chunk = (long) (Math.ceil((double) fr.numRows() / splits));
    // loop over cols (same indexing for each column)
    Futures fs = new Futures();
    for (int col = 0; col < vecs.length; col++) {
      AppendableVec vec = new AppendableVec(keys[col]);
      // create outgoing chunks for this col
      NewChunk[] outCkg = new NewChunk[splits];
      for (int i = 0; i < splits; ++i) outCkg[i] = new NewChunk(vec, i);
      // loop over all incoming chunks
      for (int ckg = 0; ckg < vecs[col].nChunks(); ckg++) {
        final Chunk inCkg = vecs[col].chunkForChunkIdx(ckg);
        // loop over local rows of incoming chunks (fast path)
        for (int row = 0; row < inCkg._len; ++row) {
          // destination chunk idx, derived from the row's global position
          int outCkgIdx = (int) ((inCkg._start + row) / rows_per_new_chunk);
          if (shuffle) outCkgIdx = (int) (idx[outCkgIdx]); // shuffle: choose a different output chunk
          assert (outCkgIdx >= 0 && outCkgIdx < splits);
          outCkg[outCkgIdx].addNum(inCkg.at0(row));
        }
      }
      for (int i = 0; i < outCkg.length; ++i) outCkg[i].close(i, fs);
      Vec t = vec.close(fs);
      // carry the source column's domain over to the rebuilt vec
      t._domain = vecs[col]._domain;
      vecs[col] = t;
    }
    fs.blockForPending();
    Log.info("Load balancing done.");
    return new Frame(fr.names(), vecs);
  }
  return fr;
}
/**
 * Splits {@code dataset} into {@code ratios.length + 1} frames.
 *
 * <p>The dataset is read-locked, one template vec array per split is created, each output frame
 * is created under its destination key and locked, and then one {@code FrameSplitTask} per
 * split is launched asynchronously from a nested completer. Exceptions raised by the split
 * tasks are collected in {@code workersExceptions} rather than propagated.
 */
@Override
public void compute2() {
  // Lock all possible data
  dataset.read_lock(jobKey);
  // Create a template vector for each segment
  final Vec[][] templates = makeTemplates(dataset, ratios);
  final int nsplits = templates.length;
  assert nsplits == ratios.length + 1 : "Unexpected number of split templates!";
  // Launch number of distributed FJ for each split part
  final Vec[] datasetVecs = dataset.vecs();
  splits = new Frame[nsplits];
  for (int s = 0; s < nsplits; s++) {
    Frame split = new Frame(destKeys[s], dataset.names(), templates[s]);
    split.delete_and_lock(jobKey);
    splits[s] = split;
  }
  // One pending child: the launcher task submitted below
  setPendingCount(1);
  H2O.submitTask(
      new H2OCountedCompleter(FrameSplitter.this) {
        @Override
        public void compute2() {
          // One pending child per split task
          setPendingCount(nsplits);
          for (int s = 0; s < nsplits; s++) {
            new FrameSplitTask(
                    new H2OCountedCompleter(this) { // Completer for this task
                      @Override
                      public void compute2() {}

                      @Override
                      public boolean onExceptionalCompletion(
                          Throwable ex, CountedCompleter caller) {
                        // Synchronized on the enclosing FrameSplitter since this can be
                        // reached from different workers; the array grows by one per exception
                        synchronized (FrameSplitter.this) {
                          workersExceptions =
                              workersExceptions != null
                                  ? Arrays.copyOf(workersExceptions, workersExceptions.length + 1)
                                  : new Throwable[1];
                          workersExceptions[workersExceptions.length - 1] = ex;
                        }
                        // The exception has been recorded here, so perform normal completion
                        tryComplete();
                        // Returning false stops the exception from propagating further
                        return false;
                      }
                    },
                    datasetVecs,
                    ratios,
                    s)
                .asyncExec(splits[s]);
          }
          tryComplete(); // complete the computation of nsplits-tasks
        }
      });
  tryComplete(); // complete the computation of thrown tasks
}
// -------------------------------------------------------------------------- // Build an entire layer of all K trees protected DHistogram[][][] buildLayer( final Frame fr, final int nbins, int nbins_cats, final DTree ktrees[], final int leafs[], final DHistogram hcs[][][], boolean subset, boolean build_tree_one_node) { // Build K trees, one per class. // Build up the next-generation tree splits from the current histograms. // Nearly all leaves will split one more level. This loop nest is // O( #active_splits * #bins * #ncols ) // but is NOT over all the data. ScoreBuildOneTree sb1ts[] = new ScoreBuildOneTree[_nclass]; Vec vecs[] = fr.vecs(); for (int k = 0; k < _nclass; k++) { final DTree tree = ktrees[k]; // Tree for class K if (tree == null) continue; // Build a frame with just a single tree (& work & nid) columns, so the // nested MRTask ScoreBuildHistogram in ScoreBuildOneTree does not try // to close other tree's Vecs when run in parallel. Frame fr2 = new Frame(Arrays.copyOf(fr._names, _ncols + 1), Arrays.copyOf(vecs, _ncols + 1)); fr2.add(fr._names[idx_tree(k)], vecs[idx_tree(k)]); fr2.add(fr._names[idx_work(k)], vecs[idx_work(k)]); fr2.add(fr._names[idx_nids(k)], vecs[idx_nids(k)]); if (idx_weight() >= 0) fr2.add(fr._names[idx_weight()], vecs[idx_weight()]); // Start building one of the K trees in parallel H2O.submitTask( sb1ts[k] = new ScoreBuildOneTree( this, k, nbins, nbins_cats, tree, leafs, hcs, fr2, subset, build_tree_one_node, _improvPerVar, _model._parms._distribution)); } // Block for all K trees to complete. boolean did_split = false; for (int k = 0; k < _nclass; k++) { final DTree tree = ktrees[k]; // Tree for class K if (tree == null) continue; sb1ts[k].join(); if (sb1ts[k]._did_split) did_split = true; } // The layer is done. return did_split ? hcs : null; }
/**
 * Returns the indices of the columns whose NA count is below a fraction of the row count.
 *
 * <p>{@code asts[1]} evaluates to the frame and {@code asts[2]} to the fraction. A column is
 * kept when {@code naCnt() < numRows() * fraction}.
 *
 * @return a single-column frame holding the kept column indices
 */
@Override
ValFrame apply(Env env, Env.StackHelp stk, AST asts[]) {
  final Frame fr = stk.track(asts[1].exec(env)).getFrame();
  final double frac = asts[2].exec(env).getNum();
  final double naLimit = fr.numRows() * frac;
  final Vec[] vecs = fr.vecs();

  final long[] candidates = new long[fr.numCols()];
  int kept = 0;
  for (int col = 0; col < candidates.length; col++) {
    if (vecs[col].naCnt() < naLimit) {
      candidates[kept] = col;
      kept++;
    }
  }

  final long[] keptIdxs = Arrays.copyOf(candidates, kept);
  final Vec result = Vec.makeVec(keptIdxs, null, Vec.VectorGroup.VG_LEN1.addVec());
  return new ValFrame(new Frame(result));
}
/**
 * Builds the coefficient names for the adapted frame.
 *
 * <p>Each of the first {@code _cats} columns contributes one {@code name.level} entry per
 * domain level except the first; the remaining slots are filled with the column names that
 * follow the categorical columns.
 *
 * @return array of {@code fullN()} coefficient names
 */
public final String[] coefNames() {
  final int total = fullN();
  final String[] names = new String[total];
  final Vec[] vecs = _adaptedFrame.vecs();

  int next = 0;
  for (int cat = 0; cat < _cats; ++cat) {
    int level = 1; // level 0 is skipped
    while (level < vecs[cat]._domain.length) {
      names[next++] = _adaptedFrame._names[cat] + "." + vecs[cat]._domain[level];
      ++level;
    }
  }

  // Whatever is left is taken verbatim from the columns after the categoricals.
  System.arraycopy(_adaptedFrame._names, _cats, names, next, total - next);
  return names;
}
// Scalar covariance for 1 row private ValNum scalar(Frame frx, Frame fry, Mode mode) { if (frx.numCols() != fry.numCols()) throw new IllegalArgumentException( "Single rows must have the same number of columns, found " + frx.numCols() + " and " + fry.numCols()); Vec vecxs[] = frx.vecs(); Vec vecys[] = fry.vecs(); double xmean = 0, ymean = 0, ncols = frx.numCols(), NACount = 0, xval, yval, ss = 0; for (int r = 0; r < ncols; r++) { xval = vecxs[r].at(0); yval = vecys[r].at(0); if (Double.isNaN(xval) || Double.isNaN(yval)) NACount++; else { xmean += xval; ymean += yval; } } xmean /= (ncols - NACount); ymean /= (ncols - NACount); if (NACount != 0) { if (mode.equals(Mode.AllObs)) throw new IllegalArgumentException("Mode is 'all.obs' but NAs are present"); if (mode.equals(Mode.Everything)) return new ValNum(Double.NaN); } for (int r = 0; r < ncols; r++) { xval = vecxs[r].at(0); yval = vecys[r].at(0); if (!(Double.isNaN(xval) || Double.isNaN(yval))) ss += (vecxs[r].at(0) - xmean) * (vecys[r].at(0) - ymean); } return new ValNum(ss / (ncols - NACount - 1)); }
@Test public void test() { Frame frame = null; try { Futures fs = new Futures(); Random random = new Random(); Vec[] vecs = new Vec[1]; AppendableVec vec = new AppendableVec(Vec.newKey(), Vec.T_NUM); for (int i = 0; i < 2; i++) { NewChunk chunk = new NewChunk(vec, i); for (int r = 0; r < 1000; r++) chunk.addNum(random.nextInt(1000)); chunk.close(i, fs); } vecs[0] = vec.layout_and_close(fs); fs.blockForPending(); frame = new Frame(Key.<Frame>make(), null, vecs); // Make sure we test the multi-chunk case vecs = frame.vecs(); assert vecs[0].nChunks() > 1; long rows = frame.numRows(); Vec v = vecs[0]; double min = Double.POSITIVE_INFINITY, max = Double.NEGATIVE_INFINITY, mean = 0, sigma = 0; for (int r = 0; r < rows; r++) { double d = v.at(r); if (d < min) min = d; if (d > max) max = d; mean += d; } mean /= rows; for (int r = 0; r < rows; r++) { double d = v.at(r); sigma += (d - mean) * (d - mean); } sigma = Math.sqrt(sigma / (rows - 1)); double epsilon = 1e-9; assertEquals(max, v.max(), epsilon); assertEquals(min, v.min(), epsilon); assertEquals(mean, v.mean(), epsilon); assertEquals(sigma, v.sigma(), epsilon); } finally { if (frame != null) frame.delete(); } }
// ========================================================================== public void basicGBM(String fname, String hexname, PrepData prep) { File file = TestUtil.find_test_file(fname); if (file == null) return; // Silently abort test if the file is missing Key fkey = NFSFileVec.make(file); Key dest = Key.make(hexname); GBM gbm = null; Frame fr = null; try { gbm = new GBM(); gbm.source = fr = ParseDataset2.parse(dest, new Key[] {fkey}); UKV.remove(fkey); int idx = prep.prep(fr); if (idx < 0) { gbm.classification = false; idx = ~idx; } String rname = fr._names[idx]; gbm.response = fr.vecs()[idx]; fr.remove(idx); // Move response to the end fr.add(rname, gbm.response); gbm.ntrees = 4; gbm.max_depth = 4; gbm.min_rows = 1; gbm.nbins = 50; gbm.cols = new int[fr.numCols()]; for (int i = 0; i < gbm.cols.length; i++) gbm.cols[i] = i; gbm.learn_rate = .2f; gbm.invoke(); fr = gbm.score(gbm.source); GBM.GBMModel gbmmodel = UKV.get(gbm.dest()); // System.out.println(gbmmodel.toJava()); } finally { UKV.remove(dest); // Remove original hex frame key if (gbm != null) { UKV.remove(gbm.dest()); // Remove the model UKV.remove(gbm.response._key); gbm.remove(); // Remove GBM Job if (fr != null) fr.remove(); } } }
/**
 * Creates a {@code DataInfo} for a validation frame that mirrors this one.
 *
 * <p>The result is constructed from this object's adapted frame and settings with no
 * transformation, then its adapted frame is replaced by the same-named columns taken from
 * {@code valid} and it is flagged as a validation info.
 *
 * @param valid frame to take the validation vecs from, looked up by column name
 * @return the validation {@code DataInfo}
 */
public DataInfo validDinfo(Frame valid) {
  final DataInfo validInfo =
      new DataInfo(
          _adaptedFrame,
          null,
          1,
          _useAllFactorLevels,
          TransformType.NONE,
          TransformType.NONE,
          _skipMissing,
          _imputeMissing,
          false,
          _weights,
          _offset,
          _fold);
  final String[] trainNames = _adaptedFrame.names();
  final Vec[] validVecs = valid.vecs(_adaptedFrame.names());
  validInfo._adaptedFrame = new Frame(trainNames, validVecs);
  validInfo._valid = true;
  return validInfo;
}
/**
 * Returns the coefficient names, computing and caching them on first use.
 *
 * <p>Each categorical column contributes one {@code name.level} entry per level that
 * {@code getCategoricalId} maps to a non-negative id (the first level is skipped unless
 * {@code _useAllFactorLevels} is set), followed by a {@code name.missing(NA)} entry when
 * {@code _catMissing} is positive for that column. The remaining slots hold the names of the
 * columns after the categoricals.
 *
 * @return array of {@code fullN()} coefficient names
 */
public final String[] coefNames() {
  if (_coefNames != null) {
    return _coefNames;
  }
  final int total = fullN();
  final String[] names = new String[total];
  final Vec[] vecs = _adaptedFrame.vecs();

  int next = 0;
  for (int cat = 0; cat < _cats; ++cat) {
    int level = _useAllFactorLevels ? 0 : 1;
    while (level < vecs[cat].domain().length) {
      if (getCategoricalId(cat, level) >= 0) {
        names[next++] = _adaptedFrame._names[cat] + "." + vecs[cat].domain()[level];
      }
      ++level;
    }
    if (_catMissing[cat] > 0) {
      names[next++] = _adaptedFrame._names[cat] + ".missing(NA)";
    }
  }

  final int remaining = total - next;
  System.arraycopy(_adaptedFrame._names, _cats, names, next, remaining);
  _coefNames = names;
  return names;
}
public final String[] coefNames() { if (_coefNames != null) return _coefNames; // already computed int k = 0; final int n = fullN(); // total number of columns to compute String[] res = new String[n]; final Vec[] vecs = _adaptedFrame.vecs(); // first do all of the expanded categorical names for (int i = 0; i < _cats; ++i) { for (int j = (_useAllFactorLevels || vecs[i] instanceof InteractionWrappedVec) ? 0 : 1; j < vecs[i].domain().length; ++j) { int jj = getCategoricalId(i, j); if (jj < 0) continue; res[k++] = _adaptedFrame._names[i] + "." + vecs[i].domain()[j]; } if (_catMissing[i] && getCategoricalId(i, _catModes[i]) >= 0) res[k++] = _adaptedFrame._names[i] + ".missing(NA)"; } // now loop over the numerical columns, collecting up any expanded InteractionVec names if (_interactions == null) { final int nums = n - k; System.arraycopy(_adaptedFrame._names, _cats, res, k, nums); } else { for (int i = _cats; i < _nums; ++i) { InteractionWrappedVec v; if (vecs[i] instanceof InteractionWrappedVec && ((v = (InteractionWrappedVec) vecs[i]).domains() != null)) { // in this case, get the categoricalOffset for (int j = 0; k < v.domains().length; ++j) { if (getCategoricalIdFromInteraction(i, j) < 0) continue; res[k++] = _adaptedFrame._names[i] + "." + v.domains()[j]; } } else res[k++] = _adaptedFrame._names[i]; } } _coefNames = res; return res; }
public static Frame expandDataset(Frame fr, Key destkey) { // , int[] ignored) { ArrayList<Vec> nvecs = new ArrayList<Vec>(); ArrayList<Vec> evecs = new ArrayList<Vec>(); ArrayList<String> eNames = new ArrayList<String>(); ArrayList<String> nNames = new ArrayList<String>(); int[] offsets = new int[fr.numCols() + 1]; Vec[] vecs = fr.vecs(); int c = 0; // int ip = 0; //ignored pointer for (int i = 0; i < fr.numCols(); i++) { if (vecs[i] .isEnum()) { // && i != ignored[ip]) {//!fr._names {//_names[i]. { //equals(ignored)) { offsets[evecs.size()] = c; evecs.add(vecs[i]); String name = fr._names[i]; c += vecs[i]._domain.length; for (String s : vecs[i]._domain) eNames.add(name + "." + s); } else { // if(i == ignored[ip] && ip < ignored.length - 1) ip++; nvecs.add(vecs[i]); nNames.add(fr._names[i]); } } offsets[evecs.size()] = c; if (evecs.isEmpty()) return fr; offsets = Arrays.copyOf(offsets, evecs.size() + 1); OneHot ss = new OneHot(); ss._offsets = offsets; int l = offsets[evecs.size()]; ss.doAll(l, evecs.toArray(new Vec[evecs.size()])); Frame fr2 = ss.outputFrame(destkey, eNames.toArray(new String[eNames.size()]), new String[l][]); fr2.add( new Frame(nNames.toArray(new String[nNames.size()]), nvecs.toArray(new Vec[nvecs.size()])), false); return fr2; }
/**
 * Checks that every vec of every frame among the local keys can be found in the DKV.
 *
 * <p>The first missing vec is reported on {@code System.err} and stops the scan.
 *
 * @return {@code true} if no frame references a missing vec
 */
static boolean checkSaneFrame_impl() {
  for (Key k : H2O.localKeySet()) {
    final Value val = H2O.raw_get(k);
    if (!val.isFrame()) {
      continue;
    }
    final Frame fr = val.get();
    final Vec[] vecs = fr.vecs();
    for (int col = 0; col < vecs.length; col++) {
      final Vec v = vecs[col];
      if (DKV.get(v._key) != null) {
        continue;
      }
      final String report =
          "Frame "
              + fr._key
              + " in the DKV, is missing Vec "
              + v._key
              + ", name="
              + fr._names[col];
      System.err.println(report);
      return false;
    }
  }
  return true;
}
@Override protected void init() { super.init(); int rIndex = 0; for( int i = 0; i < source.vecs().length; i++ ) if( source.vecs()[i] == response ) rIndex = i; _responseName = source._names != null && rIndex >= 0 ? source._names[rIndex] : "response"; _train = selectVecs(source); _names = new String[cols.length]; for( int i = 0; i < cols.length; i++ ) _names[i] = source._names[cols[i]]; // Compute source response domain if (classification) _sourceResponseDomain = getVectorDomain(response); // Is validation specified? if( validation != null ) { // Extract a validation response int idx = validation.find(source.names()[rIndex]); if( idx == -1 ) throw new IllegalArgumentException("Validation set does not have a response column called "+_responseName); _validResponse = validation.vecs()[idx]; // Compute output confusion matrix domain for classification: // - if validation dataset is specified then CM domain is union of train and validation response domains // else it is only domain of response column. if (classification) { _validResponseDomain = getVectorDomain(_validResponse); if (_validResponseDomain!=null) { _cmDomain = Utils.domainUnion(_sourceResponseDomain, _validResponseDomain); if (!Arrays.deepEquals(_sourceResponseDomain, _validResponseDomain)) { _fromModel2CM = Model.getDomainMapping(_cmDomain, _sourceResponseDomain, false); // transformation from model produced response ~> cmDomain _fromValid2CM = Model.getDomainMapping(_cmDomain, _validResponseDomain , false); // transformation from validation response domain ~> cmDomain } } else _cmDomain = _sourceResponseDomain; } /* end of if classification */ } else if (classification) _cmDomain = _sourceResponseDomain; }
/**
 * Picks the vecs of {@code frame} at the positions listed in {@code cols}, in that order.
 *
 * @param frame frame to pick the vecs from
 * @return array with one vec per entry of {@code cols}
 */
protected final Vec[] selectVecs(Frame frame) {
  final Vec[] picked = new Vec[cols.length];
  int out = 0;
  for (int src : cols) {
    picked[out] = frame.vecs()[src];
    out++;
  }
  return picked;
}
/**
 * Tells whether every index in {@code idx} is a valid column position of {@code source}.
 *
 * @param source frame whose vec count bounds the indices
 * @param idx indices to check
 * @return {@code false} as soon as an index is negative or not below the vec count, otherwise
 *     {@code true}
 */
public static boolean checkIdx(Frame source, int[] idx) {
  for (int pos = 0; pos < idx.length; pos++) {
    final int candidate = idx[pos];
    final boolean inRange = candidate >= 0 && candidate < source.vecs().length;
    if (!inRange) {
      return false;
    }
  }
  return true;
}
/**
 * Puts all vectors of the given frame into the local trash, which a user can empty by calling
 * the {@link #emptyLTrash()} method.
 *
 * @param fr frame whose vectors are trashed
 * @see #emptyLTrash()
 */
protected final void ltrash(Frame fr) {
  final Vec[] frameVecs = fr.vecs();
  ltrash(frameVecs);
}
public static Frame[] shuffleSplitFrame( Frame fr, Key[] keys, final double ratios[], final long seed) { // Sanity check the ratios assert keys.length == ratios.length; double sum = ratios[0]; for (int i = 1; i < ratios.length; i++) { sum += ratios[i]; ratios[i] = sum; } assert water.util.MathUtils.equalsWithinOneSmallUlp(sum, 1.0); // Do the split, into ratios.length groupings of NewChunks final int ncols = fr.numCols(); MRTask mr = new MRTask() { @Override public void map(Chunk cs[], NewChunk ncs[]) { Random rng = new Random(seed * cs[0].cidx()); int nrows = cs[0]._len; for (int i = 0; i < nrows; i++) { double r = rng.nextDouble(); int x = 0; // Pick the NewChunk split for (; x < ratios.length - 1; x++) if (r < ratios[x]) break; x *= ncols; // Helper string holder ValueString vstr = new ValueString(); // Copy row to correct set of NewChunks for (int j = 0; j < ncols; j++) { byte colType = cs[j].vec().get_type(); switch (colType) { case Vec.T_BAD: break; /* NOP */ case Vec.T_STR: ncs[x + j].addStr(cs[j], i); break; case Vec.T_UUID: ncs[x + j].addUUID(cs[j], i); break; case Vec.T_NUM: /* fallthrough */ case Vec.T_ENUM: case Vec.T_TIME: ncs[x + j].addNum(cs[j].atd(i)); break; default: if (colType > Vec.T_TIME && colType <= Vec.T_TIMELAST) ncs[x + j].addNum(cs[j].atd(i)); else throw new IllegalArgumentException("Unsupported vector type: " + colType); break; } } } } }.doAll(ncols * ratios.length, fr); // Build output frames Frame frames[] = new Frame[ratios.length]; Vec[] vecs = fr.vecs(); String[] names = fr.names(); Futures fs = new Futures(); for (int i = 0; i < ratios.length; i++) { Vec[] nvecs = new Vec[ncols]; for (int c = 0; c < ncols; c++) { mr.appendables()[i * ncols + c].setDomain(vecs[c].domain()); nvecs[c] = mr.appendables()[i * ncols + c].close(fs); } frames[i] = new Frame(keys[i], fr.names(), nvecs); DKV.put(frames[i], fs); } fs.blockForPending(); return frames; }
/**
 * Copy constructor: builds a frame over clones of {@code fr}'s name array and vec array.
 *
 * <p>Only the two arrays are cloned; their elements are the same objects as in {@code fr}.
 *
 * @param fr frame to copy the names and vecs from
 */
public Frame(Frame fr) {
  this(fr._names.clone(), fr.vecs().clone());
  // Reset _col0 on the copy — presumably a cached first column; TODO confirm
  _col0 = null;
}