@Override Val apply(Env env, Env.StackHelp stk, AST asts[]) {
  Val v = stk.track(asts[1].exec(env));
  if (v instanceof ValRow) {
    ValRow vv = (ValRow) v;
    return vv.slice(asts[2].columns(vv._names));
  }
  Frame fr = v.getFrame();
  int[] cols = asts[2].columns(fr.names());

  Frame fr2 = new Frame();
  if (cols.length == 0) {        // Empty inclusion list?
  } else if (cols[0] >= 0) {     // Positive (inclusion) list
    if (cols[cols.length - 1] >= fr.numCols())  // valid indices are 0 .. numCols()-1
      throw new IllegalArgumentException(
          "Column must be an integer from 0 to " + (fr.numCols() - 1));
    for (int col : cols) fr2.add(fr.names()[col], fr.vecs()[col]);
  } else {                       // Negative (exclusion) list
    fr2 = new Frame(fr);         // All of them at first
    Arrays.sort(cols);           // This loop depends on the values being in sorted order
    for (int col : cols)
      if (0 <= -col - 1 && -col - 1 < fr.numCols())
        fr2.remove(-col - 1);    // Remove named column
  }
  return new ValFrame(fr2);
}
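// Worked example of the exclusion-list semantics above (values are illustrative only):
// to exclude 0-based column i, the list holds -(i+1). For a 4-column frame, cols == {-3, -1}
// excludes columns 2 and 0. After Arrays.sort the list is {-3, -1}, so -col-1 yields 2 first
// and then 0 -- removing the higher index first keeps the lower index valid:
//   Frame fr2 = new Frame(fr);               // columns: C1 C2 C3 C4
//   for (int col : new int[]{-3, -1})
//     fr2.remove(-col - 1);                  // removes C3, then C1 -> fr2 == [C2, C4]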
@Override public ValFrame apply(Env env, Env.StackHelp stk, AstRoot asts[]) {
  Frame f = stk.track(asts[1].exec(env)).getFrame();
  AstRoot axisAR = asts[2];
  for (Vec v : f.vecs()) {
    if (v.isCategorical() || v.isString() || v.isUUID())
      throw new IllegalArgumentException(
          "Cumulative functions are not applicable to enum, string, or UUID values");
  }
  double axis = axisAR.exec(env).getNum();
  if (axis != 1.0 && axis != 0.0) throw new IllegalArgumentException("Axis must be 0 or 1");

  if (f.numCols() == 1) {
    if (axis == 0.0) {
      AstCumu.CumuTask t = new AstCumu.CumuTask(f.anyVec().nChunks(), init());
      t.doAll(new byte[]{Vec.T_NUM}, f.anyVec());
      final double[] chkCumu = t._chkCumu;
      Vec cumuVec = t.outputFrame().anyVec();
      new MRTask() {
        @Override public void map(Chunk c) {
          if (c.cidx() != 0) {
            double d = chkCumu[c.cidx() - 1];  // carry the running total of all earlier chunks
            for (int i = 0; i < c._len; ++i) c.set(i, op(c.atd(i), d));
          }
        }
      }.doAll(cumuVec);
      return new ValFrame(new Frame(cumuVec));
    } else {
      return new ValFrame(new Frame(f));  // axis == 1 on a single column is the identity
    }
  } else {
    if (axis == 0.0) {  // down-the-column implementation
      AstCumu.CumuTaskWholeFrame t =
          new AstCumu.CumuTaskWholeFrame(f.anyVec().nChunks(), init(), f.numCols());
      Frame fr2 = t.doAll(f.numCols(), Vec.T_NUM, f).outputFrame(null, f.names(), null);
      final double[][] chkCumu = t._chkCumu;
      new MRTask() {
        @Override public void map(Chunk cs[]) {
          if (cs[0].cidx() != 0) {
            for (int i = 0; i < cs.length; i++) {
              double d = chkCumu[i][cs[i].cidx() - 1];
              for (int j = 0; j < cs[i]._len; ++j) cs[i].set(j, op(cs[i].atd(j), d));
            }
          }
        }
      }.doAll(fr2);
      return new ValFrame(new Frame(fr2));
    } else {  // across-the-row implementation
      AstCumu.CumuTaskAxis1 t = new AstCumu.CumuTaskAxis1(init());
      Frame fr2 = t.doAll(f.numCols(), Vec.T_NUM, f).outputFrame(null, f.names(), null);
      return new ValFrame(new Frame(fr2));
    }
  }
}
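// A minimal sketch (plain Java, no H2O types) of the two-pass scheme used above for
// axis == 0: pass 1 computes each chunk's local cumulative sum plus a running per-chunk
// total, and pass 2 adds the total of all earlier chunks to every row of each chunk.
static void cumsumChunked(double[][] chunks) {
  double[] chkCumu = new double[chunks.length];
  for (int c = 0; c < chunks.length; c++) {            // pass 1: local cumulative sums
    double acc = 0;
    for (int i = 0; i < chunks[c].length; i++) chunks[c][i] = (acc += chunks[c][i]);
    chkCumu[c] = acc + (c > 0 ? chkCumu[c - 1] : 0);   // running total through chunk c
  }
  for (int c = 1; c < chunks.length; c++) {            // pass 2: carry prior chunks' total
    double d = chkCumu[c - 1];
    for (int i = 0; i < chunks[c].length; i++) chunks[c][i] += d;
  }
}
// e.g. cumsumChunked on {{1, 2}, {3, 4}} yields {{1, 3}, {6, 10}}, the cumsum of [1, 2, 3, 4].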
@Test public void testColumnwiseSumBinaryVec() {
  assertTrue(vc1.isBinary() && !vc2.isBinary());
  Frame fr = register(new Frame(Key.<Frame>make(), ar("C1", "C2"), aro(vc1, vc2)));
  Val val = Rapids.exec("(sumaxis " + fr._key + " 1 0)");
  assertTrue(val instanceof ValFrame);
  Frame res = register(val.getFrame());
  assertArrayEquals(fr.names(), res.names());
  assertArrayEquals(ar(Vec.T_NUM, Vec.T_NUM), res.types());
  assertRowFrameEquals(ard(3.0, Double.NaN), res);
}
@Override protected Frame predictScoreImpl(Frame orig, Frame adaptedFr, String destination_key) {
  Frame adaptFrm = new Frame(adaptedFr);
  for (int i = 0; i < _parms._k; i++)
    adaptFrm.add("PC" + (i + 1), adaptFrm.anyVec().makeZero());

  new MRTask() {
    @Override public void map(Chunk chks[]) {
      double tmp[] = new double[_output._names.length];
      double preds[] = new double[_parms._k];
      for (int row = 0; row < chks[0]._len; row++) {
        double p[] = score0(chks, row, tmp, preds);
        for (int c = 0; c < preds.length; c++)
          chks[_output._names.length + c].set(row, p[c]);
      }
    }
  }.doAll(adaptFrm);

  // Return the projection into principal component space
  int x = _output._names.length, y = adaptFrm.numCols();
  Frame f = adaptFrm.extractFrame(x, y);  // this will call vec_impl() and we cannot call the delete() below just yet
  f = new Frame((null == destination_key ? Key.make() : Key.make(destination_key)), f.names(), f.vecs());
  DKV.put(f);
  makeMetricBuilder(null).makeModelMetrics(this, orig);
  return f;
}
/**
 * Project each archetype into the original feature space.
 *
 * @param frame Original training data with m rows and n columns
 * @param destination_key Frame Id for output
 * @param reverse_transform Whether to reverse the normalization of the numeric columns
 * @return Frame containing k rows and n columns, where each row corresponds to an archetype
 */
public Frame scoreArchetypes(Frame frame, Key destination_key, boolean reverse_transform) {
  final int ncols = _output._names.length;
  Frame adaptedFr = new Frame(frame);
  adaptTestForTrain(adaptedFr, true, false);
  assert ncols == adaptedFr.numCols();
  String[][] adaptedDomme = adaptedFr.domains();
  double[][] proj = new double[_parms._k][_output._nnums + _output._ncats];

  // Categorical columns
  for (int d = 0; d < _output._ncats; d++) {
    double[][] block = _output._archetypes_raw.getCatBlock(d);
    for (int k = 0; k < _parms._k; k++)
      proj[k][_output._permutation[d]] = _parms.mimpute(block[k], _output._lossFunc[d]);
  }

  // Numeric columns
  for (int d = _output._ncats; d < (_output._ncats + _output._nnums); d++) {
    int ds = d - _output._ncats;
    for (int k = 0; k < _parms._k; k++) {
      double num = _output._archetypes_raw.getNum(ds, k);
      proj[k][_output._permutation[d]] = _parms.impute(num, _output._lossFunc[d]);
      if (reverse_transform)
        proj[k][_output._permutation[d]] =
            proj[k][_output._permutation[d]] / _output._normMul[ds] + _output._normSub[ds];
    }
  }

  // Convert the projection of the archetypes into a frame with the correct domains
  Frame f = ArrayUtils.frame(
      (null == destination_key ? Key.make() : destination_key), adaptedFr.names(), proj);
  for (int i = 0; i < ncols; i++) f.vec(i).setDomain(adaptedDomme[i]);
  return f;
}
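// A minimal usage sketch (hedged: `model` and `train` are hypothetical handles to a fitted
// GLRM model and its training frame): project the k archetypes back into the original
// feature space, reversing the numeric normalization.
//   Frame archetypes = model.scoreArchetypes(train, Key.make("archetypes"), true);
//   assert archetypes.numRows() == model._parms._k;   // one row per archetype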
/**
 * Sample rows from a frame. Can be unlucky for small sampling fractions -- will continue
 * calling itself until at least 1 row is returned.
 *
 * @param fr Input frame
 * @param rows Approximate number of rows to sample (across all chunks)
 * @param seed Seed for RNG
 * @return Sampled frame
 */
public static Frame sampleFrame(Frame fr, final long rows, final long seed) {
  if (fr == null) return null;
  final float fraction = rows > 0 ? (float) rows / fr.numRows() : 1.f;
  if (fraction >= 1.f) return fr;
  Frame r = new MRTask2() {
    @Override public void map(Chunk[] cs, NewChunk[] ncs) {
      final Random rng = getDeterRNG(seed + cs[0].cidx());
      int count = 0;
      // Keep each row with probability `fraction`; if nothing was sampled by the last
      // row of the chunk, keep that last row so the chunk is not empty.
      for (int r = 0; r < cs[0]._len; r++)
        if (rng.nextFloat() < fraction || (count == 0 && r == cs[0]._len - 1)) {
          count++;
          for (int i = 0; i < ncs.length; i++) ncs[i].addNum(cs[i].at0(r));
        }
    }
  }.doAll(fr.numCols(), fr).outputFrame(fr.names(), fr.domains());
  if (r.numRows() == 0) {
    Log.warn("You asked for " + rows + " rows (out of " + fr.numRows()
        + "), but you got none (seed=" + seed + ").");
    Log.warn("Let's try again. You've gotta ask yourself a question: \"Do I feel lucky?\"");
    return sampleFrame(fr, rows, seed + 1);
  }
  return r;
}
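// A minimal usage sketch (hedged: the enclosing utility class name is assumed here to be
// MRUtils, as in h2o-classic; adjust to wherever this method actually lives): draw roughly
// 1,000 rows from `fr` deterministically for a given seed.
//   Frame small = MRUtils.sampleFrame(fr, 1000L, 0xDECAFL);
//   System.out.println("sampled " + small.numRows() + " of " + fr.numRows() + " rows");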
@Override public void compute2() {
  _in.read_lock(_jobKey);
  // Simply create a bogus new vector (don't even put it into the KV store) with the
  // appropriate number of lines per chunk, then use it as a source for multiple makeZero
  // calls to create empty vecs, and then run RebalanceTask on each one of them.
  // RebalanceTask will fetch the appropriate src chunks and copy the data from them.
  int rpc = (int) (_in.numRows() / _nchunks);
  int rem = (int) (_in.numRows() % _nchunks);
  long[] espc = new long[_nchunks + 1];
  Arrays.fill(espc, rpc);
  for (int i = 0; i < rem; ++i) ++espc[i];  // spread the remainder over the first `rem` chunks
  // Convert per-chunk row counts into cumulative row offsets
  long sum = 0;
  for (int i = 0; i < espc.length; ++i) {
    long s = espc[i];
    espc[i] = sum;
    sum += s;
  }
  assert espc[espc.length - 1] == _in.numRows()
      : "unexpected number of rows, expected " + _in.numRows() + ", got " + espc[espc.length - 1];
  final Vec[] srcVecs = _in.vecs();
  _out = new Frame(_okey, _in.names(),
      new Vec(Vec.newKey(), espc).makeZeros(srcVecs.length, _in.domains()));
  _out.delete_and_lock(_jobKey);
  new RebalanceTask(this, srcVecs).asyncExec(_out);
}
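// Worked example of the espc (element-start-per-chunk) computation above:
// _in.numRows() == 10, _nchunks == 4 gives rpc == 2, rem == 2, so
//   after fill + remainder:  espc = [3, 3, 2, 2, 2]   (per-chunk row counts; length nchunks+1)
//   after the prefix sum:    espc = [0, 3, 6, 8, 10]  (cumulative offsets; last == numRows)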
@Test public void testColumnwiseSumWithNaRm() {
  Frame fr = register(new Frame(
      Key.<Frame>make(),
      ar("I", "D", "DD", "DN", "T", "S", "C"),
      aro(vi1, vd1, vd2, vd3, vt1, vs1, vc2)));
  Val val = Rapids.exec("(sumaxis " + fr._key + " 1 0)");
  assertTrue(val instanceof ValFrame);
  Frame res = register(val.getFrame());
  assertArrayEquals(fr.names(), res.names());
  assertArrayEquals(
      ar(Vec.T_NUM, Vec.T_NUM, Vec.T_NUM, Vec.T_NUM, Vec.T_TIME, Vec.T_NUM, Vec.T_NUM),
      res.types());
  assertRowFrameEquals(ard(0.0, 20.0, 3.0, 6.0, 50000150.0, Double.NaN, Double.NaN), res);
}
protected final Frame selectFrame(Frame frame) {
  Vec[] vecs = new Vec[cols.length];
  String[] names = new String[cols.length];
  for (int i = 0; i < cols.length; i++) {
    vecs[i] = frame.vecs()[cols[i]];
    names[i] = frame.names()[cols[i]];
  }
  return new Frame(names, vecs);
}
@Override public void compute2() {
  // Lock all possible data
  dataset.read_lock(jobKey);
  // Create a template vector for each segment
  final Vec[][] templates = makeTemplates(dataset, ratios);
  final int nsplits = templates.length;
  assert nsplits == ratios.length + 1 : "Unexpected number of split templates!";
  // Launch a distributed FJ task for each split part
  final Vec[] datasetVecs = dataset.vecs();
  splits = new Frame[nsplits];
  for (int s = 0; s < nsplits; s++) {
    Frame split = new Frame(destKeys[s], dataset.names(), templates[s]);
    split.delete_and_lock(jobKey);
    splits[s] = split;
  }
  setPendingCount(1);
  H2O.submitTask(new H2OCountedCompleter(FrameSplitter.this) {
    @Override public void compute2() {
      setPendingCount(nsplits);
      for (int s = 0; s < nsplits; s++) {
        new FrameSplitTask(new H2OCountedCompleter(this) {  // Completer for this task
          @Override public void compute2() {}
          @Override public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
            synchronized (FrameSplitter.this) {  // synchronized on this since it can be accessed from different workers
              workersExceptions = workersExceptions != null
                  ? Arrays.copyOf(workersExceptions, workersExceptions.length + 1)
                  : new Throwable[1];
              workersExceptions[workersExceptions.length - 1] = ex;
            }
            tryComplete();  // we handle the exception ourselves, so perform normal completion
            return false;
          }
        }, datasetVecs, ratios, s).asyncExec(splits[s]);
      }
      tryComplete();  // complete the computation of the nsplits tasks
    }
  });
  tryComplete();  // complete the computation of the spawned task
}
public DataInfo validDinfo(Frame valid) {
  DataInfo res = new DataInfo(
      _adaptedFrame, null, 1, _useAllFactorLevels,
      TransformType.NONE, TransformType.NONE,
      _skipMissing, _imputeMissing, false,
      _weights, _offset, _fold);
  res._adaptedFrame = new Frame(_adaptedFrame.names(), valid.vecs(_adaptedFrame.names()));
  res._valid = true;
  return res;
}
public static Frame shuffleFramePerChunk(Key outputFrameKey, Frame fr, final long seed) {
  return new MRTask2() {
    @Override public void map(Chunk[] cs, NewChunk[] ncs) {
      long[] idx = new long[cs[0]._len];
      for (int r = 0; r < idx.length; ++r) idx[r] = r;
      Utils.shuffleArray(idx, seed);
      for (int r = 0; r < idx.length; ++r)
        for (int i = 0; i < ncs.length; i++)
          ncs[i].addNum(cs[i].at0((int) idx[r]));
    }
  }.doAll(fr.numCols(), fr).outputFrame(outputFrameKey, fr.names(), fr.domains());
}
/**
 * Global redistribution of a Frame (balancing of chunks), done by calling process
 * (all-to-one + one-to-all).
 *
 * @param fr Input frame
 * @param splits Target number of chunks
 * @param seed RNG seed
 * @param local Not used in this implementation (kept for API compatibility)
 * @param shuffle whether to shuffle the data globally
 * @return Shuffled frame
 */
public static Frame shuffleAndBalance(
    final Frame fr, int splits, long seed, final boolean local, final boolean shuffle) {
  if ((fr.vecs()[0].nChunks() < splits || shuffle) && fr.numRows() > splits) {
    Vec[] vecs = fr.vecs().clone();
    Log.info("Load balancing dataset, splitting it into up to " + splits + " chunks.");
    long[] idx = null;
    if (shuffle) {
      idx = new long[splits];
      for (int r = 0; r < idx.length; ++r) idx[r] = r;
      Utils.shuffleArray(idx, seed);
    }
    Key keys[] = new Vec.VectorGroup().addVecs(vecs.length);
    final long rows_per_new_chunk = (long) (Math.ceil((double) fr.numRows() / splits));
    // loop over cols (same indexing for each column)
    Futures fs = new Futures();
    for (int col = 0; col < vecs.length; col++) {
      AppendableVec vec = new AppendableVec(keys[col]);
      // create outgoing chunks for this col
      NewChunk[] outCkg = new NewChunk[splits];
      for (int i = 0; i < splits; ++i) outCkg[i] = new NewChunk(vec, i);
      // loop over all incoming chunks
      for (int ckg = 0; ckg < vecs[col].nChunks(); ckg++) {
        final Chunk inCkg = vecs[col].chunkForChunkIdx(ckg);
        // loop over local rows of incoming chunks (fast path)
        for (int row = 0; row < inCkg._len; ++row) {
          int outCkgIdx = (int) ((inCkg._start + row) / rows_per_new_chunk);  // destination chunk idx
          if (shuffle) outCkgIdx = (int) (idx[outCkgIdx]);  // shuffle: choose a different output chunk
          assert (outCkgIdx >= 0 && outCkgIdx < splits);
          outCkg[outCkgIdx].addNum(inCkg.at0(row));
        }
      }
      for (int i = 0; i < outCkg.length; ++i) outCkg[i].close(i, fs);
      Vec t = vec.close(fs);
      t._domain = vecs[col]._domain;
      vecs[col] = t;
    }
    fs.blockForPending();
    Log.info("Load balancing done.");
    return new Frame(fr.names(), vecs);
  }
  return fr;
}
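// Worked example of the destination-chunk math above: 100 rows with splits == 8 gives
// rows_per_new_chunk == ceil(100 / 8) == 13, so global rows 0..12 land in chunk 0,
// rows 13..25 in chunk 1, ..., and row 99 lands in chunk 99 / 13 == 7 (the last chunk,
// which holds only the remaining 9 rows).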
private void applyTrainingFrameSideEffects() {
  int numCols = _modelBuilderTrain.numCols();
  String responseVecName = _modelBuilderTrain.names()[numCols - 1];
  Vec responseVec = _modelBuilderTrain.remove(numCols - 1);
  final boolean use_weights_column = (_parms.weights_column != null);
  final boolean use_start_column = (_parms.start_column != null);

  if (use_weights_column) {
    Vec weightsVec = _parms.weights_column;
    int idxInRawFrame = _train.find(weightsVec);
    if (idxInRawFrame < 0) throw new RuntimeException("CoxPHDriver failed to find weightVec");
    String weightsVecName = _parms.train().names()[idxInRawFrame];
    _modelBuilderTrain.add(weightsVecName, weightsVec);
  }

  if (use_start_column) {
    Vec startVec = _parms.start_column;
    int idxInRawFrame = _train.find(startVec);
    if (idxInRawFrame < 0) throw new RuntimeException("CoxPHDriver failed to find startVec");
    String startVecName = _parms.train().names()[idxInRawFrame];
    _modelBuilderTrain.add(startVecName, startVec);
  }

  {
    Vec stopVec = _parms.stop_column;
    int idxInRawFrame = _train.find(stopVec);
    if (idxInRawFrame < 0) throw new RuntimeException("CoxPHDriver failed to find stopVec");
    String stopVecName = _parms.train().names()[idxInRawFrame];
    _modelBuilderTrain.add(stopVecName, stopVec);
  }

  _modelBuilderTrain.add(responseVecName, responseVec);
}
private void applyScoringFrameSideEffects() {
  final int offset_ncol = _parms.offset_columns == null ? 0 : _parms.offset_columns.length;
  if (offset_ncol == 0) return;

  int numCols = _modelBuilderTrain.numCols();
  String responseVecName = _modelBuilderTrain.names()[numCols - 1];
  Vec responseVec = _modelBuilderTrain.remove(numCols - 1);

  for (int i = 0; i < offset_ncol; i++) {
    Vec offsetVec = _parms.offset_columns[i];
    int idxInRawFrame = _train.find(offsetVec);
    if (idxInRawFrame < 0) throw new RuntimeException("CoxPHDriver failed to find offsetVec");
    String offsetVecName = _parms.train().names()[idxInRawFrame];
    _modelBuilderTrain.add(offsetVecName, offsetVec);
  }

  _modelBuilderTrain.add(responseVecName, responseVec);
}
// GLRM scoring is data imputation based on feature domains using the reconstructed XY
// (see Udell (2015), Section 5.3)
private Frame reconstruct(
    Frame orig, Frame adaptedFr, Key destination_key, boolean save_imputed, boolean reverse_transform) {
  final int ncols = _output._names.length;
  assert ncols == adaptedFr.numCols();
  String prefix = "reconstr_";

  // Need [A,X,P] where A = adaptedFr, X = loading frame, P = imputed frame
  // Note: A is adapted to the original training frame, and P has its columns shuffled so
  // cats come before nums!
  Frame fullFrm = new Frame(adaptedFr);
  Frame loadingFrm = DKV.get(_output._representation_key).get();
  fullFrm.add(loadingFrm);
  String[][] adaptedDomme = adaptedFr.domains();
  for (int i = 0; i < ncols; i++) {
    Vec v = fullFrm.anyVec().makeZero();
    v.setDomain(adaptedDomme[i]);
    fullFrm.add(prefix + _output._names[i], v);
  }
  GLRMScore gs = new GLRMScore(ncols, _parms._k, save_imputed, reverse_transform).doAll(fullFrm);

  // Return the imputed training frame
  int x = ncols + _parms._k, y = fullFrm.numCols();
  Frame f = fullFrm.extractFrame(x, y);  // this will call vec_impl() and we cannot call the delete() below just yet
  f = new Frame((null == destination_key ? Key.make() : destination_key), f.names(), f.vecs());
  DKV.put(f);
  gs._mb.makeModelMetrics(GLRMModel.this, orig, null, null);  // save error metrics based on imputed data
  return f;
}
public static Frame[] shuffleSplitFrame(
    Frame fr, Key[] keys, final double ratios[], final long seed) {
  // Sanity-check the ratios and convert them in place to a cumulative distribution
  assert keys.length == ratios.length;
  double sum = ratios[0];
  for (int i = 1; i < ratios.length; i++) {
    sum += ratios[i];
    ratios[i] = sum;
  }
  assert water.util.MathUtils.equalsWithinOneSmallUlp(sum, 1.0);

  // Do the split, into ratios.length groupings of NewChunks
  final int ncols = fr.numCols();
  MRTask mr = new MRTask() {
    @Override public void map(Chunk cs[], NewChunk ncs[]) {
      Random rng = new Random(seed * cs[0].cidx());
      int nrows = cs[0]._len;
      for (int i = 0; i < nrows; i++) {
        double r = rng.nextDouble();
        int x = 0;  // Pick the NewChunk split
        for (; x < ratios.length - 1; x++) if (r < ratios[x]) break;
        x *= ncols;
        // Copy the row to the correct set of NewChunks
        for (int j = 0; j < ncols; j++) {
          byte colType = cs[j].vec().get_type();
          switch (colType) {
            case Vec.T_BAD:  break; /* NOP */
            case Vec.T_STR:  ncs[x + j].addStr(cs[j], i);  break;
            case Vec.T_UUID: ncs[x + j].addUUID(cs[j], i); break;
            case Vec.T_NUM:  /* fallthrough */
            case Vec.T_ENUM:
            case Vec.T_TIME: ncs[x + j].addNum(cs[j].atd(i)); break;
            default:
              if (colType > Vec.T_TIME && colType <= Vec.T_TIMELAST) ncs[x + j].addNum(cs[j].atd(i));
              else throw new IllegalArgumentException("Unsupported vector type: " + colType);
              break;
          }
        }
      }
    }
  }.doAll(ncols * ratios.length, fr);

  // Build output frames
  Frame frames[] = new Frame[ratios.length];
  Vec[] vecs = fr.vecs();
  Futures fs = new Futures();
  for (int i = 0; i < ratios.length; i++) {
    Vec[] nvecs = new Vec[ncols];
    for (int c = 0; c < ncols; c++) {
      mr.appendables()[i * ncols + c].setDomain(vecs[c].domain());
      nvecs[c] = mr.appendables()[i * ncols + c].close(fs);
    }
    frames[i] = new Frame(keys[i], fr.names(), nvecs);
    DKV.put(frames[i], fs);
  }
  fs.blockForPending();
  return frames;
}
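// A minimal usage sketch (hedged: key names are arbitrary): an 80/20 train/test split.
// Note that shuffleSplitFrame mutates `ratios` in place into a cumulative distribution,
// so pass a fresh array each time.
//   Key[] keys = new Key[]{Key.make("train.hex"), Key.make("test.hex")};
//   Frame[] split = shuffleSplitFrame(fr, keys, new double[]{0.8, 0.2}, 42L);
//   Frame train = split[0], test = split[1];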
@Override Val apply(Env env, Env.StackHelp stk, AST asts[]) {
  Frame fr = stk.track(asts[1].exec(env)).getFrame();
  Frame returningFrame;
  long nrows = fr.numRows();
  if (asts[2] instanceof ASTNumList) {
    final ASTNumList nums = (ASTNumList) asts[2];
    long[] rows = nums._isList ? nums.expand8Sort() : null;
    if (rows != null) {
      if (rows.length == 0) {        // Empty inclusion list?
      } else if (rows[0] >= 0) {     // Positive (inclusion) list
        if (rows[rows.length - 1] >= nrows)  // valid rows are 0 .. nrows-1
          throw new IllegalArgumentException("Row must be an integer from 0 to " + (nrows - 1));
      } else {                       // Negative (exclusion) list
        // Invert the list to make a positive list, ignoring out-of-bounds values
        BitSet bs = new BitSet((int) nrows);
        for (int i = 0; i < rows.length; i++) {
          int idx = (int) (-rows[i] - 1);            // The positive index
          if (idx >= 0 && idx < nrows) bs.set(idx);  // Set the rows to EXCLUDE
        }
        rows = new long[(int) nrows - bs.cardinality()];
        for (int i = bs.nextClearBit(0), j = 0; i < nrows; i = bs.nextClearBit(i + 1))
          rows[j++] = i;
      }
    }
    final long[] ls = rows;

    returningFrame = new MRTask() {
      @Override public void map(Chunk[] cs, NewChunk[] ncs) {
        if (nums.cnt() == 0) return;
        long start = cs[0].start();
        long end = start + cs[0]._len;
        long min = ls == null ? (long) nums.min() : ls[0];
        long max = ls == null ? (long) nums.max() - 1 : ls[ls.length - 1];  // exclusive max to inclusive max when stride == 1
        //   [ start, ..., end ]   the chunk
        // 1 []                    nums out left:  nums.max() <  start
        // 2                  []   nums out rite:  nums.min() >  end
        // 3 [ nums ]              nums run left:  nums.min() <  start && nums.max() <= end
        // 4    [ nums ]           nums run in  :  start <= nums.min() && nums.max() <= end
        // 5        [ nums ]       nums run rite:  start <= nums.min() && end <  nums.max()
        if (!(max < start || min > end)) {  // not situation 1 or 2 above
          long startOffset = (min > start ? min : start);  // situations 4 and 5 => min > start
          for (int i = (int) (startOffset - start); i < cs[0]._len; ++i) {
            if ((ls == null && nums.has(start + i))
                || (ls != null && Arrays.binarySearch(ls, start + i) >= 0)) {
              for (int c = 0; c < cs.length; ++c) {
                if (cs[c] instanceof CStrChunk) ncs[c].addStr(cs[c], i);
                else if (cs[c] instanceof C16Chunk) ncs[c].addUUID(cs[c], i);
                else if (cs[c].isNA(i)) ncs[c].addNA();
                else ncs[c].addNum(cs[c].atd(i));
              }
            }
          }
        }
      }
    }.doAll(fr.types(), fr).outputFrame(fr.names(), fr.domains());
  } else if ((asts[2] instanceof ASTNum)) {
    long[] rows = new long[]{(long) (((ASTNum) asts[2])._v.getNum())};
    returningFrame = fr.deepSlice(rows, null);
  } else if ((asts[2] instanceof ASTExec) || (asts[2] instanceof ASTId)) {
    Frame predVec = stk.track(asts[2].exec(env)).getFrame();
    if (predVec.numCols() != 1)
      throw new IllegalArgumentException("Conditional row-slicing expression evaluated to "
          + predVec.numCols() + " columns. Must be a boolean Vec.");
    returningFrame = fr.deepSlice(predVec, null);
  } else
    throw new IllegalArgumentException(
        "Row slicing requires a number-list as the last argument, but found a " + asts[2].getClass());
  return new ValFrame(returningFrame);
}
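// Worked example of the exclusion-list inversion above: nrows == 6 and rows == [-5, -2]
// (row i is encoded as -(i+1), so this excludes 0-based rows 4 and 1). The BitSet gets
// bits {1, 4} set, and walking the clear bits produces the positive list [0, 2, 3, 5],
// which the MRTask then treats as an ordinary inclusion list.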
@Override Val apply(Env env, Env.StackHelp stk, AST asts[]) {
  // Execute all args. Find a canonical frame; all Frames must look like this one.
  // Each argument turns into either a Frame (whose rows are entirely inlined) or
  // a scalar (which is replicated across as a single row).
  Frame fr = null;  // Canonical Frame; all frames have the same column count, types and names
  int nchks = 0;    // Total chunks
  Val vals[] = new Val[asts.length];  // Computed AST results
  for (int i = 1; i < asts.length; i++) {
    vals[i] = stk.track(asts[i].exec(env));
    if (vals[i].isFrame()) {
      fr = vals[i].getFrame();
      nchks += fr.anyVec().nChunks();  // Total chunks
    } else nchks++;                    // One chunk per scalar
  }

  // No Frame, just a pile-o-scalars?
  Vec zz = null;  // The zero-length vec for the zero-frame frame
  if (fr == null) {  // Zero-length, 1-column, default name
    fr = new Frame(new String[]{Frame.defaultColName(0)}, new Vec[]{zz = Vec.makeZero(0)});
    if (asts.length == 1) return new ValFrame(fr);
  }

  // Verify that all Frames have the same columns, names, and types. Domains can vary,
  // and will be unioned.
  final Frame frs[] = new Frame[asts.length];  // Input frames
  final byte[] types = fr.types();             // Column types
  final int ncols = fr.numCols();
  final long[] espc = new long[nchks + 1];     // Compute a new layout!
  int coffset = 0;

  for (int i = 1; i < asts.length; i++) {
    Val val = vals[i];  // Save values computed for pass 2
    Frame fr0 = val.isFrame()
        ? val.getFrame()
        // Scalar: auto-expand into a 1-row frame
        : stk.track(new Frame(fr._names, Vec.makeCons(val.getNum(), 1L, fr.numCols())));

    // Check that all frames are compatible
    if (fr.numCols() != fr0.numCols())
      throw new IllegalArgumentException("rbind frames must have all the same columns, found "
          + fr.numCols() + " and " + fr0.numCols() + " columns.");
    if (!Arrays.deepEquals(fr._names, fr0._names))
      throw new IllegalArgumentException("rbind frames must have all the same column names, found "
          + Arrays.toString(fr._names) + " and " + Arrays.toString(fr0._names));
    if (!Arrays.equals(types, fr0.types()))
      throw new IllegalArgumentException("rbind frames must have all the same column types, found "
          + Arrays.toString(types) + " and " + Arrays.toString(fr0.types()));

    frs[i] = fr0;  // Save frame

    // Roll up the ESPC row counts
    long roffset = espc[coffset];
    long[] espc2 = fr0.anyVec().espc();
    for (int j = 1; j < espc2.length; j++)  // Roll up the row counts
      espc[coffset + j] = (roffset + espc2[j]);
    coffset += espc2.length - 1;  // Chunk offset
  }
  if (zz != null) zz.remove();

  // Build up the new domains for each vec
  HashMap<String, Integer>[] dmap = new HashMap[types.length];
  String[][] domains = new String[types.length][];
  int[][][] cmaps = new int[types.length][][];
  for (int k = 0; k < types.length; ++k) {
    dmap[k] = new HashMap<>();
    int c = 0;
    byte t = types[k];
    if (t == Vec.T_CAT) {
      int[][] maps = new int[frs.length][];
      for (int i = 1; i < frs.length; i++) {
        maps[i] = new int[frs[i].vec(k).domain().length];
        for (int j = 0; j < maps[i].length; j++) {
          String s = frs[i].vec(k).domain()[j];
          if (!dmap[k].containsKey(s)) dmap[k].put(s, maps[i][j] = c++);
          else maps[i][j] = dmap[k].get(s);
        }
      }
      cmaps[k] = maps;
    } else {
      cmaps[k] = new int[frs.length][];
    }
    domains[k] = c == 0 ? null : new String[c];
    for (Map.Entry<String, Integer> e : dmap[k].entrySet())
      domains[k][e.getValue()] = e.getKey();
  }

  // Now make Keys for the new Vecs
  Key<Vec>[] keys = fr.anyVec().group().addVecs(fr.numCols());
  Vec[] vecs = new Vec[fr.numCols()];
  int rowLayout = Vec.ESPC.rowLayout(keys[0], espc);
  for (int i = 0; i < vecs.length; i++)
    vecs[i] = new Vec(keys[i], rowLayout, domains[i], types[i]);

  // Do the row-binds column-by-column. Switch to an F/J thread for continuations.
  ParallelRbinds t;
  H2O.submitTask(t = new ParallelRbinds(frs, espc, vecs, cmaps)).join();
  return new ValFrame(new Frame(fr.names(), t._vecs));
}
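// Worked example of the domain-union mapping built above: rbinding a categorical column
// with domain ["a", "b"] onto one with domain ["b", "c"] yields the union ["a", "b", "c"]
// (in first-seen order), and the second frame's per-level remap is b -> 1, c -> 2, so its
// stored level indices {0, 1} are rewritten to {1, 2} during the row-bind.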
// Internal version with a repeat counter.
// Currently hardcoded to do up to 10 tries to get a row from each class, which can be
// impossible for certain wrong sampling ratios.
private static Frame sampleFrameStratified(
    final Frame fr, Vec label, final float[] sampling_ratios,
    final long seed, final boolean debug, int count) {
  if (fr == null) return null;
  assert (label.isEnum());
  assert (sampling_ratios != null && sampling_ratios.length == label.domain().length);
  final int labelidx = fr.find(label);  // which column is the label?
  assert (labelidx >= 0);
  final boolean poisson = false;  // beta feature

  Frame r = new MRTask2() {
    @Override public void map(Chunk[] cs, NewChunk[] ncs) {
      final Random rng = getDeterRNG(seed + cs[0].cidx());
      for (int r = 0; r < cs[0]._len; r++) {
        if (cs[labelidx].isNA0(r)) continue;  // skip missing labels
        final int label = (int) cs[labelidx].at80(r);
        assert (sampling_ratios.length > label && label >= 0);
        int sampling_reps;
        if (poisson) {
          sampling_reps = Utils.getPoisson(sampling_ratios[label], rng);
        } else {
          // replicate the row floor(ratio) times, plus one more with probability frac(ratio)
          final float remainder = sampling_ratios[label] - (int) sampling_ratios[label];
          sampling_reps = (int) sampling_ratios[label] + (rng.nextFloat() < remainder ? 1 : 0);
        }
        for (int i = 0; i < ncs.length; i++)
          for (int j = 0; j < sampling_reps; ++j)
            ncs[i].addNum(cs[i].at0(r));
      }
    }
  }.doAll(fr.numCols(), fr).outputFrame(fr.names(), fr.domains());

  // Confirm the validity of the distribution
  long[] dist = new ClassDist(r.vecs()[labelidx]).doAll(r.vecs()[labelidx]).dist();
  // If there are no training labels in the test set, there is no point in sampling the test set
  if (dist == null) return fr;
  if (debug) {
    long sumdist = Utils.sum(dist);
    Log.info("After stratified sampling: " + sumdist + " rows.");
    for (int i = 0; i < dist.length; ++i) {
      Log.info("Class " + r.vecs()[labelidx].domain(i)
          + ": count: " + dist[i]
          + " sampling ratio: " + sampling_ratios[i]
          + " actual relative frequency: " + (float) dist[i] / sumdist * dist.length);
    }
  }

  // Re-try if we didn't get at least one example from each class
  if (Utils.minValue(dist) == 0 && count < 10) {
    Log.info("Re-doing stratified sampling because not all classes were represented (unlucky draw).");
    r.delete();
    return sampleFrameStratified(fr, label, sampling_ratios, seed + 1, debug, ++count);
  }

  // Shuffle intra-chunk
  Frame shuffled = shuffleFramePerChunk(r, seed + 0x580FF13);
  r.delete();
  return shuffled;
}
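// Worked example of the replication count above: sampling_ratios[label] == 2.3f gives
// floor == 2 guaranteed copies of the row plus one extra copy with probability 0.3, so
// the expected number of copies is exactly 2.3; a ratio of 0.25f keeps the row with
// probability 0.25 and drops it otherwise.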
/** Score a frame with the given model and return the metrics AND the prediction frame. */
@SuppressWarnings("unused")  // called through reflection by RequestServer
public ModelMetricsListSchemaV3 predict(int version, ModelMetricsListSchemaV3 s) {
  // Parameter checking:
  if (null == s.model) throw new H2OIllegalArgumentException("model", "predict", s.model);
  if (null == DKV.get(s.model.name))
    throw new H2OKeyNotFoundArgumentException("model", "predict", s.model.name);
  if (null == s.frame) throw new H2OIllegalArgumentException("frame", "predict", s.frame);
  if (null == DKV.get(s.frame.name))
    throw new H2OKeyNotFoundArgumentException("frame", "predict", s.frame.name);

  ModelMetricsList parms = s.createAndFillImpl();
  Frame predictions;
  if (!s.reconstruction_error && !s.reconstruction_error_per_feature
      && s.deep_features_hidden_layer < 0 && !s.project_archetypes
      && !s.reconstruct_train && !s.leaf_node_assignment) {
    if (null == parms._predictions_name)
      parms._predictions_name = "predictions" + Key.make().toString().substring(0, 5) + "_"
          + parms._model._key.toString() + "_on_" + parms._frame._key.toString();
    predictions = parms._model.score(parms._frame, parms._predictions_name);
  } else {
    if (Model.DeepFeatures.class.isAssignableFrom(parms._model.getClass())) {
      if (s.reconstruction_error || s.reconstruction_error_per_feature) {
        if (s.deep_features_hidden_layer >= 0)
          throw new H2OIllegalArgumentException(
              "Can only compute either reconstruction error OR deep features.", "");
        if (null == parms._predictions_name)
          parms._predictions_name = "reconstruction_error" + Key.make().toString().substring(0, 5)
              + "_" + parms._model._key.toString() + "_on_" + parms._frame._key.toString();
        predictions = ((Model.DeepFeatures) parms._model).scoreAutoEncoder(
            parms._frame, Key.make(parms._predictions_name), parms._reconstruction_error_per_feature);
      } else {
        if (s.deep_features_hidden_layer < 0)
          throw new H2OIllegalArgumentException("Deep features hidden layer index must be >= 0.", "");
        if (null == parms._predictions_name)
          parms._predictions_name = "deep_features" + Key.make().toString().substring(0, 5)
              + "_" + parms._model._key.toString() + "_on_" + parms._frame._key.toString();
        predictions = ((Model.DeepFeatures) parms._model)
            .scoreDeepFeatures(parms._frame, s.deep_features_hidden_layer);
      }
      predictions = new Frame(Key.make(parms._predictions_name), predictions.names(), predictions.vecs());
      DKV.put(predictions._key, predictions);
    } else if (Model.GLRMArchetypes.class.isAssignableFrom(parms._model.getClass())) {
      if (s.project_archetypes) {
        if (null == parms._predictions_name)
          parms._predictions_name = "reconstructed_archetypes_" + Key.make().toString().substring(0, 5)
              + "_" + parms._model._key.toString() + "_of_" + parms._frame._key.toString();
        predictions = ((Model.GLRMArchetypes) parms._model)
            .scoreArchetypes(parms._frame, Key.make(parms._predictions_name), s.reverse_transform);
      } else {
        assert s.reconstruct_train;
        if (null == parms._predictions_name)
          parms._predictions_name = "reconstruction_" + Key.make().toString().substring(0, 5)
              + "_" + parms._model._key.toString() + "_of_" + parms._frame._key.toString();
        predictions = ((Model.GLRMArchetypes) parms._model)
            .scoreReconstruction(parms._frame, Key.make(parms._predictions_name), s.reverse_transform);
      }
    } else if (Model.LeafNodeAssignment.class.isAssignableFrom(parms._model.getClass())) {
      assert (s.leaf_node_assignment);
      if (null == parms._predictions_name)
        parms._predictions_name = "leaf_node_assignment" + Key.make().toString().substring(0, 5)
            + "_" + parms._model._key.toString() + "_on_" + parms._frame._key.toString();
      predictions = ((Model.LeafNodeAssignment) parms._model)
          .scoreLeafNodeAssignment(parms._frame, Key.make(parms._predictions_name));
    } else
      throw new H2OIllegalArgumentException("Requires a Deep Learning, GLRM, DRF or GBM model.",
          "Model must implement specific methods.");
  }

  ModelMetricsListSchemaV3 mm = this.fetch(version, s);

  // TODO: for now only binary predictors write an MM object.
  // For the others cons one up here to return the predictions frame.
  if (null == mm) mm = new ModelMetricsListSchemaV3();

  mm.predictions_frame = new KeyV3.FrameKeyV3(predictions._key);
  if (parms._leaf_node_assignment)  // don't show metrics when leaf node assignments are made
    mm.model_metrics = null;

  if (null == mm.model_metrics || 0 == mm.model_metrics.length) {
    // There was no response in the test set -> cannot make a model_metrics object
  } else {
    mm.model_metrics[0].predictions = new FrameV3(predictions, 0, 100);  // TODO: Should call schema(version)
  }
  return mm;
}
public DataInfo filterExpandedColumns(int[] cols) {
  assert _predictor_transform != null;
  assert _response_transform != null;
  if (cols == null) return this;
  int i = 0, j = 0, ignoredCnt = 0;
  // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub,
  //                 double [] normMul, double [] normRespSub, double [] normRespMul){
  int[][] catLvls = new int[_cats][];
  int[] ignoredCols = MemoryManager.malloc4(_nums + _cats);
  // first do categoricals...
  if (_catOffsets != null) {
    int coff = _useAllFactorLevels ? 0 : 1;
    while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) {
      int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]);
      int k = 0;
      while (i < cols.length && cols[i] < _catOffsets[j + 1])
        levels[k++] = cols[i++] - _catOffsets[j] + coff;
      if (k > 0) catLvls[j] = Arrays.copyOf(levels, k);
      ++j;
    }
  }
  for (int k = 0; k < catLvls.length; ++k)
    if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
  if (ignoredCnt > 0) {
    int[][] c = new int[_cats - ignoredCnt][];
    int y = 0;
    for (int[] catLvl : catLvls) if (catLvl != null) c[y++] = catLvl;
    assert y == c.length;
    catLvls = c;
  }
  // now numerics
  int prev = j = 0;
  for (; i < cols.length; ++i) {
    for (int k = prev; k < (cols[i] - numStart()); ++k) {
      ignoredCols[ignoredCnt++] = k + _cats;
      ++j;
    }
    prev = ++j;
  }
  for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats;

  Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone());
  if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt));
  assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();

  double[] normSub = null;
  double[] normMul = null;
  int id = Arrays.binarySearch(cols, numStart());
  if (id < 0) id = -id - 1;
  int nnums = cols.length - id;
  int off = numStart();
  if (_normSub != null) {
    normSub = new double[nnums];
    for (int k = id; k < cols.length; ++k) normSub[k - id] = _normSub[cols[k] - off];
  }
  if (_normMul != null) {
    normMul = new double[nnums];
    for (int k = id; k < cols.length; ++k) normMul[k - id] = _normMul[cols[k] - off];
  }
  DataInfo dinfo = new DataInfo(
      _key, f, normMul, normSub, catLvls, _responses,
      _predictor_transform, _response_transform,
      _skipMissing, _imputeMissing, _weights, _offset, _fold);
  // do not put activeData into K/V - active data is recreated on each node based on active columns
  dinfo._activeCols = cols;
  return dinfo;
}
public DataInfo filterExpandedColumns(int[] cols) {
  assert _predictor_transform != null;
  assert _response_transform != null;
  if (cols == null) return deep_clone();
  int hasIcpt = (cols.length > 0 && cols[cols.length - 1] == fullN()) ? 1 : 0;  // intercept column included?
  int i = 0, j = 0, ignoredCnt = 0;
  // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub,
  //                 double [] normMul, double [] normRespSub, double [] normRespMul){
  int[][] catLvls = new int[_cats][];
  int[] ignoredCols = MemoryManager.malloc4(_nums + _cats);
  // first do categoricals...
  if (_catOffsets != null) {
    int coff = _useAllFactorLevels ? 0 : 1;
    while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) {
      int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]);
      int k = 0;
      while (i < cols.length && cols[i] < _catOffsets[j + 1])
        levels[k++] = (cols[i++] - _catOffsets[j]) + coff;
      if (k > 0) catLvls[j] = Arrays.copyOf(levels, k);
      ++j;
    }
  }
  int[] catModes = _catModes;
  for (int k = 0; k < catLvls.length; ++k)
    if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
  if (ignoredCnt > 0) {
    int[][] cs = new int[_cats - ignoredCnt][];
    catModes = new int[_cats - ignoredCnt];
    int y = 0;
    for (int c = 0; c < catLvls.length; ++c)
      if (catLvls[c] != null) {
        catModes[y] = _catModes[c];
        cs[y++] = catLvls[c];
      }
    assert y == cs.length;
    catLvls = cs;
  }
  // now numerics
  int prev = j = 0;
  for (; i < cols.length; ++i) {
    for (int k = prev; k < (cols[i] - numStart()); ++k) {
      ignoredCols[ignoredCnt++] = k + _cats;
      ++j;
    }
    prev = ++j;
  }
  for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats;

  Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone());
  if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt));
  assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();

  double[] normSub = null;
  double[] normMul = null;
  int id = Arrays.binarySearch(cols, numStart());
  if (id < 0) id = -id - 1;
  int nnums = cols.length - id - hasIcpt;
  int off = numStart();
  if (_normSub != null) {
    normSub = new double[nnums];
    for (int k = id; k < (id + nnums); ++k) normSub[k - id] = _normSub[cols[k] - off];
  }
  if (_normMul != null) {
    normMul = new double[nnums];
    for (int k = id; k < (id + nnums); ++k) normMul[k - id] = _normMul[cols[k] - off];
  }
  // public DataInfo(Frame train, Frame valid, int nResponses, boolean useAllFactorLevels,
  //                 TransformType predictor_transform, TransformType response_transform,
  //                 boolean skipMissing, boolean imputeMissing, boolean missingBucket,
  //                 boolean weight, boolean offset, boolean fold) {
  DataInfo dinfo = new DataInfo(this, f, normMul, normSub, catLvls, catModes);
  dinfo._activeCols = cols;
  return dinfo;
}
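// Worked example of the categorical filtering above (values are illustrative only):
// with _catOffsets == [0, 3, 5] (two categorical columns expanded to 3 and 2 indicator
// columns) and _useAllFactorLevels == true (coff == 0), an active-column list
// cols == [1, 2, 4] keeps levels {1, 2} of the first categorical and level {1} of the
// second. A categorical none of whose expanded columns appear in `cols` keeps a null
// entry in catLvls, lands in ignoredCols, and is dropped from the adapted frame.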