public StringBuilder toString(StringBuilder sb, String[] fs, long idx) {
  Vec vecs[] = vecs();
  for (int c = 0; c < fs.length; c++) {
    Vec vec = vecs[c];
    if (vec.isEnum()) {
      String s = "----------";
      if (!vec.isNA(idx)) {
        int x = (int) vec.at8(idx);
        if (x >= 0 && x < vec._domain.length) s = vec._domain[x];
      }
      sb.append(String.format(fs[c], s));
    } else if (vec.isInt()) {
      if (vec.isNA(idx)) {
        Chunk C = vec.elem2BV(0); // 1st Chunk
        int len = C.pformat_len0(); // Printable width
        for (int i = 0; i < len; i++) sb.append('-');
      } else {
        try {
          sb.append(String.format(fs[c], vec.at8(idx)));
        } catch (IllegalFormatException ife) {
          System.out.println("Format: " + fs[c] + " col=" + c + " not for ints");
          ife.printStackTrace();
        }
      }
    } else {
      sb.append(String.format(fs[c], vec.at(idx)));
      if (vec.isNA(idx)) sb.append(' ');
    }
    sb.append(' '); // Column separator
  }
  sb.append('\n');
  return sb;
}
@Override
public void map(Chunk[] ix, NewChunk[] ncs) {
  final Vec[] vecs = new Vec[_cols.length];
  final Vec anyv = _base.anyVec();
  final long nrow = anyv.length();
  long r = ix[0].at80(0);
  int last_ci = anyv.elem2ChunkIdx(r < nrow ? r : 0); // memoize the last chunk index
  long last_c0 = anyv._espc[last_ci]; // ... last chunk start
  long last_c1 = anyv._espc[last_ci + 1]; // ... last chunk end
  Chunk[] last_cs = new Chunk[vecs.length]; // ... last chunks
  for (int c = 0; c < _cols.length; c++) {
    vecs[c] = _base.vecs()[_cols[c]];
    last_cs[c] = vecs[c].elem2BV(last_ci);
  }
  for (int i = 0; i < ix[0]._len; i++) {
    // select one row
    r = ix[0].at80(i) - 1; // next row to select
    if (r < 0) continue;
    if (r >= nrow) {
      for (int c = 0; c < vecs.length; c++) ncs[c].addNum(Double.NaN);
    } else {
      if (r < last_c0 || r >= last_c1) {
        last_ci = anyv.elem2ChunkIdx(r);
        last_c0 = anyv._espc[last_ci];
        last_c1 = anyv._espc[last_ci + 1];
        for (int c = 0; c < vecs.length; c++) last_cs[c] = vecs[c].elem2BV(last_ci);
      }
      for (int c = 0; c < vecs.length; c++) ncs[c].addNum(last_cs[c].at(r));
    }
  }
}
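// Editor's note (added, not part of the original source): requested rows arrive in
// arbitrary order, so the map() above memoizes the chunk window [last_c0, last_c1)
// and only re-resolves chunks when a row falls outside it. Resolving a row via
// elem2ChunkIdx typically means a search over _espc, so this caching keeps the
// common case of clustered row indices cheap.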
public void dropInteractions() { // only called to clean up the InteractionWrappedVecs!
  if (_interactions != null) {
    Vec[] vecs = _adaptedFrame.remove(_interactionVecs);
    for (Vec v : vecs) v.remove();
    _interactions = null;
  }
}
public Vec replace(int col, Vec nv) {
  assert col < _names.length;
  Vec rv = vecs()[col];
  assert rv.group().equals(nv.group());
  _vecs[col] = nv;
  _keys[col] = nv._key;
  if (DKV.get(nv._key) == null) // If not already in KV, put it there
    DKV.put(nv._key, nv);
  return rv;
}
/**
 * Check that the vectors are all compatible: all Vecs have their content sharded using the same
 * number of rows per chunk.
 */
public void checkCompatible() {
  try {
    Vec v0 = anyVec();
    int nchunks = v0.nChunks();
    for (Vec vec : vecs()) {
      if (vec instanceof AppendableVec) continue; // New Vectors are endlessly compatible
      if (vec.nChunks() != nchunks)
        throw new IllegalArgumentException(
            "Vectors have different numbers of chunks, " + nchunks + " and " + vec.nChunks());
    }
    // Also check that each chunk has the same number of rows
    for (int i = 0; i < nchunks; i++) {
      long es = v0.chunk2StartElem(i);
      for (Vec vec : vecs())
        if (!(vec instanceof AppendableVec) && vec.chunk2StartElem(i) != es)
          throw new IllegalArgumentException(
              "Vector chunks have different numbers of rows, " + es + " and "
                  + vec.chunk2StartElem(i));
    }
  } catch (Throwable ex) {
    Throwables.propagate(ex);
  }
}
@Override
protected void setupLocal() {
  // Precompute the first input chunk index and start row inside that chunk for this partition
  Vec anyInVec = _srcVecs[0];
  long[] partSizes = Utils.partitione(anyInVec.length(), _ratios);
  long pnrows = 0;
  for (int p = 0; p < _partIdx; p++) pnrows += partSizes[p];
  long[] espc = anyInVec._espc;
  while (_pcidx < espc.length - 1
      && (pnrows -= (espc[_pcidx + 1] - espc[_pcidx])) > 0) _pcidx++;
  assert pnrows <= 0;
  _psrow = (int) (pnrows + espc[_pcidx + 1] - espc[_pcidx]);
}
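// Clarifying comment (added; assumes the usual espc semantics in this codebase):
// _espc[c] is the global row at which chunk c starts, so espc[c+1] - espc[c] is
// chunk c's row count. The while-loop above subtracts chunk sizes from the
// partition's global start row until the remainder goes non-positive, leaving
// _pcidx at the containing chunk and _psrow at the row offset within that chunk.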
// Make vector templates for all output frame vectors
private Vec[][] makeTemplates(Frame dataset, float[] ratios) {
  Vec anyVec = dataset.anyVec();
  final long[][] espcPerSplit = computeEspcPerSplit(anyVec._espc, anyVec.length(), ratios);
  final int num = dataset.numCols(); // number of columns in input frame
  final int nsplits = espcPerSplit.length; // number of splits
  final String[][] domains = dataset.domains(); // domains
  Vec[][] t = new Vec[nsplits][/*num*/]; // resulting vectors for all splits
  for (int i = 0; i < nsplits; i++) {
    // vectors for i-th split
    t[i] = new Vec(Vec.newKey(), espcPerSplit[i /*-th split*/]).makeZeros(num, domains);
  }
  return t;
}
public Frame(String[] names, Vec[] vecs) {
  // assert names == null || names.length == vecs.length
  //     : "Number of columns does not match the number of column names.";
  _names = names;
  _vecs = vecs;
  _keys = new Key[vecs.length];
  for (int i = 0; i < vecs.length; i++) {
    Key k = _keys[i] = vecs[i]._key;
    if (DKV.get(k) == null) // If not already in KV, put it there
      DKV.put(k, vecs[i]);
  }
  Vec v0 = anyVec();
  if (v0 == null) return;
  VectorGroup grp = v0.group();
  for (int i = 0; i < vecs.length; i++) assert grp.equals(vecs[i].group());
}
@Override
Val apply(Env env, Env.StackHelp stk, AST asts[]) {
  // Compute the variable args. Find the common row count.
  Val vals[] = new Val[asts.length];
  Vec vec = null;
  for (int i = 1; i < asts.length; i++) {
    vals[i] = stk.track(asts[i].exec(env));
    if (vals[i].isFrame()) {
      Vec anyvec = vals[i].getFrame().anyVec();
      if (anyvec == null) continue; // Ignore the empty frame
      if (vec == null) vec = anyvec;
      else if (vec.length() != anyvec.length())
        throw new IllegalArgumentException(
            "cbind frames must have all the same rows, found "
                + vec.length() + " and " + anyvec.length() + " rows.");
    }
  }
  boolean clean = false;
  if (vec == null) { // Default to length 1
    vec = Vec.makeZero(1);
    clean = true;
  }
  // Populate the new Frame
  Frame fr = new Frame();
  for (int i = 1; i < asts.length; i++) {
    switch (vals[i].type()) {
      case Val.FRM:
        fr.add(fr.makeCompatible(vals[i].getFrame()));
        break;
      case Val.FUN:
        throw H2O.unimpl();
      case Val.STR:
        throw H2O.unimpl();
      case Val.NUM:
        // Auto-expand scalars to fill every row
        double d = vals[i].getNum();
        fr.add(Double.toString(d), vec.makeCon(d));
        break;
      default:
        throw H2O.unimpl();
    }
  }
  if (clean) vec.remove();
  return new ValFrame(fr);
}
/**
 * Global redistribution of a Frame (balancing of chunks), done by calling process (all-to-one +
 * one-to-all).
 *
 * @param fr Input frame
 * @param splits Target number of chunks
 * @param seed RNG seed
 * @param local Unused in this code path
 * @param shuffle Whether to shuffle the data globally
 * @return Shuffled frame
 */
public static Frame shuffleAndBalance(
    final Frame fr, int splits, long seed, final boolean local, final boolean shuffle) {
  if ((fr.vecs()[0].nChunks() < splits || shuffle) && fr.numRows() > splits) {
    Vec[] vecs = fr.vecs().clone();
    Log.info("Load balancing dataset, splitting it into up to " + splits + " chunks.");
    long[] idx = null;
    if (shuffle) {
      idx = new long[splits];
      for (int r = 0; r < idx.length; ++r) idx[r] = r;
      Utils.shuffleArray(idx, seed);
    }
    Key keys[] = new Vec.VectorGroup().addVecs(vecs.length);
    final long rows_per_new_chunk = (long) (Math.ceil((double) fr.numRows() / splits));
    // loop over cols (same indexing for each column)
    Futures fs = new Futures();
    for (int col = 0; col < vecs.length; col++) {
      AppendableVec vec = new AppendableVec(keys[col]);
      // create outgoing chunks for this col
      NewChunk[] outCkg = new NewChunk[splits];
      for (int i = 0; i < splits; ++i) outCkg[i] = new NewChunk(vec, i);
      // loop over all incoming chunks
      for (int ckg = 0; ckg < vecs[col].nChunks(); ckg++) {
        final Chunk inCkg = vecs[col].chunkForChunkIdx(ckg);
        // loop over local rows of incoming chunks (fast path)
        for (int row = 0; row < inCkg._len; ++row) {
          int outCkgIdx =
              (int) ((inCkg._start + row) / rows_per_new_chunk); // destination chunk idx
          if (shuffle)
            outCkgIdx = (int) (idx[outCkgIdx]); // shuffle: choose a different output chunk
          assert (outCkgIdx >= 0 && outCkgIdx < splits);
          outCkg[outCkgIdx].addNum(inCkg.at0(row));
        }
      }
      for (int i = 0; i < outCkg.length; ++i) outCkg[i].close(i, fs);
      Vec t = vec.close(fs);
      t._domain = vecs[col]._domain;
      vecs[col] = t;
    }
    fs.blockForPending();
    Log.info("Load balancing done.");
    return new Frame(fr.names(), vecs);
  }
  return fr;
}
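// Editor-added usage sketch (the splits heuristic and helper name are assumptions,
// not mandated by shuffleAndBalance itself): rebalance a frame so a following
// MRTask2 pass can use all cores, preserving row order (shuffle == false).
static Frame rebalanceForParallelism(Frame fr, long seed) {
  int splits = 4 * H2O.NUMCPUS * H2O.CLOUD.size(); // hypothetical target chunk count
  return shuffleAndBalance(fr, splits, seed, false /*local*/, false /*shuffle*/);
}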
@Override
public String toString() {
  // Across
  String s = "{" + _names[0];
  long bs = _vecs[0].byteSize();
  for (int i = 1; i < _names.length; i++) {
    s += "," + _names[i];
    bs += _vecs[i].byteSize();
  }
  s += "}, " + PrettyPrint.bytes(bs) + "\n";
  // Down
  Vec v0 = firstReadable();
  if (v0 == null) return s;
  int nc = v0.nChunks();
  s += "Chunk starts: {";
  for (int i = 0; i < nc; i++) s += v0.elem2BV(i)._start + ",";
  s += "}";
  return s;
}
/** Appends a named column, keeping the last Vec as the response */
public void add(String name, Vec vec) {
  assert _vecs.length == 0 || anyVec().group().equals(vec.group());
  final int len = _names.length;
  _names = Arrays.copyOf(_names, len + 1);
  _vecs = Arrays.copyOf(_vecs, len + 1);
  _keys = Arrays.copyOf(_keys, len + 1);
  _names[len] = name;
  _vecs[len] = vec;
  _keys[len] = vec._key;
}
@Override
ValFrame apply(Env env, Env.StackHelp stk, AST asts[]) {
  Frame fr = stk.track(asts[1].exec(env)).getFrame();
  double frac = asts[2].exec(env).getNum();
  double nrow = fr.numRows() * frac;
  Vec vecs[] = fr.vecs();
  long[] idxs = new long[fr.numCols()];
  int j = 0;
  for (int i = 0; i < idxs.length; i++)
    if (vecs[i].naCnt() < nrow) idxs[j++] = i;
  Vec vec = Vec.makeVec(Arrays.copyOf(idxs, j), null, Vec.VectorGroup.VG_LEN1.addVec());
  return new ValFrame(new Frame(vec));
}
/**
 * Compute the L2 norm for each row of the frame.
 *
 * @param fr Input frame
 * @param scale Per-column scaling factors
 * @return Vec containing the L2 norm of each row; the Vec is in the K-V store
 */
public static Vec getL2(final Frame fr, final double[] scale) {
  // add workspace vec at end
  final int idx = fr.numCols();
  assert (scale.length == idx) : "Mismatch for number of columns";
  fr.add("L2", fr.anyVec().makeZero());
  Vec res;
  try {
    new MRTask2() {
      @Override
      public void map(Chunk[] cs) {
        for (int r = 0; r < cs[0]._len; r++) {
          double norm2 = 0;
          for (int i = 0; i < idx; i++) norm2 += Math.pow(cs[i].at0(r) * scale[i], 2);
          cs[idx].set0(r, Math.sqrt(norm2));
        }
      }
    }.doAll(fr);
  } finally {
    res = fr.remove(idx);
  }
  res.rollupStats();
  return res;
}
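// Hedged usage sketch (editor-added; the helper name is hypothetical and `fr` is
// assumed to be an all-numeric Frame): equal column weights give the plain
// row-wise Euclidean norm.
static double maxRowNorm(Frame fr) {
  double[] scale = new double[fr.numCols()];
  java.util.Arrays.fill(scale, 1.0); // no per-column rescaling
  Vec l2 = getL2(fr, scale); // one value per row, lives in the K-V store
  double m = l2.max(); // rollups were forced above via rollupStats()
  l2.remove(); // caller owns the returned Vec; clean it up when done
  return m;
}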
@Override
public void map(Chunk cs) {
  int idx = _chunkOffset + cs.cidx();
  Key ckey = Vec.chunkKey(_v._key, idx);
  if (_cmap != null) {
    assert !cs.hasFloat()
        : "Input chunk (" + cs.getClass() + ") has float, but is expected to be categorical";
    NewChunk nc = new NewChunk(_v, idx);
    // loop over rows and update ints for new domain mapping according to vecs[c].domain()
    for (int r = 0; r < cs._len; ++r) {
      if (cs.isNA(r)) nc.addNA();
      else nc.addNum(_cmap[(int) cs.at8(r)], 0);
    }
    nc.close(_fs);
  } else {
    DKV.put(ckey, cs.deepCopy(), _fs, true);
  }
}
public static Key makeByteVec(Key k, String... data) {
  byte[][] chunks = new byte[data.length][];
  long[] espc = new long[data.length + 1];
  for (int i = 0; i < chunks.length; ++i) {
    chunks[i] = data[i].getBytes();
    espc[i + 1] = espc[i] + data[i].length();
  }
  Futures fs = new Futures();
  Key key = Vec.newKey();
  ByteVec bv = new ByteVec(key, Vec.ESPC.rowLayout(key, espc));
  for (int i = 0; i < chunks.length; ++i) {
    Key chunkKey = bv.chunkKey(i);
    DKV.put(
        chunkKey,
        new Value(chunkKey, chunks[i].length, chunks[i], TypeMap.C1NCHUNK, Value.ICE),
        fs);
  }
  DKV.put(bv._key, bv, fs);
  Frame fr = new Frame(k, new String[] {"makeByteVec"}, new Vec[] {bv});
  DKV.put(k, fr, fs);
  fs.blockForPending();
  return k;
}
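// Editor-added illustration (key name and helper are hypothetical): wrap raw CSV
// text in a one-column ByteVec Frame, e.g. as input for a parser test. Each string
// argument becomes one chunk of the ByteVec.
static Frame csvAsByteVecFrame() {
  Key fkey = makeByteVec(Key.make("test_bytes"), "a,b\n", "1,2\n3,4\n");
  return DKV.get(fkey).get(); // the Frame installed above; vec(0) is the ByteVec
}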
@Override
Val apply(Env env, Env.StackHelp stk, AST asts[]) {
  // Execute all args. Find a canonical frame; all Frames must look like this one.
  // Each argument turns into either a Frame (whose rows are entirely inlined)
  // or a scalar (which is replicated across as a single row).
  Frame fr = null; // Canonical Frame; all frames have the same column count, types and names
  int nchks = 0; // Total chunks
  Val vals[] = new Val[asts.length]; // Computed AST results
  for (int i = 1; i < asts.length; i++) {
    vals[i] = stk.track(asts[i].exec(env));
    if (vals[i].isFrame()) {
      fr = vals[i].getFrame();
      nchks += fr.anyVec().nChunks(); // Total chunks
    } else nchks++; // One chunk per scalar
  }
  // No Frame, just a pile-o-scalars?
  Vec zz = null; // The zero-length vec for the zero-frame frame
  if (fr == null) { // Zero-length, 1-column, default name
    fr = new Frame(new String[] {Frame.defaultColName(0)}, new Vec[] {zz = Vec.makeZero(0)});
    if (asts.length == 1) return new ValFrame(fr);
  }
  // Verify all Frames are the same columns, names, and types. Domains can vary,
  // and will be the union.
  final Frame frs[] = new Frame[asts.length]; // Input frames
  final byte[] types = fr.types(); // Column types
  final int ncols = fr.numCols();
  final long[] espc = new long[nchks + 1]; // Compute a new layout!
  int coffset = 0;
  for (int i = 1; i < asts.length; i++) {
    Val val = vals[i]; // Save values computed for pass 2
    Frame fr0 =
        val.isFrame()
            ? val.getFrame()
            // Scalar: auto-expand into a 1-row frame
            : stk.track(new Frame(fr._names, Vec.makeCons(val.getNum(), 1L, fr.numCols())));
    // Check that all frames are compatible
    if (fr.numCols() != fr0.numCols())
      throw new IllegalArgumentException(
          "rbind frames must have all the same columns, found "
              + fr.numCols() + " and " + fr0.numCols() + " columns.");
    if (!Arrays.deepEquals(fr._names, fr0._names))
      throw new IllegalArgumentException(
          "rbind frames must have all the same column names, found "
              + Arrays.toString(fr._names) + " and " + Arrays.toString(fr0._names));
    if (!Arrays.equals(types, fr0.types()))
      throw new IllegalArgumentException(
          "rbind frames must have all the same column types, found "
              + Arrays.toString(types) + " and " + Arrays.toString(fr0.types()));
    frs[i] = fr0; // Save frame
    // Roll up the ESPC row counts
    long roffset = espc[coffset];
    long[] espc2 = fr0.anyVec().espc();
    for (int j = 1; j < espc2.length; j++) // Roll up the row counts
      espc[coffset + j] = (roffset + espc2[j]);
    coffset += espc2.length - 1; // Chunk offset
  }
  if (zz != null) zz.remove();
  // build up the new domains for each vec
  HashMap<String, Integer>[] dmap = new HashMap[types.length];
  String[][] domains = new String[types.length][];
  int[][][] cmaps = new int[types.length][][];
  for (int k = 0; k < types.length; ++k) {
    dmap[k] = new HashMap<>();
    int c = 0;
    byte t = types[k];
    if (t == Vec.T_CAT) {
      int[][] maps = new int[frs.length][];
      for (int i = 1; i < frs.length; i++) {
        maps[i] = new int[frs[i].vec(k).domain().length];
        for (int j = 0; j < maps[i].length; j++) {
          String s = frs[i].vec(k).domain()[j];
          if (!dmap[k].containsKey(s)) dmap[k].put(s, maps[i][j] = c++);
          else maps[i][j] = dmap[k].get(s);
        }
      }
      cmaps[k] = maps;
    } else {
      cmaps[k] = new int[frs.length][];
    }
    domains[k] = c == 0 ? null : new String[c];
    for (Map.Entry<String, Integer> e : dmap[k].entrySet())
      domains[k][e.getValue()] = e.getKey();
  }
  // Now make Keys for the new Vecs
  Key<Vec>[] keys = fr.anyVec().group().addVecs(fr.numCols());
  Vec[] vecs = new Vec[fr.numCols()];
  int rowLayout = Vec.ESPC.rowLayout(keys[0], espc);
  for (int i = 0; i < vecs.length; i++)
    vecs[i] = new Vec(keys[i], rowLayout, domains[i], types[i]);
  // Do the row-binds column-by-column.
  // Switch to F/J thread for continuations
  ParallelRbinds t;
  H2O.submitTask(t = new ParallelRbinds(frs, espc, vecs, cmaps)).join();
  return new ValFrame(new Frame(fr.names(), t._vecs));
}
/**
 * The train/valid Frame instances are sorted by categorical (themselves sorted by cardinality
 * greatest to least) with all numerical columns following. The response column(s) are placed at
 * the end.
 *
 * <p>Interactions:
 * 1. Num-Num (Note: N(0,1) * N(0,1) ~ N(0,1))
 * 2. Num-Enum
 * 3. Enum-Enum
 *
 * <p>Interactions are produced on the fly and are dense (in all 3 cases). Consumers of DataInfo
 * should not have to care how these interactions are generated. Any heuristic using the fullN
 * value should continue functioning the same.
 *
 * <p>Interactions are specified in two ways:
 * A. As a list of pairs of column indices.
 * B. As a list of pairs of column indices with limited enums.
 */
public DataInfo(
    Frame train,
    Frame valid,
    int nResponses,
    boolean useAllFactorLevels,
    TransformType predictor_transform,
    TransformType response_transform,
    boolean skipMissing,
    boolean imputeMissing,
    boolean missingBucket,
    boolean weight,
    boolean offset,
    boolean fold,
    Model.InteractionPair[] interactions) {
  super(Key.<DataInfo>make());
  _valid = valid != null;
  assert predictor_transform != null;
  assert response_transform != null;
  _offset = offset;
  _weights = weight;
  _fold = fold;
  assert !(skipMissing && imputeMissing) : "skipMissing and imputeMissing cannot both be true";
  _skipMissing = skipMissing;
  _imputeMissing = imputeMissing;
  _predictor_transform = predictor_transform;
  _response_transform = response_transform;
  _responses = nResponses;
  _useAllFactorLevels = useAllFactorLevels;
  _interactions = interactions;
  // create dummy InteractionWrappedVecs and shove them onto the front
  if (_interactions != null) {
    _interactionVecs = new int[_interactions.length];
    train =
        Model.makeInteractions(
                train, false, _interactions, _useAllFactorLevels, _skipMissing,
                predictor_transform == TransformType.STANDARDIZE)
            .add(train);
    if (valid != null)
      valid =
          Model.makeInteractions(
                  valid, true, _interactions, _useAllFactorLevels, _skipMissing,
                  predictor_transform == TransformType.STANDARDIZE)
              .add(valid); // FIXME: should be using the training subs/muls!
  }
  _permutation = new int[train.numCols()];
  final Vec[] tvecs = train.vecs();
  // Count categorical-vs-numerical
  final int n =
      tvecs.length - _responses - (offset ? 1 : 0) - (weight ? 1 : 0) - (fold ? 1 : 0);
  int[] nums = MemoryManager.malloc4(n);
  int[] cats = MemoryManager.malloc4(n);
  int nnums = 0, ncats = 0;
  for (int i = 0; i < n; ++i)
    if (tvecs[i].isCategorical()) cats[ncats++] = i;
    else nums[nnums++] = i;
  _nums = nnums;
  _cats = ncats;
  _catLvls = new int[ncats][];
  // sort the cats in decreasing order of their cardinality
  for (int i = 0; i < ncats; ++i)
    for (int j = i + 1; j < ncats; ++j)
      if (tvecs[cats[i]].domain().length < tvecs[cats[j]].domain().length) {
        int x = cats[i];
        cats[i] = cats[j];
        cats[j] = x;
      }
  String[] names = new String[train.numCols()];
  Vec[] tvecs2 = new Vec[train.numCols()];
  // Compute the cardinality of each cat
  _catModes = new int[ncats];
  _catOffsets = MemoryManager.malloc4(ncats + 1);
  _catMissing = new boolean[ncats];
  int len = _catOffsets[0] = 0;
  int interactionIdx = 0; // simple index into the _interactionVecs array
  ArrayList<Integer> interactionIds;
  if (_interactions == null) {
    interactionIds = new ArrayList<>();
    for (int i = 0; i < tvecs.length; ++i)
      if (tvecs[i] instanceof InteractionWrappedVec) {
        interactionIds.add(i);
      }
    _interactionVecs = new int[interactionIds.size()];
    for (int i = 0; i < _interactionVecs.length; ++i)
      _interactionVecs[i] = interactionIds.get(i);
  }
  for (int i = 0; i < ncats; ++i) {
    names[i] = train._names[cats[i]];
    Vec v = (tvecs2[i] = tvecs[cats[i]]);
    _catMissing[i] = missingBucket; // needed for test time
    if (v instanceof InteractionWrappedVec) {
      if (_interactions != null) _interactions[interactionIdx].vecIdx = i;
      // i (and not cats[i]) because this is the index in _adaptedFrame
      _interactionVecs[interactionIdx++] = i;
      _catOffsets[i + 1] = (len += v.domain().length + (missingBucket ? 1 : 0));
    } else
      _catOffsets[i + 1] =
          (len += v.domain().length - (useAllFactorLevels ? 0 : 1) + (missingBucket ? 1 : 0));
    // missing values turn into a new factor level
    _catModes[i] =
        imputeMissing
            ? imputeCat(train.vec(cats[i]))
            : _catMissing[i] ? v.domain().length : -100;
    _permutation[i] = cats[i];
  }
  _numMeans = new double[nnums];
  _numOffsets = MemoryManager.malloc4(nnums + 1);
  _numOffsets[0] = len;
  boolean isIWV; // is InteractionWrappedVec?
  for (int i = 0; i < nnums; ++i) {
    names[i + ncats] = train._names[nums[i]];
    Vec v = train.vec(nums[i]);
    tvecs2[i + ncats] = v;
    isIWV = v instanceof InteractionWrappedVec;
    if (isIWV) {
      if (null != _interactions) _interactions[interactionIdx].vecIdx = i + ncats;
      _interactionVecs[interactionIdx++] = i + ncats;
    }
    _numOffsets[i + 1] = (len += (isIWV ? ((InteractionWrappedVec) v).expandedLength() : 1));
    _numMeans[i] = train.vec(nums[i]).mean();
    _permutation[i + ncats] = nums[i];
  }
  for (int i =
          names.length - nResponses - (weight ? 1 : 0) - (offset ? 1 : 0) - (fold ? 1 : 0);
      i < names.length;
      ++i) {
    names[i] = train._names[i];
    tvecs2[i] = train.vec(i);
  }
  _adaptedFrame = new Frame(names, tvecs2);
  train.restructure(names, tvecs2);
  if (valid != null) valid.restructure(names, valid.vecs(names));
  // _adaptedFrame = train;
  setPredictorTransform(predictor_transform);
  if (_responses > 0) setResponseTransform(response_transform);
}
public static int imputeCat(Vec v) {
  if (v.isCategorical()) return v.mode();
  return (int) Math.round(v.mean());
}
public Frame deepSlice(Object orows, Object ocols) {
  // ocols is either a long[] or a Frame-of-1-Vec
  long[] cols;
  if (ocols == null) {
    cols = (long[]) ocols;
    assert cols == null;
  } else {
    if (ocols instanceof long[]) {
      cols = (long[]) ocols;
    } else if (ocols instanceof Frame) {
      Frame fr = (Frame) ocols;
      if (fr.numCols() != 1) {
        throw new IllegalArgumentException(
            "Columns Frame must have only one column (actually has " + fr.numCols()
                + " columns)");
      }
      long n = fr.anyVec().length();
      if (n > MAX_EQ2_COLS) {
        throw new IllegalArgumentException(
            "Too many requested columns (requested " + n + ", max " + MAX_EQ2_COLS + ")");
      }
      cols = new long[(int) n];
      Vec v = fr._vecs[0];
      for (long i = 0; i < v.length(); i++) {
        cols[(int) i] = v.at8(i);
      }
    } else {
      throw new IllegalArgumentException(
          "Columns is specified by an unsupported data type ("
              + ocols.getClass().getName() + ")");
    }
  }
  // Since cols is probably short, convert to a positive list.
  int c2[] = null;
  if (cols == null) {
    c2 = new int[numCols()];
    for (int i = 0; i < c2.length; i++) c2[i] = i;
  } else if (cols.length == 0) {
    c2 = new int[0];
  } else if (cols[0] > 0) {
    c2 = new int[cols.length];
    for (int i = 0; i < cols.length; i++)
      c2[i] = (int) cols[i] - 1; // Convert 1-based cols to zero-based
  } else {
    c2 = new int[numCols() - cols.length];
    int j = 0;
    for (int i = 0; i < numCols(); i++) {
      if (j >= cols.length || i < (-cols[j] - 1)) c2[i - j] = i;
      else j++;
    }
  }
  for (int i = 0; i < c2.length; i++)
    if (c2[i] >= numCols())
      throw new IllegalArgumentException(
          "Trying to select column " + c2[i] + " but only " + numCols() + " present.");
  if (c2.length == 0)
    throw new IllegalArgumentException(
        "No columns selected (did you try to select column 0 instead of column 1?)");
  // Do Da Slice
  // orows is either a long[] or a Vec
  if (orows == null)
    return new DeepSlice((long[]) orows, c2)
        .doAll(c2.length, this)
        .outputFrame(names(c2), domains(c2));
  else if (orows instanceof long[]) {
    final long CHK_ROWS = 1000000;
    long[] rows = (long[]) orows;
    if (rows.length == 0)
      return new DeepSlice(rows, c2).doAll(c2.length, this).outputFrame(names(c2), domains(c2));
    if (rows[0] < 0)
      return new DeepSlice(rows, c2).doAll(c2.length, this).outputFrame(names(c2), domains(c2));
    // Vec'ize the index array
    AppendableVec av = new AppendableVec("rownames");
    int r = 0;
    int c = 0;
    while (r < rows.length) {
      NewChunk nc = new NewChunk(av, c);
      long end = Math.min(r + CHK_ROWS, rows.length);
      for (; r < end; r++) {
        nc.addNum(rows[r]);
      }
      nc.close(c++, null);
    }
    Vec c0 = av.close(null); // c0 is the row index vec
    Frame fr2 =
        new Slice(c2, this)
            .doAll(c2.length, new Frame(new String[] {"rownames"}, new Vec[] {c0}))
            .outputFrame(names(c2), domains(c2));
    UKV.remove(c0._key); // Remove hidden vector
    return fr2;
  }
  Frame frows = (Frame) orows;
  Vec vrows = frows.anyVec();
  // It's a compatible Vec; use it as a boolean selector.
  // Build column names for the result.
  Vec[] vecs = new Vec[c2.length + 1];
  String[] names = new String[c2.length + 1];
  for (int i = 0; i < c2.length; ++i) {
    vecs[i] = _vecs[c2[i]];
    names[i] = _names[c2[i]];
  }
  vecs[c2.length] = vrows;
  names[c2.length] = "predicate";
  return new DeepSelect()
      .doAll(c2.length, new Frame(names, vecs))
      .outputFrame(names(c2), domains(c2));
}
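// Editor-added examples of the index conventions deepSlice implements (helper name
// hypothetical; `fr` is an assumed existing Frame): row and column selections are
// 1-based, and a leading negative column index flips the list to an exclusion set.
static void deepSliceExamples(Frame fr) {
  Frame firstTwoCols = fr.deepSlice(null, new long[] {1, 2}); // keep columns 1 and 2
  Frame allButFirst = fr.deepSlice(null, new long[] {-1}); // every column except column 1
  Frame headRows = fr.deepSlice(new long[] {1, 2, 3}, null); // rows 1..3, all columns
}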
private void setTransform(
    TransformType t, double[] normMul, double[] normSub, int vecStart, int n) {
  int idx = 0; // idx != i when interactions are in play; otherwise it's just 'i'
  for (int i = 0; i < n; ++i) {
    Vec v = _adaptedFrame.vec(vecStart + i);
    boolean isIWV = isInteractionVec(vecStart + i);
    switch (t) {
      case STANDARDIZE:
        normMul[idx] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
        if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
        normSub[idx] = v.mean();
        break;
      case NORMALIZE:
        normMul[idx] = (v.max() - v.min() > 0) ? 1.0 / (v.max() - v.min()) : 1.0;
        if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
        normSub[idx] = v.mean();
        break;
      case DEMEAN:
        normMul[idx] = 1;
        if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
        normSub[idx] = v.mean();
        break;
      case DESCALE:
        normMul[idx] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
        if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1;
        normSub[idx] = 0;
        break;
      default:
        throw H2O.unimpl();
    }
    assert !Double.isNaN(normMul[idx]);
    assert !Double.isNaN(normSub[idx]);
    idx = isIWV ? (idx + nextNumericIdx(i)) : (idx + 1);
  }
}
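// Editor-added note on the (normSub, normMul) pairs filled in above: they are
// consumed as an affine transform per numeric column,
//   transformed = (x - normSub[idx]) * normMul[idx]
// so STANDARDIZE yields (x - mean) / sigma, NORMALIZE rescales by the column's
// range around the mean, DEMEAN subtracts the mean only, and DESCALE divides by
// sigma without centering.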
public ClassDist(final Vec label) { super(label.domain().length); }
/** Returns the first readable vector. */
public Vec anyVec() {
  if (_col0 != null) return _col0;
  for (Vec v : vecs()) if (v.readable()) return (_col0 = v);
  return null;
}
/**
 * Stratified sampling for classifiers.
 *
 * @param fr Input frame
 * @param label Label vector (must be enum)
 * @param sampling_ratios Optional: array containing the requested sampling ratios per class (in
 *     order of domains); will be overwritten if it contains all 0s
 * @param maxrows Maximum number of rows in the returned frame
 * @param seed RNG seed for sampling
 * @param allowOversampling Allow oversampling of minority classes
 * @param verbose Whether to print verbose info
 * @return Sampled frame, with approximately the same number of samples from each class (or as
 *     given by the requested sampling ratios)
 */
public static Frame sampleFrameStratified(
    final Frame fr,
    Vec label,
    float[] sampling_ratios,
    long maxrows,
    final long seed,
    final boolean allowOversampling,
    final boolean verbose) {
  if (fr == null) return null;
  assert (label.isEnum());
  assert (maxrows >= label.domain().length);
  long[] dist = new ClassDist(label).doAll(label).dist();
  assert (dist.length > 0);
  Log.info(
      "Doing stratified sampling for data set containing " + fr.numRows() + " rows from "
          + dist.length + " classes. Oversampling: " + (allowOversampling ? "on" : "off"));
  if (verbose) {
    for (int i = 0; i < dist.length; ++i) {
      Log.info(
          "Class " + label.domain(i) + ": count: " + dist[i]
              + " prior: " + (float) dist[i] / fr.numRows());
    }
  }
  // create sampling_ratios for class balance with at most maxrows rows
  // (fill the existing array if not null)
  if (sampling_ratios == null
      || (Utils.minValue(sampling_ratios) == 0 && Utils.maxValue(sampling_ratios) == 0)) {
    // compute sampling ratios to achieve class balance
    if (sampling_ratios == null) {
      sampling_ratios = new float[dist.length];
    }
    assert (sampling_ratios.length == dist.length);
    for (int i = 0; i < dist.length; ++i) {
      // prior^-1 / num_classes
      sampling_ratios[i] = ((float) fr.numRows() / label.domain().length) / dist[i];
    }
    // the majority class has the lowest required oversampling factor to achieve balance
    final float inv_scale = Utils.minValue(sampling_ratios);
    if (!Float.isNaN(inv_scale) && !Float.isInfinite(inv_scale))
      // want sampling_ratio 1.0 for the majority class (no downsampling)
      Utils.div(sampling_ratios, inv_scale);
  }
  if (!allowOversampling) {
    for (int i = 0; i < sampling_ratios.length; ++i) {
      sampling_ratios[i] = Math.min(1.0f, sampling_ratios[i]);
    }
  }
  // given these sampling ratios and the original class distribution, this is the
  // expected number of resulting rows
  float numrows = 0;
  for (int i = 0; i < sampling_ratios.length; ++i) {
    numrows += sampling_ratios[i] * dist[i];
  }
  final long actualnumrows = Math.min(maxrows, Math.round(numrows)); // cap #rows at maxrows
  // can have no matching rows in case of sparse data where we had to fill in a
  // makeZero() vector
  assert (actualnumrows >= 0);
  Log.info(
      "Stratified sampling to a total of " + String.format("%,d", actualnumrows) + " rows.");
  if (actualnumrows != numrows) {
    // adjust the sampling_ratios by the global rescaling factor
    Utils.mult(sampling_ratios, (float) actualnumrows / numrows);
    if (verbose)
      Log.info(
          "Downsampling majority class by " + (float) actualnumrows / numrows
              + " to limit number of rows to " + String.format("%,d", maxrows));
  }
  Log.info(
      "Majority class (" + label.domain()[Utils.minIndex(sampling_ratios)]
          + ") sampling ratio: " + Utils.minValue(sampling_ratios));
  Log.info(
      "Minority class (" + label.domain()[Utils.maxIndex(sampling_ratios)]
          + ") sampling ratio: " + Utils.maxValue(sampling_ratios));
  return sampleFrameStratified(fr, label, sampling_ratios, seed, verbose);
}
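// Editor-added usage sketch (helper name and label position are assumptions):
// pass null ratios so the method derives per-class ratios for a balanced sample,
// capped here at 100,000 rows.
static Frame balanceClasses(Frame fr, long seed) {
  Vec label = fr.vecs()[fr.numCols() - 1]; // assumes the response is the last column
  return sampleFrameStratified(
      fr, label, null /*derive ratios for balance*/, 100000L /*maxrows*/,
      seed, true /*allowOversampling*/, true /*verbose*/);
}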
public int find(Vec vec) {
  for (int i = 0; i < _vecs.length; i++) if (vec.equals(_vecs[i])) return i;
  return -1;
}
/** Returns the first readable vector. */
public Vec firstReadable() {
  if (_col0 != null) return _col0;
  for (Vec v : _vecs) if (v != null && v.readable()) return (_col0 = v);
  return null;
}
// internal version with repeat counter
// currently hardcoded to do up to 10 tries to get a row from each class, which can be impossible
// for certain wrong sampling ratios
private static Frame sampleFrameStratified(
    final Frame fr,
    Vec label,
    final float[] sampling_ratios,
    final long seed,
    final boolean debug,
    int count) {
  if (fr == null) return null;
  assert (label.isEnum());
  assert (sampling_ratios != null && sampling_ratios.length == label.domain().length);
  final int labelidx = fr.find(label); // which column is the label?
  assert (labelidx >= 0);
  final boolean poisson = false; // beta feature
  Frame r =
      new MRTask2() {
        @Override
        public void map(Chunk[] cs, NewChunk[] ncs) {
          final Random rng = getDeterRNG(seed + cs[0].cidx());
          for (int r = 0; r < cs[0]._len; r++) {
            if (cs[labelidx].isNA0(r)) continue; // skip missing labels
            final int label = (int) cs[labelidx].at80(r);
            assert (sampling_ratios.length > label && label >= 0);
            int sampling_reps;
            if (poisson) {
              sampling_reps = Utils.getPoisson(sampling_ratios[label], rng);
            } else {
              final float remainder = sampling_ratios[label] - (int) sampling_ratios[label];
              sampling_reps =
                  (int) sampling_ratios[label] + (rng.nextFloat() < remainder ? 1 : 0);
            }
            for (int i = 0; i < ncs.length; i++) {
              for (int j = 0; j < sampling_reps; ++j) {
                ncs[i].addNum(cs[i].at0(r));
              }
            }
          }
        }
      }.doAll(fr.numCols(), fr).outputFrame(fr.names(), fr.domains());
  // Confirm the validity of the distribution
  long[] dist = new ClassDist(r.vecs()[labelidx]).doAll(r.vecs()[labelidx]).dist();
  // if there are no training labels in the test set, then there is no point in sampling the
  // test set
  if (dist == null) return fr;
  if (debug) {
    long sumdist = Utils.sum(dist);
    Log.info("After stratified sampling: " + sumdist + " rows.");
    for (int i = 0; i < dist.length; ++i) {
      Log.info(
          "Class " + r.vecs()[labelidx].domain(i) + ": count: " + dist[i]
              + " sampling ratio: " + sampling_ratios[i]
              + " actual relative frequency: " + (float) dist[i] / sumdist * dist.length);
    }
  }
  // Re-try if we didn't get at least one example from each class
  if (Utils.minValue(dist) == 0 && count < 10) {
    Log.info(
        "Re-doing stratified sampling because not all classes were represented (unlucky draw).");
    r.delete();
    return sampleFrameStratified(fr, label, sampling_ratios, seed + 1, debug, ++count);
  }
  // shuffle intra-chunk
  Frame shuffled = shuffleFramePerChunk(r, seed + 0x580FF13);
  r.delete();
  return shuffled;
}