public StringBuilder toString(StringBuilder sb, String[] fs, long idx) { Vec vecs[] = vecs(); for (int c = 0; c < fs.length; c++) { Vec vec = vecs[c]; if (vec.isEnum()) { String s = "----------"; if (!vec.isNA(idx)) { int x = (int) vec.at8(idx); if (x >= 0 && x < vec._domain.length) s = vec._domain[x]; } sb.append(String.format(fs[c], s)); } else if (vec.isInt()) { if (vec.isNA(idx)) { Chunk C = vec.elem2BV(0); // 1st Chunk int len = C.pformat_len0(); // Printable width for (int i = 0; i < len; i++) sb.append('-'); } else { try { sb.append(String.format(fs[c], vec.at8(idx))); } catch (IllegalFormatException ife) { System.out.println("Format: " + fs[c] + " col=" + c + " not for ints"); ife.printStackTrace(); } } } else { sb.append(String.format(fs[c], vec.at(idx))); if (vec.isNA(idx)) sb.append(' '); } sb.append(' '); // Column seperator } sb.append('\n'); return sb; }
@Override public void map(Chunk chks[], NewChunk nchks[]) { long rstart = chks[0]._start; int rlen = chks[0]._len; // Total row count int rx = 0; // Which row to in/ex-clude int rlo = 0; // Lo/Hi for this block of rows int rhi = rlen; while (true) { // Still got rows to include? if (_rows != null) { // Got a row selector? if (rx >= _rows.length) break; // All done with row selections long r = _rows[rx++] - 1; // Next row selector if (r < 0) { // Row exclusion if (rx > 0 && _rows[rx - 1] < _rows[rx]) throw H2O.unimpl(); long er = Math.abs(r) - 2; if (er < rstart) continue; // scoop up all of the rows before the first exclusion if (rx == 1 && ((int) (er + 1 - rstart)) > 0 && _ex) { rlo = (int) rstart; rhi = (int) (er - rstart); _ex = false; rx--; } else { rlo = (int) (er + 1 - rstart); // TODO: handle jumbled row indices ( e.g. -c(1,5,3) ) while (rx < _rows.length && (_rows[rx] + 1 == _rows[rx - 1] && rlo < rlen)) { if (rx < _rows.length - 1 && _rows[rx] < _rows[rx + 1]) throw H2O.unimpl(); rx++; rlo++; // Exclude consecutive rows } rhi = rx >= _rows.length ? rlen : (int) Math.abs(_rows[rx] - 1) - 2; if (rx < _rows.length - 1 && _rows[rx] < _rows[rx + 1]) throw H2O.unimpl(); } } else { // Positive row list? if (r < rstart) continue; rlo = (int) (r - rstart); rhi = rlo + 1; // Stop at the next row while (rx < _rows.length && (_rows[rx] - 1 - rstart) == rhi && rhi < rlen) { rx++; rhi++; // Grab sequential rows } } } // Process this next set of rows // For all cols in the new set for (int i = 0; i < _cols.length; i++) { Chunk oc = chks[_cols[i]]; NewChunk nc = nchks[i]; if (oc._vec.isInt()) { // Slice on integer columns for (int j = rlo; j < rhi; j++) if (oc.isNA0(j)) nc.addNA(); else nc.addNum(oc.at80(j), 0); } else { // Slice on double columns for (int j = rlo; j < rhi; j++) nc.addNum(oc.at0(j)); } } rlo = rhi; if (_rows == null) break; } }
/**
 * Row filter: the last input chunk acts as a predicate column, and every row whose predicate
 * value is not equal to zero has its remaining columns copied to the output chunks.
 *
 * @param chks input chunks; the final element is the predicate, the others are data columns
 * @param nchks output chunks, one per data column
 */
@Override
public void map(Chunk chks[], NewChunk nchks[]) {
  final int dataCols = chks.length - 1;
  final Chunk selector = chks[dataCols];
  for (int row = 0; row < selector._len; ++row) {
    if (selector.at0(row) == 0) {
      continue; // predicate is zero: drop the row
    }
    for (int col = 0; col < dataCols; ++col) {
      nchks[col].addNum(chks[col].at0(row));
    }
  }
}
/** * Global redistribution of a Frame (balancing of chunks), done by calling process (all-to-one + * one-to-all) * * @param fr Input frame * @param seed RNG seed * @param shuffle whether to shuffle the data globally * @return Shuffled frame */ public static Frame shuffleAndBalance( final Frame fr, int splits, long seed, final boolean local, final boolean shuffle) { if ((fr.vecs()[0].nChunks() < splits || shuffle) && fr.numRows() > splits) { Vec[] vecs = fr.vecs().clone(); Log.info("Load balancing dataset, splitting it into up to " + splits + " chunks."); long[] idx = null; if (shuffle) { idx = new long[splits]; for (int r = 0; r < idx.length; ++r) idx[r] = r; Utils.shuffleArray(idx, seed); } Key keys[] = new Vec.VectorGroup().addVecs(vecs.length); final long rows_per_new_chunk = (long) (Math.ceil((double) fr.numRows() / splits)); // loop over cols (same indexing for each column) Futures fs = new Futures(); for (int col = 0; col < vecs.length; col++) { AppendableVec vec = new AppendableVec(keys[col]); // create outgoing chunks for this col NewChunk[] outCkg = new NewChunk[splits]; for (int i = 0; i < splits; ++i) outCkg[i] = new NewChunk(vec, i); // loop over all incoming chunks for (int ckg = 0; ckg < vecs[col].nChunks(); ckg++) { final Chunk inCkg = vecs[col].chunkForChunkIdx(ckg); // loop over local rows of incoming chunks (fast path) for (int row = 0; row < inCkg._len; ++row) { int outCkgIdx = (int) ((inCkg._start + row) / rows_per_new_chunk); // destination chunk idx if (shuffle) outCkgIdx = (int) (idx[outCkgIdx]); // shuffle: choose a different output chunk assert (outCkgIdx >= 0 && outCkgIdx < splits); outCkg[outCkgIdx].addNum(inCkg.at0(row)); } } for (int i = 0; i < outCkg.length; ++i) outCkg[i].close(i, fs); Vec t = vec.close(fs); t._domain = vecs[col]._domain; vecs[col] = t; } fs.blockForPending(); Log.info("Load balancing done."); return new Frame(fr.names(), vecs); } return fr; }
// Print fixed-width row & fixed-width headers (more compressed print // format). Returns the column formats. public String[] toStringHdr(StringBuilder sb) { String[] fs = new String[numCols()]; for (int c = 0; c < fs.length; c++) { String n = (c < _names.length) ? _names[c] : ("C" + c); if (numRows() == 0) { sb.append(n).append(' '); continue; } int w = 0; if (_vecs[c].isEnum()) { String ss[] = _vecs[c]._domain; for (int i = 0; i < ss.length; i++) w = Math.max(w, ss[i].length()); w = Math.min(w, 10); fs[c] = "%" + w + "." + w + "s"; } else { Chunk C = _vecs[c].elem2BV(0); // 1st Chunk String f = fs[c] = C.pformat(); // Printable width for (int x = 0; x < f.length(); x++) // Get printable width from format if (Character.isDigit(f.charAt(x))) w = w * 10 + (f.charAt(x) - '0'); else if (w > 0) break; if (f.charAt(1) == ' ') w++; // Leading blank is not in print-width } int len = sb.length(); if (n.length() <= w) { // Short name, big digits sb.append(n); for (int i = n.length(); i < w; i++) sb.append(' '); } else if (w == 1) { // First char only sb.append(n.charAt(0)); } else if (w == 2) { // First 2 chars only sb.append(n.charAt(0)).append(n.charAt(1)); } else { // First char dot lastchars; e.g. Compress "Interval" to "I.val" sb.append(n.charAt(0)).append('.'); for (int i = n.length() - (w - 2); i < n.length(); i++) sb.append(n.charAt(i)); } assert len + w == sb.length(); sb.append(' '); // Column seperator } sb.append('\n'); return fs; }
@Override public void map(Chunk cs) { int idx = _chunkOffset + cs.cidx(); Key ckey = Vec.chunkKey(_v._key, idx); if (_cmap != null) { assert !cs.hasFloat() : "Input chunk (" + cs.getClass() + ") has float, but is expected to be categorical"; NewChunk nc = new NewChunk(_v, idx); // loop over rows and update ints for new domain mapping according to vecs[c].domain() for (int r = 0; r < cs._len; ++r) { if (cs.isNA(r)) nc.addNA(); else nc.addNum(_cmap[(int) cs.at8(r)], 0); } nc.close(_fs); } else { DKV.put(ckey, cs.deepCopy(), _fs, true); } }
/**
 * Counts how often each integer value occurs in the chunk, ignoring NAs. The counts are stored
 * in a freshly allocated {@code _ys} array of length {@code _nclass}, indexed by value.
 *
 * @param ys chunk whose values are used as indices into {@code _ys}
 */
@Override
public void map(Chunk ys) {
  _ys = new long[_nclass];
  for (int row = 0; row < ys._len; row++) {
    if (ys.isNA0(row)) {
      continue; // missing values are not counted
    }
    final int bucket = (int) ys.at80(row);
    _ys[bucket]++;
  }
}
/** * Extract (sparse) rows from given chunks. Note: 0 remains 0 - _normSub of DataInfo isn't used * (mean shift during standarization is not reverted) - UNLESS offset is specified (for GLM only) * Essentially turns the dataset 90 degrees. * * @param chunks - chunk of dataset * @return array of sparse rows */ public final Row[] extractSparseRows(Chunk[] chunks) { Row[] rows = new Row[chunks[0]._len]; long startOff = chunks[0].start(); for (int i = 0; i < rows.length; ++i) { rows[i] = new Row( true, Math.min(_nums, 16), _cats, _responses, i, startOff); // if sparse, _nums is the correct number of nonzero values! i.e., do not // use numNums() rows[i].rid = chunks[0].start() + i; if (_offset) { rows[i].offset = chunks[offsetChunkId()].atd(i); if (Double.isNaN(rows[i].offset)) rows[i].bad = true; } if (_weights) { rows[i].weight = chunks[weightChunkId()].atd(i); if (Double.isNaN(rows[i].weight)) rows[i].bad = true; } if (_skipMissing) { int N = _cats + _nums; for (int c = 0; c < N; ++c) if (chunks[c].isNA(i)) rows[i].bad = true; } } // categoricals for (int i = 0; i < _cats; ++i) { for (int r = 0; r < chunks[0]._len; ++r) { Row row = rows[r]; if (row.bad) continue; int cid = getCategoricalId(i, chunks[i].isNA(r) ? 
_catModes[i] : (int) chunks[i].at8(r)); if (cid >= 0) row.binIds[row.nBins++] = cid; } } // generic numbers + interactions int interactionOffset = 0; for (int cid = 0; cid < _nums; ++cid) { Chunk c = chunks[_cats + cid]; int oldRow = -1; if (c instanceof InteractionWrappedVec .InteractionWrappedChunk) { // for each row, only 1 value in an interaction is 'hot' // all other values are off (i.e., are 0) for (int r = 0; r < c._len; ++r) { // the vec is "vertically" dense and "horizontally" sparse (i.e., every row has // one, and only one, value) Row row = rows[r]; if (row.bad) continue; if (c.isNA(r)) row.bad = _skipMissing; int cidVirtualOffset = getInteractionOffset( chunks, _cats + cid, r); // the "virtual" offset into the hot-expanded interaction row.addNum( _numOffsets[cid] + cidVirtualOffset, c.atd(r)); // FIXME: if this produces a "true" NA then should sub with mean? with? } interactionOffset += nextNumericIdx(cid); } else { for (int r = c.nextNZ(-1); r < c._len; r = c.nextNZ(r)) { if (c.atd(r) == 0) continue; assert r > oldRow; oldRow = r; Row row = rows[r]; if (row.bad) continue; if (c.isNA(r)) row.bad = _skipMissing; double d = c.atd(r); if (Double.isNaN(d)) d = _numMeans[cid]; if (_normMul != null) d *= _normMul[interactionOffset]; row.addNum(_numOffsets[cid], d); } interactionOffset++; } } // response(s) for (int i = 1; i <= _responses; ++i) { int rid = responseChunkId(i - 1); Chunk rChunk = chunks[rid]; for (int r = 0; r < chunks[0]._len; ++r) { Row row = rows[r]; if (row.bad) continue; row.response[i - 1] = rChunk.atd(r); if (_normRespMul != null) { row.response[i - 1] = (row.response[i - 1] - _normRespSub[i - 1]) * _normRespMul[i - 1]; } if (Double.isNaN(row.response[row.response.length - i])) row.bad = true; } } return rows; }
/**
 * Builds a {@code Rows} view over the given chunks, passing {@code true} as the second argument
 * of the two-argument overload when more than half of the chunks report
 * {@code isSparseZero()}.
 *
 * @param chks chunks to wrap
 * @return result of {@code rows(chks, flag)}
 */
public Rows rows(Chunk[] chks) {
  int sparseZeroChunks = 0;
  for (int i = 0; i < chks.length; i++) {
    if (chks[i].isSparseZero()) {
      sparseZeroChunks++;
    }
  }
  final boolean majoritySparse = sparseZeroChunks > (chks.length >> 1);
  return rows(chks, majoritySparse);
}