/**
 * Extract (dense) rows from the given chunks, one Vec at a time - should be slightly faster
 * than extracting per-row.
 *
 * @param chunks - chunks of the dataset
 * @return array of dense rows
 */
public final Row[] extractDenseRowsVertical(Chunk[] chunks) {
  Row[] rows = new Row[chunks[0]._len];
  for (int i = 0; i < rows.length; ++i) {
    rows[i] = new Row(false, _nums, _cats, _responses, 0);
    rows[i].rid = chunks[0].start() + i;
    if (_offset) {
      rows[i].offset = chunks[offsetChunkId()].atd(i);
      if (Double.isNaN(rows[i].offset)) rows[i].bad = true;
    }
    if (_weights) {
      rows[i].weight = chunks[weightChunkId()].atd(i);
      if (Double.isNaN(rows[i].weight)) rows[i].bad = true;
    }
  }
  // categoricals
  for (int i = 0; i < _cats; ++i) {
    for (int r = 0; r < chunks[0]._len; ++r) {
      Row row = rows[r];
      if (row.bad) continue;
      if (chunks[i].isNA(r)) {
        if (_skipMissing) {
          row.bad = true;
        } else
          row.binIds[row.nBins++] = _catOffsets[i + 1] - 1; // missing value turns into extra (last) factor
      } else {
        int c = getCategoricalId(i, (int) chunks[i].at8(r));
        if (c >= 0) row.binIds[row.nBins++] = c;
      }
    }
  }
  int numStart = numStart();
  // generic numbers
  for (int cid = 0; cid < _nums; ++cid) {
    Chunk c = chunks[_cats + cid];
    for (int r = 0; r < c._len; ++r) {
      Row row = rows[r];
      if (row.bad) continue;
      if (c.isNA(r)) row.bad = _skipMissing; // NA marks the row bad only when skipping missing; otherwise the NaN is stored below
      double d = c.atd(r);
      if (_normMul != null && _normSub != null) // either none or both
        d = (d - _normSub[cid]) * _normMul[cid];
      row.numVals[numStart + cid] = d;
    }
  }
  // response(s)
  for (int i = 1; i <= _responses; ++i) {
    Chunk rChunk = chunks[responseChunkId()];
    for (int r = 0; r < chunks[0]._len; ++r) {
      Row row = rows[r];
      if (row.bad) continue;
      row.response[i - 1] = rChunk.atd(r); // store, normalize, and NA-check the same index
      if (_normRespMul != null) {
        row.response[i - 1] = (row.response[i - 1] - _normRespSub[i - 1]) * _normRespMul[i - 1];
      }
      if (Double.isNaN(row.response[i - 1])) row.bad = true;
    }
  }
  return rows;
}
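// Hedged sketch of the bin-id layout used above (standalone arrays, not the DataInfo
// fields): categorical column i owns the id range [_catOffsets[i], _catOffsets[i+1]), and a
// missing value maps to the last slot of that range.
static int binIdFor(int[] catOffsets, int col, int level, boolean isNA) {
  if (isNA) return catOffsets[col + 1] - 1;  // missing value becomes the extra (last) factor
  int id = catOffsets[col] + level;          // assumes levels map through unchanged
  return id < catOffsets[col + 1] ? id : -1; // negative means "drop", like getCategoricalId
}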
@Override
public void map(Chunk response, Chunk weight, Chunk offset) {
  for (int i = 0; i < response._len; ++i) {
    if (response.isNA(i)) continue;    // skip missing responses
    double w = weight.atd(i);
    if (w == 0) continue;              // zero-weight rows contribute nothing
    double y = response.atd(i);
    double o = offset.atd(i);
    _num += _dist.initFNum(w, o, y);   // distribution-specific numerator term
    _denom += _dist.initFDenom(w, o);  // distribution-specific denominator term
  }
}
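// Hedged illustration (assumed shape, not the actual Distribution implementation): for a
// Gaussian distribution the two accumulators reduce to a weighted mean of the
// offset-adjusted response, i.e. init = sum(w*(y-o)) / sum(w).
static double gaussianInit(double[] y, double[] w, double[] o) {
  double num = 0, denom = 0;
  for (int i = 0; i < y.length; ++i) {
    if (Double.isNaN(y[i]) || w[i] == 0) continue; // mirror the NA / zero-weight skips above
    num += w[i] * (y[i] - o[i]);
    denom += w[i];
  }
  return num / denom;
}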
@Override
public void map(Chunk c, Chunk w) {
  for (int i = 0; i < c.len(); ++i)
    if (!c.isNA(i)) {
      double wt = w.atd(i);
      // For now: let the user give small weights, results are probably not very good
      // (same as for wtd.quantile in R)
      // if (wt > 0 && wt < 1)
      //   throw new H2OIllegalArgumentException(
      //       "Quantiles only accepts weights that are either 0 or >= 1.");
      sum += wt;
    }
}
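// Context sketch (hypothetical class, not the actual Quantile task): in an MRTask each
// map() above accumulates a per-chunk partial `sum`, and the framework merges the partials
// pairwise in reduce().
static class WeightedCount {
  double sum;
  void map(double[] col, double[] weight) {
    for (int i = 0; i < col.length; ++i)
      if (!Double.isNaN(col[i])) sum += weight[i];     // same NA skip as above
  }
  void reduce(WeightedCount other) { sum += other.sum; } // pairwise merge of partials
}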
@Override
public void map(Chunk[] chks) {
  Chunk cy = chk_resp(chks);
  for (int i = 0; i < cy._len; i++) {
    if (cy.isNA(i)) continue;
    if (isClassifier()) {
      int cls = (int) cy.at8(i);
      chk_work(chks, cls).set(i, 1L); // one-hot: vote for the observed class
    } else {
      float pred = (float) cy.atd(i);
      chk_work(chks, 0).set(i, pred); // regression: seed with the raw response
    }
  }
}
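// Standalone sketch of the same initialization on plain arrays (hypothetical shapes):
// classification writes a 1 into the working column of the observed class, regression
// copies the response into the single working column.
static void initWorkColumns(double[] resp, double[][] work, boolean classifier) {
  for (int i = 0; i < resp.length; ++i) {
    if (Double.isNaN(resp[i])) continue;          // NA rows stay untouched, as above
    if (classifier) work[(int) resp[i]][i] = 1.0; // one-hot vote for the true class
    else work[0][i] = resp[i];                    // regression: raw target
  }
}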
public final Row extractDenseRow(Chunk[] chunks, int rid, Row row) {
  row.bad = false;
  row.rid = rid + chunks[0].start();
  if (_weights) row.weight = chunks[weightChunkId()].atd(rid);
  if (row.weight == 0) return row;
  if (_skipMissing)
    for (Chunk c : chunks)
      if (c.isNA(rid)) {
        row.bad = true;
        return row;
      }
  int nbins = 0;
  for (int i = 0; i < _cats; ++i) {
    if (chunks[i].isNA(rid)) {
      if (_imputeMissing) {
        int c = getCategoricalId(i, _catModes[i]);
        if (c >= 0) row.binIds[nbins++] = c;
      } else // TODO: What if missingBucket = false?
        row.binIds[nbins++] = _catOffsets[i + 1] - 1; // missing value turns into extra (last) factor
    } else {
      int c = getCategoricalId(i, (int) chunks[i].at8(rid));
      if (c >= 0) row.binIds[nbins++] = c;
    }
  }
  row.nBins = nbins;
  final int n = _nums;
  for (int i = 0; i < n; ++i) {
    double d = chunks[_cats + i].atd(rid); // can be NA if skipMissing() == false
    if (_imputeMissing && Double.isNaN(d)) d = _numMeans[i];
    if (_normMul != null && _normSub != null) d = (d - _normSub[i]) * _normMul[i];
    row.numVals[i] = d;
  }
  for (int i = 0; i < _responses; ++i) {
    row.response[i] = chunks[responseChunkId()].atd(rid);
    if (_normRespMul != null)
      row.response[i] = (row.response[i] - _normRespSub[i]) * _normRespMul[i];
    if (Double.isNaN(row.response[i])) {
      row.bad = true;
      return row;
    }
  }
  if (_offset) row.offset = chunks[offsetChunkId()].atd(rid);
  return row;
}
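// Hedged usage sketch (not part of the source): callers typically reuse one Row buffer per
// chunk to avoid per-row allocation. Constructor arguments mirror the dense extractor
// above; `consume` is a hypothetical callback.
private void scoreChunk(Chunk[] chunks, java.util.function.Consumer<Row> consume) {
  Row row = new Row(false, _nums, _cats, _responses, 0);
  for (int rid = 0; rid < chunks[0]._len; ++rid) {
    extractDenseRow(chunks, rid, row);
    if (row.bad || row.weight == 0) continue; // skip unusable rows
    consume.accept(row);                      // row is overwritten on the next iteration
  }
}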
@Override
public void map(Chunk cs) {
  int idx = _chunkOffset + cs.cidx();
  Key ckey = Vec.chunkKey(_v._key, idx);
  if (_cmap != null) {
    assert !cs.hasFloat() : "Input chunk (" + cs.getClass() + ") has float, but is expected to be categorical";
    NewChunk nc = new NewChunk(_v, idx);
    // loop over rows and update ints for new domain mapping according to vecs[c].domain()
    for (int r = 0; r < cs._len; ++r) {
      if (cs.isNA(r)) nc.addNA();
      else nc.addNum(_cmap[(int) cs.at8(r)], 0);
    }
    nc.close(_fs);
  } else {
    DKV.put(ckey, cs.deepCopy(), _fs, true);
  }
}
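// Hedged sketch of how a remap table like _cmap could be built (names assumed, not the
// actual H2O helper): index every level of the target domain, then translate each level of
// the chunk's old domain.
static int[] buildCmap(String[] oldDomain, String[] newDomain) {
  java.util.Map<String, Integer> index = new java.util.HashMap<>();
  for (int i = 0; i < newDomain.length; ++i) index.put(newDomain[i], i);
  int[] cmap = new int[oldDomain.length];
  for (int i = 0; i < oldDomain.length; ++i)
    cmap[i] = index.get(oldDomain[i]); // assumes every old level exists in the new domain
  return cmap;
}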
@Override
public void map(Chunk[] chks) {
  final Chunk y = importance ? chk_resp(chks) : null;                 // response
  final double[] rpred = importance ? new double[1 + _nclass] : null; // row prediction
  final double[] rowdata = importance ? new double[_ncols] : null;    // pre-allocated row data
  final Chunk oobt = chk_oobt(chks); // out-of-bag row counter over all trees
  // Iterate over all rows
  for (int row = 0; row < oobt._len; row++) {
    final boolean wasOOBRow = ScoreBuildHistogram.isOOBRow((int) chk_nids(chks, 0).at8(row));
    // For all trees (i.e., k classes)
    for (int k = 0; k < _nclass; k++) {
      final DTree tree = _trees[k];
      if (tree == null) continue;           // empty class is ignored
      final Chunk nids = chk_nids(chks, k); // node-ids for this tree/class
      int nid = (int) nids.at8(row);        // get node to decide from
      // Update only out-of-bag rows, but track the on-the-fly prediction for the row
      if (wasOOBRow) {
        final Chunk ct = chk_tree(chks, k); // k-tree working column holding votes for given row
        nid = ScoreBuildHistogram.oob2Nid(nid);
        if (tree.node(nid) instanceof UndecidedNode) // if we bottomed out the tree
          nid = tree.node(nid).pid();                // then take parent's decision
        int leafnid;
        if (tree.root() instanceof LeafNode) {
          leafnid = 0;
        } else {
          DecidedNode dn = tree.decided(nid);        // must have a decision point
          if (dn._split.col() == -1)                 // unable to decide?
            dn = tree.decided(tree.node(nid).pid()); // then take parent's decision
          leafnid = dn.ns(chks, row);                // decide down to a leaf node
        }
        // Setup Tree(i) - on-the-fly prediction of the i-th tree for the row-th row:
        //  - for classification: cumulative number of votes for this row
        //  - for regression: cumulative sum of the predictions of each tree - has to be
        //    normalized by the number of trees
        double prediction = ((LeafNode) tree.node(leafnid)).pred(); // prediction for this k-class and this row
        if (importance) rpred[1 + k] = (float) prediction; // for both regression and classification
        ct.set(row, (float) (ct.atd(row) + prediction));
      }
      // reset the helper column for this row and this k-class
      nids.set(row, 0);
    } /* end of k-trees iteration */
    // For this tree this row is out-of-bag - i.e., a tree voted for this row
    if (wasOOBRow) oobt.set(row, oobt.atd(row) + 1); // track number of trees
    if (importance) {
      if (wasOOBRow && !y.isNA(row)) {
        if (isClassifier()) {
          int treePred = getPrediction(rpred, data_row(chks, row, rowdata), _threshold);
          int actuPred = (int) y.at8(row);
          if (treePred == actuPred) rightVotes++; // no miss!
        } else { // regression
          double treePred = rpred[1];
          double actuPred = y.atd(row);
          sse += (actuPred - treePred) * (actuPred - treePred);
        }
        allRows++;
      }
    }
  }
}
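// Hedged follow-up sketch (formulas assumed, not from the source): once all chunks are
// reduced, the accumulators above yield the out-of-bag error estimate.
private double oobError(boolean classifier) {
  if (allRows == 0) return Double.NaN;      // no out-of-bag rows were scored
  return classifier
      ? 1.0 - (double) rightVotes / allRows // OOB misclassification rate
      : sse / allRows;                      // OOB mean squared error
}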
@Override
public void map(Chunk[] chks) {
  Chunk ys = chk_resp(chks);
  for (int row = 0; row < ys._len; row++)
    if (ys.isNA(row))
      for (int t = 0; t < _nclass; t++)
        chk_nids(chks, t).set(row, -1); // mark NA-response rows as unusable in every tree
}
/**
 * Extract (sparse) rows from the given chunks. Note: 0 remains 0 - _normSub of DataInfo is
 * not used (the mean shift during standardization is not reverted) - UNLESS an offset is
 * specified (for GLM only). Essentially turns the dataset 90 degrees.
 *
 * @param chunks - chunks of the dataset
 * @param offset - adjustment for 0s if running with on-the-fly standardization (i.e. zeros
 *     are not really zeros because of centering)
 * @return array of sparse rows
 */
public final Row[] extractSparseRows(Chunk[] chunks, double offset) {
  Row[] rows = new Row[chunks[0]._len];
  for (int i = 0; i < rows.length; ++i) {
    rows[i] = new Row(true, Math.min(_nums, 16), _cats, _responses, offset);
    rows[i].rid = chunks[0].start() + i;
    if (_offset) {
      rows[i].offset = chunks[offsetChunkId()].atd(i);
      if (Double.isNaN(rows[i].offset)) rows[i].bad = true;
    }
    if (_weights) {
      rows[i].weight = chunks[weightChunkId()].atd(i);
      if (Double.isNaN(rows[i].weight)) rows[i].bad = true;
    }
  }
  // categoricals
  for (int i = 0; i < _cats; ++i) {
    for (int r = 0; r < chunks[0]._len; ++r) {
      Row row = rows[r];
      if (row.bad) continue;
      if (chunks[i].isNA(r)) {
        if (_skipMissing) {
          row.bad = true;
        } else
          row.binIds[row.nBins++] = _catOffsets[i + 1] - 1; // missing value turns into extra (last) factor
      } else {
        int c = getCategoricalId(i, (int) chunks[i].at8(r));
        if (c >= 0) row.binIds[row.nBins++] = c;
      }
    }
  }
  int numStart = numStart();
  // generic numbers
  for (int cid = 0; cid < _nums; ++cid) {
    Chunk c = chunks[_cats + cid];
    int oldRow = -1;
    for (int r = c.nextNZ(-1); r < c._len; r = c.nextNZ(r)) {
      if (c.atd(r) == 0) continue;
      assert r > oldRow;
      oldRow = r;
      Row row = rows[r];
      if (row.bad) continue;
      if (c.isNA(r)) row.bad = _skipMissing;
      double d = c.atd(r);
      if (_normMul != null) d *= _normMul[cid]; // scale only - no mean shift, so zeros stay zeros
      row.addNum(cid + numStart, d);
    }
  }
  // response(s)
  for (int i = 1; i <= _responses; ++i) {
    Chunk rChunk = chunks[responseChunkId()];
    for (int r = 0; r < chunks[0]._len; ++r) {
      Row row = rows[r];
      if (row.bad) continue;
      row.response[i - 1] = rChunk.atd(r); // store, normalize, and NA-check the same index
      if (_normRespMul != null) {
        row.response[i - 1] = (row.response[i - 1] - _normRespSub[i - 1]) * _normRespMul[i - 1];
      }
      if (Double.isNaN(row.response[i - 1])) row.bad = true;
    }
  }
  return rows;
}
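// Hedged sketch of where the `offset` argument comes from in the GLM case (formula assumed,
// names hypothetical): standardization would compute (x - mu)*normMul, but the sparse path
// stores only x*normMul so zeros stay zeros; the dropped constant term can be folded into
// one scalar against the coefficient vector.
static double sparseOffset(double[] beta, double[] normSub, double[] normMul, int numStart) {
  double off = 0;
  for (int j = 0; j < normSub.length; ++j)
    off -= beta[numStart + j] * normSub[j] * normMul[j]; // -sum_j beta_j * mu_j / sigma_j
  return off;
}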
/**
 * Extract (sparse) rows from the given chunks. Note: 0 remains 0 - _normSub of DataInfo is
 * not used (the mean shift during standardization is not reverted) - UNLESS an offset is
 * specified (for GLM only). Essentially turns the dataset 90 degrees.
 *
 * @param chunks - chunks of the dataset
 * @return array of sparse rows
 */
public final Row[] extractSparseRows(Chunk[] chunks) {
  Row[] rows = new Row[chunks[0]._len];
  long startOff = chunks[0].start();
  for (int i = 0; i < rows.length; ++i) {
    // if sparse, _nums is the correct number of nonzero values! i.e., do not use numNums()
    rows[i] = new Row(true, Math.min(_nums, 16), _cats, _responses, i, startOff);
    rows[i].rid = chunks[0].start() + i;
    if (_offset) {
      rows[i].offset = chunks[offsetChunkId()].atd(i);
      if (Double.isNaN(rows[i].offset)) rows[i].bad = true;
    }
    if (_weights) {
      rows[i].weight = chunks[weightChunkId()].atd(i);
      if (Double.isNaN(rows[i].weight)) rows[i].bad = true;
    }
    if (_skipMissing) {
      int N = _cats + _nums;
      for (int c = 0; c < N; ++c)
        if (chunks[c].isNA(i)) rows[i].bad = true;
    }
  }
  // categoricals
  for (int i = 0; i < _cats; ++i) {
    for (int r = 0; r < chunks[0]._len; ++r) {
      Row row = rows[r];
      if (row.bad) continue;
      int cid = getCategoricalId(i, chunks[i].isNA(r) ? _catModes[i] : (int) chunks[i].at8(r));
      if (cid >= 0) row.binIds[row.nBins++] = cid;
    }
  }
  // generic numbers + interactions
  int interactionOffset = 0;
  for (int cid = 0; cid < _nums; ++cid) {
    Chunk c = chunks[_cats + cid];
    int oldRow = -1;
    if (c instanceof InteractionWrappedVec.InteractionWrappedChunk) {
      // For each row, only 1 value in an interaction is 'hot'; all other values are off
      // (i.e., are 0). The vec is "vertically" dense and "horizontally" sparse (i.e., every
      // row has one, and only one, value).
      for (int r = 0; r < c._len; ++r) {
        Row row = rows[r];
        if (row.bad) continue;
        if (c.isNA(r)) row.bad = _skipMissing;
        // the "virtual" offset into the hot-expanded interaction
        int cidVirtualOffset = getInteractionOffset(chunks, _cats + cid, r);
        row.addNum(_numOffsets[cid] + cidVirtualOffset, c.atd(r)); // FIXME: if this produces a "true" NA then should sub with mean?
      }
      interactionOffset += nextNumericIdx(cid);
    } else {
      for (int r = c.nextNZ(-1); r < c._len; r = c.nextNZ(r)) {
        if (c.atd(r) == 0) continue;
        assert r > oldRow;
        oldRow = r;
        Row row = rows[r];
        if (row.bad) continue;
        if (c.isNA(r)) row.bad = _skipMissing;
        double d = c.atd(r);
        if (Double.isNaN(d)) d = _numMeans[cid];
        if (_normMul != null) d *= _normMul[interactionOffset];
        row.addNum(_numOffsets[cid], d);
      }
      interactionOffset++;
    }
  }
  // response(s)
  for (int i = 1; i <= _responses; ++i) {
    int rid = responseChunkId(i - 1);
    Chunk rChunk = chunks[rid];
    for (int r = 0; r < chunks[0]._len; ++r) {
      Row row = rows[r];
      if (row.bad) continue;
      row.response[i - 1] = rChunk.atd(r);
      if (_normRespMul != null) {
        row.response[i - 1] = (row.response[i - 1] - _normRespSub[i - 1]) * _normRespMul[i - 1];
      }
      if (Double.isNaN(row.response[i - 1])) row.bad = true; // NA-check the same index we stored
    }
  }
  return rows;
}
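// Standalone sketch of the nextNZ(...) idiom above (plain arrays, names hypothetical): a
// sparse column is visited via its stored entries only, and scaling without a mean shift
// keeps the untouched rows at exactly zero.
static void scaleNonZeros(int[] nzRows, double[] nzVals, double normMul, double[] denseOut) {
  for (int k = 0; k < nzRows.length; ++k) {
    if (nzVals[k] == 0) continue;              // explicitly stored zeros are still skipped
    denseOut[nzRows[k]] = nzVals[k] * normMul; // scale only; all other rows remain 0
  }
}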