/**
 * Extract (dense) rows from given chunks, one Vec at a time - should be slightly faster than
 * per-row extraction.
 *
 * @param chunks - chunks of the dataset
 * @return array of dense rows
 */
public final Row[] extractDenseRowsVertical(Chunk[] chunks) {
  Row[] rows = new Row[chunks[0]._len];
  for (int i = 0; i < rows.length; ++i) {
    rows[i] = new Row(false, _nums, _cats, _responses, 0);
    rows[i].rid = chunks[0].start() + i;
    if (_offset) {
      rows[i].offset = chunks[offsetChunkId()].atd(i);
      if (Double.isNaN(rows[i].offset)) rows[i].bad = true;
    }
    if (_weights) {
      rows[i].weight = chunks[weightChunkId()].atd(i);
      if (Double.isNaN(rows[i].weight)) rows[i].bad = true;
    }
  }
  for (int i = 0; i < _cats; ++i) {
    for (int r = 0; r < chunks[0]._len; ++r) {
      Row row = rows[r];
      if (row.bad) continue;
      if (chunks[i].isNA(r)) {
        if (_skipMissing) {
          row.bad = true;
        } else {
          row.binIds[row.nBins++] = _catOffsets[i + 1] - 1; // missing value turns into extra (last) factor
        }
      } else {
        int c = getCategoricalId(i, (int) chunks[i].at8(r));
        if (c >= 0) row.binIds[row.nBins++] = c;
      }
    }
  }
  int numStart = numStart();
  // generic numbers
  for (int cid = 0; cid < _nums; ++cid) {
    Chunk c = chunks[_cats + cid];
    for (int r = 0; r < c._len; ++r) {
      Row row = rows[r];
      if (row.bad) continue;
      if (c.isNA(r)) row.bad = _skipMissing;
      double d = c.atd(r);
      if (_normMul != null && _normSub != null) // either none or both
        d = (d - _normSub[cid]) * _normMul[cid];
      row.numVals[numStart + cid] = d;
    }
  }
  // response(s)
  for (int i = 1; i <= _responses; ++i) {
    Chunk rChunk = chunks[responseChunkId()];
    for (int r = 0; r < chunks[0]._len; ++r) {
      Row row = rows[r];
      if (row.bad) continue;
      row.response[row.response.length - i] = rChunk.atd(r);
      if (_normRespMul != null) {
        row.response[i - 1] = (row.response[i - 1] - _normRespSub[i - 1]) * _normRespMul[i - 1];
      }
      if (Double.isNaN(row.response[row.response.length - i])) row.bad = true;
    }
  }
  return rows;
}
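/*
 * Illustrative sketch (not part of the original code): the numeric branch above standardizes
 * each value as d' = (d - _normSub[cid]) * _normMul[cid], i.e. a column-wise shift-and-scale
 * such as (x - mean) / sigma. The stand-alone helper below shows that transform in isolation;
 * the method and parameter names are made up for the example.
 */
static double[] standardizeRow(double[] raw, double[] normSub, double[] normMul) {
  double[] out = new double[raw.length];
  for (int i = 0; i < raw.length; i++)
    out[i] = (raw[i] - normSub[i]) * normMul[i]; // same form as the per-column loop above
  return out;
}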
@Override
public void map(Chunk strata, Chunk newW) {
  for (int i = 0; i < strata._len; ++i) {
    // Log.info("NID:" + ((int) strata.at8(i)));
    if ((int) strata.at8(i) != stratumToKeep) newW.set(i, 0);
  }
}
@Override
public void map(Chunk[] cs) {
  int ncoly = cs.length;
  _ysum = new double[ncoly];
  double[] yvals = new double[ncoly];
  double yval;
  boolean add;
  int len = cs[0]._len;
  for (int row = 0; row < len; row++) {
    add = true;
    Arrays.fill(yvals, 0);
    for (int y = 0; y < ncoly; y++) {
      final Chunk cy = cs[y];
      yval = cy.atd(row);
      // if any yval along a row is NA, discard the entire row
      if (Double.isNaN(yval)) {
        _NACount++;
        add = false;
        break;
      }
      yvals[y] = yval;
    }
    if (add) {
      ArrayUtils.add(_ysum, yvals);
    }
  }
}
@Override
public void map(Chunk nids, Chunk ys) {
  Random rand = _tree.rngForChunk(nids.cidx());
  for (int row = 0; row < nids._len; row++)
    if (rand.nextFloat() >= _rate || Double.isNaN(ys.atd(row))) {
      nids.set(row, ScoreBuildHistogram.OUT_OF_BAG); // Flag row as being ignored by sampling
    }
}
@Override
public void map(Chunk c, Chunk w) {
  for (int i = 0; i < c.len(); ++i)
    if (!c.isNA(i)) {
      double wt = w.atd(i);
      // For now: let the user give small weights, results are probably not very good
      // (same as for wtd.quantile in R)
      // if (wt > 0 && wt < 1)
      //   throw new H2OIllegalArgumentException("Quantiles only accepts weights that are either 0 or >= 1.");
      sum += wt;
    }
}
@Override
public void map(Chunk response, Chunk weight, Chunk offset) {
  for (int i = 0; i < response._len; ++i) {
    if (response.isNA(i)) continue;
    double w = weight.atd(i);
    if (w == 0) continue;
    double y = response.atd(i);
    double o = offset.atd(i);
    _num += _dist.initFNum(w, o, y);
    _denom += _dist.initFDenom(w, o);
  }
}
@Override
public void map(Chunk[] cs) {
  int ncolx = _xmeans.length;
  int ncoly = _ymeans.length;
  double[] xvals = new double[ncolx];
  double[] yvals = new double[ncoly];
  _covs = new double[ncoly][ncolx];
  double[] _covs_y;
  double xval, yval, ymean;
  boolean add;
  int len = cs[0]._len;
  for (int row = 0; row < len; row++) {
    add = true;
    // reset existing arrays to 0 rather than initializing new ones to save on garbage collection
    Arrays.fill(xvals, 0);
    Arrays.fill(yvals, 0);
    for (int y = 0; y < ncoly; y++) {
      final Chunk cy = cs[y];
      yval = cy.atd(row);
      // if any yval along a row is NA, discard the entire row
      if (Double.isNaN(yval)) {
        add = false;
        break;
      }
      yvals[y] = yval;
    }
    if (add) {
      for (int x = 0; x < ncolx; x++) {
        final Chunk cx = cs[x + ncoly];
        xval = cx.atd(row);
        // if any xval along a row is NA, discard the entire row
        if (Double.isNaN(xval)) {
          add = false;
          break;
        }
        xvals[x] = xval;
      }
    }
    // add is true iff row has been traversed and found no NAs among yvals and xvals
    if (add) {
      for (int y = 0; y < ncoly; y++) {
        _covs_y = _covs[y];
        yval = yvals[y];
        ymean = _ymeans[y];
        for (int x = 0; x < ncolx; x++)
          _covs_y[x] += (xvals[x] - _xmeans[x]) * (yval - ymean);
      }
    }
  }
}
@Override
public void map(Chunk[] chks) {
  Chunk cy = chk_resp(chks);
  for (int i = 0; i < cy._len; i++) {
    if (cy.isNA(i)) continue;
    if (isClassifier()) {
      int cls = (int) cy.at8(i);
      chk_work(chks, cls).set(i, 1L);
    } else {
      float pred = (float) cy.atd(i);
      chk_work(chks, 0).set(i, pred);
    }
  }
}
@Override
public void map(Chunk[] cs) {
  _xsum = new double[_ncolx];
  _ysum = new double[_ncoly];
  double[] xvals = new double[_ncolx];
  double[] yvals = new double[_ncoly];
  double xval, yval;
  boolean add;
  int len = cs[0]._len;
  for (int row = 0; row < len; row++) {
    add = true;
    // reset existing arrays to 0 rather than initializing new ones to save on garbage collection
    Arrays.fill(xvals, 0);
    Arrays.fill(yvals, 0);
    for (int y = 0; y < _ncoly; y++) {
      final Chunk cy = cs[y];
      yval = cy.atd(row);
      // if any yval along a row is NA, discard the entire row
      if (Double.isNaN(yval)) {
        _NACount++;
        add = false;
        break;
      }
      yvals[y] = yval;
    }
    if (add) {
      for (int x = 0; x < _ncolx; x++) {
        final Chunk cx = cs[x + _ncoly];
        xval = cx.atd(row);
        // if any xval along a row is NA, discard the entire row
        if (Double.isNaN(xval)) {
          _NACount++;
          add = false;
          break;
        }
        xvals[x] = xval;
      }
    }
    // add is true iff row has been traversed and found no NAs among yvals and xvals
    if (add) {
      ArrayUtils.add(_xsum, xvals);
      ArrayUtils.add(_ysum, yvals);
    }
  }
}
@Override
public void map(Chunk[] cs) {
  final int ncolsx = cs.length - 1;
  final Chunk cy = cs[0];
  final int len = cy._len;
  _covs = new double[ncolsx];
  double sum;
  for (int x = 0; x < ncolsx; x++) {
    sum = 0;
    final Chunk cx = cs[x + 1];
    final double xmean = _xmeans[x];
    for (int row = 0; row < len; row++)
      sum += (cx.atd(row) - xmean) * (cy.atd(row) - _ymean);
    _covs[x] = sum;
  }
}
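/*
 * Illustrative sketch (not part of the original code): the per-chunk loop above accumulates the
 * covariance numerator sum((x_i - xmean) * (y_i - ymean)); dividing the reduced total by (n - 1)
 * afterwards gives the sample covariance. Plain-array version with made-up names:
 */
static double sampleCovariance(double[] x, double[] y, double xmean, double ymean) {
  double sum = 0;
  for (int i = 0; i < x.length; i++)
    sum += (x[i] - xmean) * (y[i] - ymean); // same accumulation as the chunk loop above
  return sum / (x.length - 1);              // sample covariance
}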
public final Row extractDenseRow(Chunk[] chunks, int rid, Row row) {
  row.bad = false;
  row.rid = rid + chunks[0].start();
  if (_weights) row.weight = chunks[weightChunkId()].atd(rid);
  if (row.weight == 0) return row;
  if (_skipMissing)
    for (Chunk c : chunks)
      if (c.isNA(rid)) {
        row.bad = true;
        return row;
      }
  int nbins = 0;
  for (int i = 0; i < _cats; ++i) {
    if (chunks[i].isNA(rid)) {
      if (_imputeMissing) {
        int c = getCategoricalId(i, _catModes[i]);
        if (c >= 0) row.binIds[nbins++] = c;
      } else {
        // TODO: What if missingBucket = false?
        row.binIds[nbins++] = _catOffsets[i + 1] - 1; // missing value turns into extra (last) factor
      }
    } else {
      int c = getCategoricalId(i, (int) chunks[i].at8(rid));
      if (c >= 0) row.binIds[nbins++] = c;
    }
  }
  row.nBins = nbins;
  final int n = _nums;
  for (int i = 0; i < n; ++i) {
    double d = chunks[_cats + i].atd(rid); // can be NA if skipMissing() == false
    if (_imputeMissing && Double.isNaN(d)) d = _numMeans[i];
    if (_normMul != null && _normSub != null) d = (d - _normSub[i]) * _normMul[i];
    row.numVals[i] = d;
  }
  for (int i = 0; i < _responses; ++i) {
    row.response[i] = chunks[responseChunkId()].atd(rid);
    if (_normRespMul != null)
      row.response[i] = (row.response[i] - _normRespSub[i]) * _normRespMul[i];
    if (Double.isNaN(row.response[i])) {
      row.bad = true;
      return row;
    }
  }
  if (_offset) row.offset = chunks[offsetChunkId()].atd(rid);
  return row;
}
/**
 * Global redistribution of a Frame (balancing of chunks), done by calling process (all-to-one +
 * one-to-all).
 *
 * @param fr Input frame
 * @param seed RNG seed
 * @param shuffle whether to shuffle the data globally
 * @return Shuffled frame
 */
public static Frame shuffleAndBalance(
    final Frame fr, int splits, long seed, final boolean local, final boolean shuffle) {
  if ((fr.vecs()[0].nChunks() < splits || shuffle) && fr.numRows() > splits) {
    Vec[] vecs = fr.vecs().clone();
    Log.info("Load balancing dataset, splitting it into up to " + splits + " chunks.");
    long[] idx = null;
    if (shuffle) {
      idx = new long[splits];
      for (int r = 0; r < idx.length; ++r) idx[r] = r;
      Utils.shuffleArray(idx, seed);
    }
    Key[] keys = new Vec.VectorGroup().addVecs(vecs.length);
    final long rows_per_new_chunk = (long) (Math.ceil((double) fr.numRows() / splits));
    // loop over cols (same indexing for each column)
    Futures fs = new Futures();
    for (int col = 0; col < vecs.length; col++) {
      AppendableVec vec = new AppendableVec(keys[col]);
      // create outgoing chunks for this col
      NewChunk[] outCkg = new NewChunk[splits];
      for (int i = 0; i < splits; ++i) outCkg[i] = new NewChunk(vec, i);
      // loop over all incoming chunks
      for (int ckg = 0; ckg < vecs[col].nChunks(); ckg++) {
        final Chunk inCkg = vecs[col].chunkForChunkIdx(ckg);
        // loop over local rows of incoming chunks (fast path)
        for (int row = 0; row < inCkg._len; ++row) {
          int outCkgIdx = (int) ((inCkg._start + row) / rows_per_new_chunk); // destination chunk idx
          if (shuffle) outCkgIdx = (int) (idx[outCkgIdx]); // shuffle: choose a different output chunk
          assert (outCkgIdx >= 0 && outCkgIdx < splits);
          outCkg[outCkgIdx].addNum(inCkg.at0(row));
        }
      }
      for (int i = 0; i < outCkg.length; ++i) outCkg[i].close(i, fs);
      Vec t = vec.close(fs);
      t._domain = vecs[col]._domain;
      vecs[col] = t;
    }
    fs.blockForPending();
    Log.info("Load balancing done.");
    return new Frame(fr.names(), vecs);
  }
  return fr;
}
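/*
 * Illustrative sketch (not part of the original code): the destination-chunk computation above is
 * a simple block partition of the global row index (row / rows_per_new_chunk), optionally remapped
 * through a shuffled permutation so that whole output chunks land in random order. Method and
 * parameter names are made up for the example.
 */
static int destinationChunk(long globalRow, long rowsPerNewChunk, long[] shuffledIdx) {
  int outIdx = (int) (globalRow / rowsPerNewChunk);             // block partition
  if (shuffledIdx != null) outIdx = (int) shuffledIdx[outIdx];  // optional global shuffle
  return outIdx;
}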
@Override
public void map(Chunk[] chks) {
  Chunk ys = chk_resp(chks);
  if (_nclass > 1) { // Classification
    float[] fs = new float[_nclass + 1];
    for (int row = 0; row < ys._len; row++) {
      float sum = score1(chks, fs, row);
      if (Float.isInfinite(sum)) // Overflow (happens for constant responses)
        for (int k = 0; k < _nclass; k++)
          chk_work(chks, k).set0(row, Float.isInfinite(fs[k + 1]) ? 1.0f : 0.0f);
      else
        for (int k = 0; k < _nclass; k++) // Save as a probability distribution
          chk_work(chks, k).set0(row, fs[k + 1] / sum);
    }
  } else { // Regression
    Chunk tr = chk_tree(chks, 0); // Prior tree sums
    Chunk wk = chk_work(chks, 0); // Predictions
    for (int row = 0; row < ys._len; row++) wk.set0(row, (float) tr.at0(row));
  }
}
@Override
public void map(Chunk ca, Chunk cp) {
  _cms = new hex.ConfusionMatrix[_thresh.length];
  for (int i = 0; i < _cms.length; ++i) _cms[i] = new hex.ConfusionMatrix(2);
  final int len = Math.min(ca._len, cp._len);
  for (int i = 0; i < len; i++) {
    if (ca.isNA0(i)) continue;
    // throw new UnsupportedOperationException("Actual class label cannot be a missing value!");
    final int a = (int) ca.at80(i); // would be a 0 if double was NaN
    assert (a == 0 || a == 1) : "Invalid values in vactual: must be binary (0 or 1).";
    if (cp.isNA0(i)) {
      // Log.warn("Skipping predicted NaN."); // some models predict NaN!
      continue;
    }
    final double pr = cp.at0(i);
    for (int t = 0; t < _cms.length; t++) {
      final int p = pr >= _thresh[t] ? 1 : 0;
      _cms[t].add(a, p);
    }
  }
}
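/*
 * Illustrative sketch (not part of the original code): for each threshold t, a prediction p is
 * treated as class 1 iff p >= t, and the (actual, predicted) pair is counted in a 2x2 confusion
 * matrix, one matrix per threshold. Stand-alone version over plain arrays, names made up.
 */
static long[][][] confusionMatrices(double[] actual, double[] predicted, double[] thresholds) {
  long[][][] cms = new long[thresholds.length][2][2];
  for (int i = 0; i < actual.length; i++) {
    if (Double.isNaN(actual[i]) || Double.isNaN(predicted[i])) continue; // skip missing rows
    int a = (int) actual[i]; // expected to be 0 or 1
    for (int t = 0; t < thresholds.length; t++) {
      int p = predicted[i] >= thresholds[t] ? 1 : 0;
      cms[t][a][p]++;
    }
  }
  return cms;
}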
@Override
public void map(Chunk[] chks) {
  Chunk ys = chk_resp(chks);
  if (_nclass > 1) { // Classification
    for (int row = 0; row < ys._len; row++) {
      if (ys.isNA0(row)) continue;
      int y = (int) ys.at80(row); // zero-based response variable
      // Actual is '1' for class 'y' and '0' for all other classes
      for (int k = 0; k < _nclass; k++) {
        if (_distribution[k] != 0) {
          Chunk wk = chk_work(chks, k);
          wk.set0(row, (y == k ? 1f : 0f) - (float) wk.at0(row));
        }
      }
    }
  } else { // Regression
    Chunk wk = chk_work(chks, 0); // Prediction ==> Residuals
    for (int row = 0; row < ys._len; row++)
      wk.set0(row, (float) (ys.at0(row) - wk.at0(row)));
  }
}
@Override
public void map(Chunk chk, Chunk weight) {
  _bins = new double[_nbins];
  _mins = new double[_nbins];
  _maxs = new double[_nbins];
  Arrays.fill(_mins, Double.MAX_VALUE);
  Arrays.fill(_maxs, -Double.MAX_VALUE);
  double d;
  for (int row = 0; row < chk._len; row++) {
    double w = weight.atd(row);
    if (w == 0) continue;
    if (!Double.isNaN(d = chk.atd(row))) { // na.rm=true
      double idx = (d - _lb) / _step;
      if (!(0.0 <= idx && idx < _bins.length)) continue;
      int i = (int) idx;
      if (_bins[i] == 0) _mins[i] = _maxs[i] = d; // Capture unique value
      else {
        if (d < _mins[i]) _mins[i] = d;
        if (d > _maxs[i]) _maxs[i] = d;
      }
      _bins[i] += w; // Bump row counts by row weight
    }
  }
}
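/*
 * Illustrative sketch (not part of the original code): each value d is mapped to bin
 * (d - lb) / step, the bin count is bumped by the row weight, and per-bin min/max record the
 * range of values that actually fell into that bin. Stand-alone helper with made-up names.
 */
static void binWeighted(double d, double w, double lb, double step,
                        double[] bins, double[] mins, double[] maxs) {
  double idx = (d - lb) / step;
  if (!(0.0 <= idx && idx < bins.length)) return; // value lies outside the histogram range
  int i = (int) idx;
  if (bins[i] == 0) {
    mins[i] = maxs[i] = d; // first value seen in this bin
  } else {
    if (d < mins[i]) mins[i] = d;
    if (d > maxs[i]) maxs[i] = d;
  }
  bins[i] += w; // weighted count
}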
@Override
public void map(Chunk cs) {
  int idx = _chunkOffset + cs.cidx();
  Key ckey = Vec.chunkKey(_v._key, idx);
  if (_cmap != null) {
    assert !cs.hasFloat()
        : "Input chunk (" + cs.getClass() + ") has float, but is expected to be categorical";
    NewChunk nc = new NewChunk(_v, idx);
    // loop over rows and update ints for new domain mapping according to vecs[c].domain()
    for (int r = 0; r < cs._len; ++r) {
      if (cs.isNA(r)) nc.addNA();
      else nc.addNum(_cmap[(int) cs.at8(r)], 0);
    }
    nc.close(_fs);
  } else {
    DKV.put(ckey, cs.deepCopy(), _fs, true);
  }
}
@Override
public void map(Chunk[] chks) {
  _gss = new double[_nclass][];
  _rss = new double[_nclass][];
  // For all tree/klasses
  for (int k = 0; k < _nclass; k++) {
    final DTree tree = _trees[k];
    final int leaf = _leafs[k];
    if (tree == null) continue; // Empty class is ignored
    // A leaf-biased array of all active Tree leaves.
    final double[] gs = _gss[k] = new double[tree._len - leaf];
    final double[] rs = _rss[k] = new double[tree._len - leaf];
    final Chunk nids = chk_nids(chks, k); // Node-ids for this tree/class
    final Chunk ress = chk_work(chks, k); // Residuals for this tree/class
    // If we have all constant responses, then we do not split even the
    // root and the residuals should be zero.
    if (tree.root() instanceof LeafNode) continue;
    for (int row = 0; row < nids._len; row++) { // For all rows
      int nid = (int) nids.at80(row); // Get Node to decide from
      if (nid < 0) continue; // Missing response
      if (tree.node(nid) instanceof UndecidedNode) // If we bottomed out the tree
        nid = tree.node(nid)._pid; // Then take parent's decision
      DecidedNode dn = tree.decided(nid); // Must have a decision point
      if (dn._split._col == -1) // Unable to decide?
        dn = tree.decided(nid = dn._pid); // Then take parent's decision
      int leafnid = dn.ns(chks, row); // Decide down to a leafnode
      assert leaf <= leafnid && leafnid < tree._len;
      assert tree.node(leafnid) instanceof LeafNode;
      // Note: I can tell which leaf/region I end up in, but I do not care for
      // the prediction presented by the tree. For GBM, we compute the
      // sum-of-residuals (and sum/abs/mult residuals) for all rows in the
      // leaf, and get our prediction from that.
      nids.set0(row, leafnid);
      assert !ress.isNA0(row);
      double res = ress.at0(row);
      double ares = Math.abs(res);
      gs[leafnid - leaf] += _nclass > 1 ? ares * (1 - ares) : 1;
      rs[leafnid - leaf] += res;
    }
  }
}
@Override
public void map(Chunk chk) {
  map(chk, new C0DChunk(1, chk.len()));
}
/**
 * Extracts the values, applies regularization to numerics, adds appropriate offsets to
 * categoricals, and adapts response according to the CaseMode/CaseValue if set.
 */
@Override
public final void map(Chunk[] chunks, NewChunk[] outputs) {
  if (_job != null && _job.self() != null && !Job.isRunning(_job.self()))
    throw new JobCancelledException();
  final int nrows = chunks[0]._len;
  final long offset = chunks[0]._start;
  chunkInit();
  double[] nums = MemoryManager.malloc8d(_dinfo._nums);
  int[] cats = MemoryManager.malloc4(_dinfo._cats);
  double[] response = MemoryManager.malloc8d(_dinfo._responses);
  int start = 0;
  int end = nrows;
  boolean contiguous = false;
  Random skip_rng = null; // random generator for skipping rows
  if (_useFraction < 1.0) {
    skip_rng = water.util.Utils.getDeterRNG(new Random().nextLong());
    if (contiguous) {
      final int howmany = (int) Math.ceil(_useFraction * nrows);
      if (howmany > 0) {
        start = skip_rng.nextInt(nrows - howmany);
        end = start + howmany;
      }
      assert (start < nrows);
      assert (end <= nrows);
    }
  }
  long[] shuf_map = null;
  if (_shuffle) {
    shuf_map = new long[end - start];
    for (int i = 0; i < shuf_map.length; ++i) shuf_map[i] = start + i;
    Utils.shuffleArray(shuf_map, new Random().nextLong());
  }
  OUTER:
  for (int rr = start; rr < end; ++rr) {
    final int r = shuf_map != null ? (int) shuf_map[rr - start] : rr;
    if ((_dinfo._nfolds > 0 && (r % _dinfo._nfolds) == _dinfo._foldId)
        || (skip_rng != null && skip_rng.nextFloat() > _useFraction)) continue;
    for (Chunk c : chunks) if (c.isNA0(r)) continue OUTER; // skip rows with NAs!
    int i = 0, ncats = 0;
    for (; i < _dinfo._cats; ++i) {
      int c = (int) chunks[i].at80(r);
      if (c != 0) cats[ncats++] = c + _dinfo._catOffsets[i] - 1;
    }
    final int n = chunks.length - _dinfo._responses;
    for (; i < n; ++i) {
      double d = chunks[i].at0(r);
      if (_dinfo._normMul != null)
        d = (d - _dinfo._normSub[i - _dinfo._cats]) * _dinfo._normMul[i - _dinfo._cats];
      nums[i - _dinfo._cats] = d;
    }
    for (i = 0; i < _dinfo._responses; ++i) {
      response[i] = chunks[chunks.length - _dinfo._responses + i].at0(r);
      if (_dinfo._normRespMul != null)
        response[i] = (response[i] - _dinfo._normRespSub[i]) * _dinfo._normRespMul[i];
    }
    if (outputs != null && outputs.length > 0)
      processRow(offset + r, nums, ncats, cats, response, outputs);
    else processRow(offset + r, nums, ncats, cats, response);
  }
  chunkDone();
}
@Override
public void map(Chunk[] chks) {
  Chunk ys = chk_resp(chks);
  for (int row = 0; row < ys._len; row++)
    if (ys.isNA0(row))
      for (int t = 0; t < _nclass; t++) chk_nids(chks, t).set0(row, -1);
}
/**
 * Extract (sparse) rows from given chunks. Note: 0 remains 0 - _normSub of DataInfo isn't used
 * (the mean shift during standardization is not reverted) - UNLESS an offset is specified (for
 * GLM only). Essentially turns the dataset 90 degrees.
 *
 * @param chunks - chunks of the dataset
 * @return array of sparse rows
 */
public final Row[] extractSparseRows(Chunk[] chunks) {
  Row[] rows = new Row[chunks[0]._len];
  long startOff = chunks[0].start();
  for (int i = 0; i < rows.length; ++i) {
    // if sparse, _nums is the correct number of nonzero values! i.e., do not use numNums()
    rows[i] = new Row(true, Math.min(_nums, 16), _cats, _responses, i, startOff);
    rows[i].rid = chunks[0].start() + i;
    if (_offset) {
      rows[i].offset = chunks[offsetChunkId()].atd(i);
      if (Double.isNaN(rows[i].offset)) rows[i].bad = true;
    }
    if (_weights) {
      rows[i].weight = chunks[weightChunkId()].atd(i);
      if (Double.isNaN(rows[i].weight)) rows[i].bad = true;
    }
    if (_skipMissing) {
      int N = _cats + _nums;
      for (int c = 0; c < N; ++c) if (chunks[c].isNA(i)) rows[i].bad = true;
    }
  }
  // categoricals
  for (int i = 0; i < _cats; ++i) {
    for (int r = 0; r < chunks[0]._len; ++r) {
      Row row = rows[r];
      if (row.bad) continue;
      int cid = getCategoricalId(i, chunks[i].isNA(r) ? _catModes[i] : (int) chunks[i].at8(r));
      if (cid >= 0) row.binIds[row.nBins++] = cid;
    }
  }
  // generic numbers + interactions
  int interactionOffset = 0;
  for (int cid = 0; cid < _nums; ++cid) {
    Chunk c = chunks[_cats + cid];
    int oldRow = -1;
    if (c instanceof InteractionWrappedVec.InteractionWrappedChunk) {
      // for each row, only 1 value in an interaction is 'hot'; all other values are off (i.e., are 0)
      for (int r = 0; r < c._len; ++r) {
        // the vec is "vertically" dense and "horizontally" sparse (i.e., every row has
        // one, and only one, value)
        Row row = rows[r];
        if (row.bad) continue;
        if (c.isNA(r)) row.bad = _skipMissing;
        // the "virtual" offset into the hot-expanded interaction
        int cidVirtualOffset = getInteractionOffset(chunks, _cats + cid, r);
        // FIXME: if this produces a "true" NA then should sub with mean? with?
        row.addNum(_numOffsets[cid] + cidVirtualOffset, c.atd(r));
      }
      interactionOffset += nextNumericIdx(cid);
    } else {
      for (int r = c.nextNZ(-1); r < c._len; r = c.nextNZ(r)) {
        if (c.atd(r) == 0) continue;
        assert r > oldRow;
        oldRow = r;
        Row row = rows[r];
        if (row.bad) continue;
        if (c.isNA(r)) row.bad = _skipMissing;
        double d = c.atd(r);
        if (Double.isNaN(d)) d = _numMeans[cid];
        if (_normMul != null) d *= _normMul[interactionOffset];
        row.addNum(_numOffsets[cid], d);
      }
      interactionOffset++;
    }
  }
  // response(s)
  for (int i = 1; i <= _responses; ++i) {
    int rid = responseChunkId(i - 1);
    Chunk rChunk = chunks[rid];
    for (int r = 0; r < chunks[0]._len; ++r) {
      Row row = rows[r];
      if (row.bad) continue;
      row.response[i - 1] = rChunk.atd(r);
      if (_normRespMul != null) {
        row.response[i - 1] = (row.response[i - 1] - _normRespSub[i - 1]) * _normRespMul[i - 1];
      }
      if (Double.isNaN(row.response[row.response.length - i])) row.bad = true;
    }
  }
  return rows;
}
@Override
public void map(Chunk[] chks) {
  final Chunk y = importance ? chk_resp(chks) : null; // Response
  final double[] rpred = importance ? new double[1 + _nclass] : null; // Row prediction
  final double[] rowdata = importance ? new double[_ncols] : null; // Pre-allocated row data
  final Chunk oobt = chk_oobt(chks); // Out-of-bag rows counter over all trees
  // Iterate over all rows
  for (int row = 0; row < oobt._len; row++) {
    final boolean wasOOBRow = ScoreBuildHistogram.isOOBRow((int) chk_nids(chks, 0).at8(row));
    // For all tree (i.e., k-classes)
    for (int k = 0; k < _nclass; k++) {
      final DTree tree = _trees[k];
      if (tree == null) continue; // Empty class is ignored
      final Chunk nids = chk_nids(chks, k); // Node-ids for this tree/class
      int nid = (int) nids.at8(row); // Get Node to decide from
      // Update only out-of-bag rows
      // This is out-of-bag row - but we would like to track on-the-fly prediction for the row
      if (wasOOBRow) {
        final Chunk ct = chk_tree(chks, k); // k-tree working column holding votes for given row
        nid = ScoreBuildHistogram.oob2Nid(nid);
        if (tree.node(nid) instanceof UndecidedNode) // If we bottomed out the tree
          nid = tree.node(nid).pid(); // Then take parent's decision
        int leafnid;
        if (tree.root() instanceof LeafNode) {
          leafnid = 0;
        } else {
          DecidedNode dn = tree.decided(nid); // Must have a decision point
          if (dn._split.col() == -1) // Unable to decide?
            dn = tree.decided(tree.node(nid).pid()); // Then take parent's decision
          leafnid = dn.ns(chks, row); // Decide down to a leafnode
        }
        // Setup Tree(i) - on the fly prediction of i-tree for row-th row
        // - for classification: cumulative number of votes for this row
        // - for regression: cumulative sum of prediction of each tree - has to be
        //   normalized by number of trees
        double prediction = ((LeafNode) tree.node(leafnid)).pred(); // Prediction for this k-class and this row
        if (importance) rpred[1 + k] = (float) prediction; // for both regression and classification
        ct.set(row, (float) (ct.atd(row) + prediction));
      }
      // reset help column for this row and this k-class
      nids.set(row, 0);
    } /* end of k-trees iteration */
    // For this tree this row is out-of-bag - i.e., a tree voted for this row
    if (wasOOBRow) oobt.set(row, oobt.atd(row) + 1); // track number of trees
    if (importance) {
      if (wasOOBRow && !y.isNA(row)) {
        if (isClassifier()) {
          int treePred = getPrediction(rpred, data_row(chks, row, rowdata), _threshold);
          int actuPred = (int) y.at8(row);
          if (treePred == actuPred) rightVotes++; // No miss !
        } else { // regression
          double treePred = rpred[1];
          double actuPred = y.atd(row);
          sse += (actuPred - treePred) * (actuPred - treePred);
        }
        allRows++;
      }
    }
  }
}
@Override
public void map(Chunk ca, Chunk cp) {
  // classification
  if (_c_len > 1) {
    _cm = new long[_c_len + 1][_c_len + 1];
    // handle different lengths, but the vectors should have been rejected already
    int len = Math.min(ca._len, cp._len);
    for (int i = 0; i < len; i++) {
      int a = ca.isNA0(i) ? _c_len : (int) ca.at80(i);
      int p = cp.isNA0(i) ? _c_len : (int) cp.at80(i);
      _cm[a][p]++;
    }
    if (len < ca._len)
      for (int i = len; i < ca._len; i++) _cm[ca.isNA0(i) ? _c_len : (int) ca.at80(i)][_c_len]++;
    if (len < cp._len)
      for (int i = len; i < cp._len; i++) _cm[_c_len][cp.isNA0(i) ? _c_len : (int) cp.at80(i)]++;
  } else {
    _cm = null;
    _mse = 0;
    assert (ca._len == cp._len);
    int len = ca._len;
    for (int i = 0; i < len; i++) {
      if (ca.isNA0(i) || cp.isNA0(i)) continue; // TODO: Improve
      final double a = ca.at0(i);
      final double p = cp.at0(i);
      _mse += (p - a) * (p - a);
      _count++;
    }
  }
}
public Rows rows(Chunk[] chks) {
  int cnt = 0;
  for (Chunk c : chks) if (c.isSparse()) ++cnt;
  return rows(chks, cnt > (chks.length >> 1));
}
/**
 * Extract (sparse) rows from given chunks. Note: 0 remains 0 - _normSub of DataInfo isn't used
 * (the mean shift during standardization is not reverted) - UNLESS an offset is specified (for
 * GLM only). Essentially turns the dataset 90 degrees.
 *
 * @param chunks - chunks of the dataset
 * @param offset - adjustment for 0s if running with on-the-fly standardization (i.e. zeros are
 *     not really zeros because of centering)
 * @return array of sparse rows
 */
public final Row[] extractSparseRows(Chunk[] chunks, double offset) {
  Row[] rows = new Row[chunks[0]._len];
  for (int i = 0; i < rows.length; ++i) {
    rows[i] = new Row(true, Math.min(_nums, 16), _cats, _responses, offset);
    rows[i].rid = chunks[0].start() + i;
    if (_offset) {
      rows[i].offset = chunks[offsetChunkId()].atd(i);
      if (Double.isNaN(rows[i].offset)) rows[i].bad = true;
    }
    if (_weights) {
      rows[i].weight = chunks[weightChunkId()].atd(i);
      if (Double.isNaN(rows[i].weight)) rows[i].bad = true;
    }
  }
  // categoricals
  for (int i = 0; i < _cats; ++i) {
    for (int r = 0; r < chunks[0]._len; ++r) {
      Row row = rows[r];
      if (row.bad) continue;
      if (chunks[i].isNA(r)) {
        if (_skipMissing) {
          row.bad = true;
        } else {
          row.binIds[row.nBins++] = _catOffsets[i + 1] - 1; // missing value turns into extra (last) factor
        }
      } else {
        int c = getCategoricalId(i, (int) chunks[i].at8(r));
        if (c >= 0) row.binIds[row.nBins++] = c;
      }
    }
  }
  int numStart = numStart();
  // generic numbers
  for (int cid = 0; cid < _nums; ++cid) {
    Chunk c = chunks[_cats + cid];
    int oldRow = -1;
    for (int r = c.nextNZ(-1); r < c._len; r = c.nextNZ(r)) {
      if (c.atd(r) == 0) continue;
      assert r > oldRow;
      oldRow = r;
      Row row = rows[r];
      if (row.bad) continue;
      if (c.isNA(r)) row.bad = _skipMissing;
      double d = c.atd(r);
      if (_normMul != null) d *= _normMul[cid];
      row.addNum(cid + numStart, d);
    }
  }
  // response(s)
  for (int i = 1; i <= _responses; ++i) {
    Chunk rChunk = chunks[responseChunkId()];
    for (int r = 0; r < chunks[0]._len; ++r) {
      Row row = rows[r];
      if (row.bad) continue;
      row.response[row.response.length - i] = rChunk.atd(r);
      if (_normRespMul != null) {
        row.response[i - 1] = (row.response[i - 1] - _normRespSub[i - 1]) * _normRespMul[i - 1];
      }
      if (Double.isNaN(row.response[row.response.length - i])) row.bad = true;
    }
  }
  return rows;
}
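/*
 * Illustrative sketch (not part of the original code): a sparse row keeps only (index, value)
 * pairs for its nonzero entries, which is why the loops above walk each chunk with nextNZ and
 * skip exact zeros instead of touching every row. Minimal stand-alone version with made-up names.
 */
static final class SparseRowSketch {
  int[] idx = new int[4];       // column indices of the stored nonzeros
  double[] val = new double[4]; // corresponding values
  int n;                        // number of nonzeros stored so far
  void addNum(int column, double value) {
    if (value == 0) return; // zeros are never materialized
    if (n == idx.length) {  // grow on demand
      idx = java.util.Arrays.copyOf(idx, n * 2);
      val = java.util.Arrays.copyOf(val, n * 2);
    }
    idx[n] = column;
    val[n] = value;
    n++;
  }
  double dot(double[] dense) { // typical use: a dot product only visits the stored nonzeros
    double s = 0;
    for (int i = 0; i < n; i++) s += val[i] * dense[idx[i]];
    return s;
  }
}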
@Override
public void map(Chunk[] chks) {
  final Chunk y = importance ? chk_resp(chks) : null; // Response
  final float[] rpred = importance ? new float[1 + _nclass] : null; // Row prediction
  final double[] rowdata = importance ? new double[_ncols] : null; // Pre-allocated row data
  final Chunk oobt = chk_oobt(chks); // Out-of-bag rows counter over all trees
  // Iterate over all rows
  for (int row = 0; row < oobt._len; row++) {
    boolean wasOOBRow = false;
    // For all tree (i.e., k-classes)
    for (int k = 0; k < _nclass; k++) {
      final DTree tree = _trees[k];
      if (tree == null) continue; // Empty class is ignored
      // If we have all constant responses, then we do not split even the
      // root and the residuals should be zero.
      if (tree.root() instanceof LeafNode) continue;
      final Chunk nids = chk_nids(chks, k); // Node-ids for this tree/class
      final Chunk ct = chk_tree(chks, k); // k-tree working column holding votes for given row
      int nid = (int) nids.at80(row); // Get Node to decide from
      // Update only out-of-bag rows
      // This is out-of-bag row - but we would like to track on-the-fly prediction for the row
      if (isOOBRow(nid)) {
        // The row should be OOB for all k-trees !!!
        assert k == 0 || wasOOBRow
            : "Something is wrong: k-class trees oob row computing is broken! All k-trees should agree on oob row!";
        wasOOBRow = true;
        nid = oob2Nid(nid);
        if (tree.node(nid) instanceof UndecidedNode) // If we bottomed out the tree
          nid = tree.node(nid).pid(); // Then take parent's decision
        DecidedNode dn = tree.decided(nid); // Must have a decision point
        if (dn._split.col() == -1) // Unable to decide?
          dn = tree.decided(tree.node(nid).pid()); // Then take parent's decision
        int leafnid = dn.ns(chks, row); // Decide down to a leafnode
        // Setup Tree(i) - on the fly prediction of i-tree for row-th row
        // - for classification: cumulative number of votes for this row
        // - for regression: cumulative sum of prediction of each tree - has to be normalized
        //   by number of trees
        double prediction = ((LeafNode) tree.node(leafnid)).pred(); // Prediction for this k-class and this row
        if (importance) rpred[1 + k] = (float) prediction; // for both regression and classification
        ct.set0(row, (float) (ct.at0(row) + prediction));
        // For this tree this row is out-of-bag - i.e., a tree voted for this row
        // for regression track number of trees, for classification a boolean flag is enough
        oobt.set0(row, _nclass > 1 ? 1 : oobt.at0(row) + 1);
      }
      // reset help column for this row and this k-class
      nids.set0(row, 0);
    } /* end of k-trees iteration */
    if (importance) {
      if (wasOOBRow && !y.isNA0(row)) {
        if (classification) {
          int treePred = ModelUtils.getPrediction(rpred, data_row(chks, row, rowdata));
          int actuPred = (int) y.at80(row);
          if (treePred == actuPred) rightVotes++; // No miss !
        } else { // regression
          float treePred = rpred[1];
          float actuPred = (float) y.at0(row);
          sse += (actuPred - treePred) * (actuPred - treePred);
        }
        allRows++;
      }
    }
  }
}
@Override
public void map(Chunk c, NewChunk nc) {
  double acc = _init;
  for (int i = 0; i < c._len; ++i) nc.addNum(acc = op(acc, c.atd(i)));
  _chkCumu[c.cidx()] = acc;
}
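/*
 * Illustrative sketch (not part of the original code): the map above computes a per-chunk running
 * accumulation (e.g. a cumulative sum) and records each chunk's final value in _chkCumu; a
 * follow-up pass would add the totals of all preceding chunks to every element of a chunk.
 * Single-array version of the per-chunk scan, names made up.
 */
static double[] cumulativeSum(double[] x, double init) {
  double[] out = new double[x.length];
  double acc = init;
  for (int i = 0; i < x.length; i++) out[i] = acc = acc + x[i]; // running total, like op(acc, v)
  return out;
}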
@Override
public void map(Chunk ys) {
  _ys = new long[_nclass];
  for (int i = 0; i < ys._len; i++)
    if (!ys.isNA0(i)) _ys[(int) ys.at80(i)]++;
}