public final Row extractDenseRow(double[] vals, Row row) { row.bad = false; row.rid = 0; row.cid = 0; if (row.weight == 0) return row; if (_skipMissing) for (double d : vals) if (Double.isNaN(d)) { row.bad = true; return row; } int nbins = 0; for (int i = 0; i < _cats; ++i) { int c = getCategoricalId(i, Double.isNaN(vals[i]) ? _catModes[i] : (int) vals[i]); if (c >= 0) row.binIds[nbins++] = c; } row.nBins = nbins; final int n = _nums; int numValsIdx = 0; for (int i = 0; i < n; ++i) { if (isInteractionVec(i)) { int offset; InteractionWrappedVec iwv = ((InteractionWrappedVec) _adaptedFrame.vec(_cats + i)); int v1 = _adaptedFrame.find(iwv.v1()); int v2 = _adaptedFrame.find(iwv.v2()); if (v1 < _cats) offset = getCategoricalId(v1, Double.isNaN(vals[v1]) ? _catModes[v1] : (int) vals[v1]); else if (v2 < _cats) offset = getCategoricalId(v2, Double.isNaN(vals[v2]) ? _catModes[v1] : (int) vals[v2]); else offset = 0; row.numVals[numValsIdx + offset] = vals[_cats + i]; // essentially: vals[v1] * vals[v2]) numValsIdx += nextNumericIdx(i); } else { double d = vals[_cats + i]; // can be NA if skipMissing() == false if (Double.isNaN(d)) d = _numMeans[numValsIdx]; if (_normMul != null && _normSub != null) d = (d - _normSub[numValsIdx]) * _normMul[numValsIdx]; row.numVals[numValsIdx++] = d; } } int off = responseChunkId(0); for (int i = off; i < Math.min(vals.length, off + _responses); ++i) { try { row.response[i] = vals[responseChunkId(i)]; } catch (Throwable t) { throw new RuntimeException(t); } if (_normRespMul != null) row.response[i] = (row.response[i] - _normRespSub[i]) * _normRespMul[i]; if (Double.isNaN(row.response[i])) { row.bad = true; return row; } } return row; }
/**
 * Appends one (id, value) pair to the parallel {@code numIds} / {@code numVals} arrays.
 *
 * <p>When {@code numIds} is full, both arrays are re-allocated to {@code max(4, 1.5 * length)}
 * entries before the pair is stored.
 *
 * @param id identifier stored in {@code numIds}
 * @param val value stored in {@code numVals}
 */
public void addNum(int id, double val) {
  final int capacity = numIds.length;
  if (nNums == capacity) {
    final int grown = Math.max(4, capacity + (capacity >> 1));
    numIds = Arrays.copyOf(numIds, grown);
    numVals = Arrays.copyOf(numVals, grown);
  }
  final int slot = nNums++;
  numIds[slot] = id;
  numVals[slot] = val;
}
/**
 * Raises {@code MINCLOUDSIZE} to at least {@code minCloudSize} and reads the comma-separated
 * system properties {@code ignore.tests} and {@code doonly.tests}.
 *
 * <p>A property that is absent leaves the corresponding name array untouched; a property that is
 * the empty string sets it to {@code null}.
 *
 * @param minCloudSize requested minimum cloud size
 */
public TestUtil(int minCloudSize) {
  MINCLOUDSIZE = Math.max(MINCLOUDSIZE, minCloudSize);

  final String ignoredProperty = System.getProperty("ignore.tests");
  if (ignoredProperty != null) {
    final String[] names = ignoredProperty.split(",");
    final boolean onlyEmpty = names.length == 1 && names[0].equals("");
    ignoreTestsNames = onlyEmpty ? null : names;
  }

  final String doOnlyProperty = System.getProperty("doonly.tests");
  if (doOnlyProperty != null) {
    final String[] names = doOnlyProperty.split(",");
    final boolean onlyEmpty = names.length == 1 && names[0].equals("");
    doonlyTestsNames = onlyEmpty ? null : names;
  }
}
/** * Global redistribution of a Frame (balancing of chunks), done by calling process (all-to-one + * one-to-all) * * @param fr Input frame * @param seed RNG seed * @param shuffle whether to shuffle the data globally * @return Shuffled frame */ public static Frame shuffleAndBalance( final Frame fr, int splits, long seed, final boolean local, final boolean shuffle) { if ((fr.vecs()[0].nChunks() < splits || shuffle) && fr.numRows() > splits) { Vec[] vecs = fr.vecs().clone(); Log.info("Load balancing dataset, splitting it into up to " + splits + " chunks."); long[] idx = null; if (shuffle) { idx = new long[splits]; for (int r = 0; r < idx.length; ++r) idx[r] = r; Utils.shuffleArray(idx, seed); } Key keys[] = new Vec.VectorGroup().addVecs(vecs.length); final long rows_per_new_chunk = (long) (Math.ceil((double) fr.numRows() / splits)); // loop over cols (same indexing for each column) Futures fs = new Futures(); for (int col = 0; col < vecs.length; col++) { AppendableVec vec = new AppendableVec(keys[col]); // create outgoing chunks for this col NewChunk[] outCkg = new NewChunk[splits]; for (int i = 0; i < splits; ++i) outCkg[i] = new NewChunk(vec, i); // loop over all incoming chunks for (int ckg = 0; ckg < vecs[col].nChunks(); ckg++) { final Chunk inCkg = vecs[col].chunkForChunkIdx(ckg); // loop over local rows of incoming chunks (fast path) for (int row = 0; row < inCkg._len; ++row) { int outCkgIdx = (int) ((inCkg._start + row) / rows_per_new_chunk); // destination chunk idx if (shuffle) outCkgIdx = (int) (idx[outCkgIdx]); // shuffle: choose a different output chunk assert (outCkgIdx >= 0 && outCkgIdx < splits); outCkg[outCkgIdx].addNum(inCkg.at0(row)); } } for (int i = 0; i < outCkg.length; ++i) outCkg[i].close(i, fs); Vec t = vec.close(fs); t._domain = vecs[col]._domain; vecs[col] = t; } fs.blockForPending(); Log.info("Load balancing done."); return new Frame(fr.names(), vecs); } return fr; }
/**
 * Registers {@code ncols - 1} pending completions, where {@code ncols} is the column count of
 * {@code _frs[1]}, and forks the first {@code min(MAXP, ncols)} per-column tasks.
 */
@Override
protected void compute2() {
  final int columnCount = _frs[1].numCols();
  addToPendingCount(columnCount - 1);
  int next = 0;
  while (next < Math.min(MAXP, columnCount)) {
    forkVecTask(next);
    next++;
  }
}
/** * Stratified sampling for classifiers * * @param fr Input frame * @param label Label vector (must be enum) * @param sampling_ratios Optional: array containing the requested sampling ratios per class (in * order of domains), will be overwritten if it contains all 0s * @param maxrows Maximum number of rows in the returned frame * @param seed RNG seed for sampling * @param allowOversampling Allow oversampling of minority classes * @param verbose Whether to print verbose info * @return Sampled frame, with approximately the same number of samples from each class (or given * by the requested sampling ratios) */ public static Frame sampleFrameStratified( final Frame fr, Vec label, float[] sampling_ratios, long maxrows, final long seed, final boolean allowOversampling, final boolean verbose) { if (fr == null) return null; assert (label.isEnum()); assert (maxrows >= label.domain().length); long[] dist = new ClassDist(label).doAll(label).dist(); assert (dist.length > 0); Log.info( "Doing stratified sampling for data set containing " + fr.numRows() + " rows from " + dist.length + " classes. Oversampling: " + (allowOversampling ? "on" : "off")); if (verbose) { for (int i = 0; i < dist.length; ++i) { Log.info( "Class " + label.domain(i) + ": count: " + dist[i] + " prior: " + (float) dist[i] / fr.numRows()); } } // create sampling_ratios for class balance with max. 
maxrows rows (fill existing array if not // null) if (sampling_ratios == null || (Utils.minValue(sampling_ratios) == 0 && Utils.maxValue(sampling_ratios) == 0)) { // compute sampling ratios to achieve class balance if (sampling_ratios == null) { sampling_ratios = new float[dist.length]; } assert (sampling_ratios.length == dist.length); for (int i = 0; i < dist.length; ++i) { sampling_ratios[i] = ((float) fr.numRows() / label.domain().length) / dist[i]; // prior^-1 / num_classes } final float inv_scale = Utils.minValue( sampling_ratios); // majority class has lowest required oversampling factor to achieve // balance if (!Float.isNaN(inv_scale) && !Float.isInfinite(inv_scale)) Utils.div( sampling_ratios, inv_scale); // want sampling_ratio 1.0 for majority class (no downsampling) } if (!allowOversampling) { for (int i = 0; i < sampling_ratios.length; ++i) { sampling_ratios[i] = Math.min(1.0f, sampling_ratios[i]); } } // given these sampling ratios, and the original class distribution, this is the expected number // of resulting rows float numrows = 0; for (int i = 0; i < sampling_ratios.length; ++i) { numrows += sampling_ratios[i] * dist[i]; } final long actualnumrows = Math.min(maxrows, Math.round(numrows)); // cap #rows at maxrows assert (actualnumrows >= 0); // can have no matching rows in case of sparse data where we had to fill in a // makeZero() vector Log.info("Stratified sampling to a total of " + String.format("%,d", actualnumrows) + " rows."); if (actualnumrows != numrows) { Utils.mult( sampling_ratios, (float) actualnumrows / numrows); // adjust the sampling_ratios by the global rescaling factor if (verbose) Log.info( "Downsampling majority class by " + (float) actualnumrows / numrows + " to limit number of rows to " + String.format("%,d", maxrows)); } Log.info( "Majority class (" + label.domain()[Utils.minIndex(sampling_ratios)].toString() + ") sampling ratio: " + Utils.minValue(sampling_ratios)); Log.info( "Minority class (" + 
label.domain()[Utils.maxIndex(sampling_ratios)].toString() + ") sampling ratio: " + Utils.maxValue(sampling_ratios)); return sampleFrameStratified(fr, label, sampling_ratios, seed, verbose); }
/**
 * Picks an integer fill-in value for a column: its mode when the column is categorical, otherwise
 * its mean rounded to the nearest integer.
 *
 * @param v column to derive the value from
 * @return {@code v.mode()} for categorical columns, else the rounded {@code v.mean()}
 */
public static int imputeCat(Vec v) {
  return v.isCategorical() ? v.mode() : (int) Math.round(v.mean());
}
/** * Extract (sparse) rows from given chunks. Note: 0 remains 0 - _normSub of DataInfo isn't used * (mean shift during standarization is not reverted) - UNLESS offset is specified (for GLM only) * Essentially turns the dataset 90 degrees. * * @param chunks - chunk of dataset * @return array of sparse rows */ public final Row[] extractSparseRows(Chunk[] chunks) { Row[] rows = new Row[chunks[0]._len]; long startOff = chunks[0].start(); for (int i = 0; i < rows.length; ++i) { rows[i] = new Row( true, Math.min(_nums, 16), _cats, _responses, i, startOff); // if sparse, _nums is the correct number of nonzero values! i.e., do not // use numNums() rows[i].rid = chunks[0].start() + i; if (_offset) { rows[i].offset = chunks[offsetChunkId()].atd(i); if (Double.isNaN(rows[i].offset)) rows[i].bad = true; } if (_weights) { rows[i].weight = chunks[weightChunkId()].atd(i); if (Double.isNaN(rows[i].weight)) rows[i].bad = true; } if (_skipMissing) { int N = _cats + _nums; for (int c = 0; c < N; ++c) if (chunks[c].isNA(i)) rows[i].bad = true; } } // categoricals for (int i = 0; i < _cats; ++i) { for (int r = 0; r < chunks[0]._len; ++r) { Row row = rows[r]; if (row.bad) continue; int cid = getCategoricalId(i, chunks[i].isNA(r) ? 
_catModes[i] : (int) chunks[i].at8(r)); if (cid >= 0) row.binIds[row.nBins++] = cid; } } // generic numbers + interactions int interactionOffset = 0; for (int cid = 0; cid < _nums; ++cid) { Chunk c = chunks[_cats + cid]; int oldRow = -1; if (c instanceof InteractionWrappedVec .InteractionWrappedChunk) { // for each row, only 1 value in an interaction is 'hot' // all other values are off (i.e., are 0) for (int r = 0; r < c._len; ++r) { // the vec is "vertically" dense and "horizontally" sparse (i.e., every row has // one, and only one, value) Row row = rows[r]; if (row.bad) continue; if (c.isNA(r)) row.bad = _skipMissing; int cidVirtualOffset = getInteractionOffset( chunks, _cats + cid, r); // the "virtual" offset into the hot-expanded interaction row.addNum( _numOffsets[cid] + cidVirtualOffset, c.atd(r)); // FIXME: if this produces a "true" NA then should sub with mean? with? } interactionOffset += nextNumericIdx(cid); } else { for (int r = c.nextNZ(-1); r < c._len; r = c.nextNZ(r)) { if (c.atd(r) == 0) continue; assert r > oldRow; oldRow = r; Row row = rows[r]; if (row.bad) continue; if (c.isNA(r)) row.bad = _skipMissing; double d = c.atd(r); if (Double.isNaN(d)) d = _numMeans[cid]; if (_normMul != null) d *= _normMul[interactionOffset]; row.addNum(_numOffsets[cid], d); } interactionOffset++; } } // response(s) for (int i = 1; i <= _responses; ++i) { int rid = responseChunkId(i - 1); Chunk rChunk = chunks[rid]; for (int r = 0; r < chunks[0]._len; ++r) { Row row = rows[r]; if (row.bad) continue; row.response[i - 1] = rChunk.atd(r); if (_normRespMul != null) { row.response[i - 1] = (row.response[i - 1] - _normRespSub[i - 1]) * _normRespMul[i - 1]; } if (Double.isNaN(row.response[row.response.length - i])) row.bad = true; } } return rows; }