/** * Global redistribution of a Frame (balancing of chunks), done by calling process (all-to-one + * one-to-all) * * @param fr Input frame * @param seed RNG seed * @param shuffle whether to shuffle the data globally * @return Shuffled frame */ public static Frame shuffleAndBalance( final Frame fr, int splits, long seed, final boolean local, final boolean shuffle) { if ((fr.vecs()[0].nChunks() < splits || shuffle) && fr.numRows() > splits) { Vec[] vecs = fr.vecs().clone(); Log.info("Load balancing dataset, splitting it into up to " + splits + " chunks."); long[] idx = null; if (shuffle) { idx = new long[splits]; for (int r = 0; r < idx.length; ++r) idx[r] = r; Utils.shuffleArray(idx, seed); } Key keys[] = new Vec.VectorGroup().addVecs(vecs.length); final long rows_per_new_chunk = (long) (Math.ceil((double) fr.numRows() / splits)); // loop over cols (same indexing for each column) Futures fs = new Futures(); for (int col = 0; col < vecs.length; col++) { AppendableVec vec = new AppendableVec(keys[col]); // create outgoing chunks for this col NewChunk[] outCkg = new NewChunk[splits]; for (int i = 0; i < splits; ++i) outCkg[i] = new NewChunk(vec, i); // loop over all incoming chunks for (int ckg = 0; ckg < vecs[col].nChunks(); ckg++) { final Chunk inCkg = vecs[col].chunkForChunkIdx(ckg); // loop over local rows of incoming chunks (fast path) for (int row = 0; row < inCkg._len; ++row) { int outCkgIdx = (int) ((inCkg._start + row) / rows_per_new_chunk); // destination chunk idx if (shuffle) outCkgIdx = (int) (idx[outCkgIdx]); // shuffle: choose a different output chunk assert (outCkgIdx >= 0 && outCkgIdx < splits); outCkg[outCkgIdx].addNum(inCkg.at0(row)); } } for (int i = 0; i < outCkg.length; ++i) outCkg[i].close(i, fs); Vec t = vec.close(fs); t._domain = vecs[col]._domain; vecs[col] = t; } fs.blockForPending(); Log.info("Load balancing done."); return new Frame(fr.names(), vecs); } return fr; }
/** * Extracts the values, applies regularization to numerics, adds appropriate offsets to * categoricals, and adapts response according to the CaseMode/CaseValue if set. */ @Override public final void map(Chunk[] chunks, NewChunk[] outputs) { if (_job != null && _job.self() != null && !Job.isRunning(_job.self())) throw new JobCancelledException(); final int nrows = chunks[0]._len; final long offset = chunks[0]._start; chunkInit(); double[] nums = MemoryManager.malloc8d(_dinfo._nums); int[] cats = MemoryManager.malloc4(_dinfo._cats); double[] response = MemoryManager.malloc8d(_dinfo._responses); int start = 0; int end = nrows; boolean contiguous = false; Random skip_rng = null; // random generator for skipping rows if (_useFraction < 1.0) { skip_rng = water.util.Utils.getDeterRNG(new Random().nextLong()); if (contiguous) { final int howmany = (int) Math.ceil(_useFraction * nrows); if (howmany > 0) { start = skip_rng.nextInt(nrows - howmany); end = start + howmany; } assert (start < nrows); assert (end <= nrows); } } long[] shuf_map = null; if (_shuffle) { shuf_map = new long[end - start]; for (int i = 0; i < shuf_map.length; ++i) shuf_map[i] = start + i; Utils.shuffleArray(shuf_map, new Random().nextLong()); } OUTER: for (int rr = start; rr < end; ++rr) { final int r = shuf_map != null ? (int) shuf_map[rr - start] : rr; if ((_dinfo._nfolds > 0 && (r % _dinfo._nfolds) == _dinfo._foldId) || (skip_rng != null && skip_rng.nextFloat() > _useFraction)) continue; for (Chunk c : chunks) if (c.isNA0(r)) continue OUTER; // skip rows with NAs! int i = 0, ncats = 0; for (; i < _dinfo._cats; ++i) { int c = (int) chunks[i].at80(r); if (c != 0) cats[ncats++] = c + _dinfo._catOffsets[i] - 1; } final int n = chunks.length - _dinfo._responses; for (; i < n; ++i) { double d = chunks[i].at0(r); if (_dinfo._normMul != null) d = (d - _dinfo._normSub[i - _dinfo._cats]) * _dinfo._normMul[i - _dinfo._cats]; nums[i - _dinfo._cats] = d; } for (i = 0; i < _dinfo._responses; ++i) { response[i] = chunks[chunks.length - _dinfo._responses + i].at0(r); if (_dinfo._normRespMul != null) response[i] = (response[i] - _dinfo._normRespSub[i]) * _dinfo._normRespMul[i]; } if (outputs != null && outputs.length > 0) processRow(offset + r, nums, ncats, cats, response, outputs); else processRow(offset + r, nums, ncats, cats, response); } chunkDone(); }