/** * Global redistribution of a Frame (balancing of chunks), done by calling process (all-to-one + * one-to-all) * * @param fr Input frame * @param seed RNG seed * @param shuffle whether to shuffle the data globally * @return Shuffled frame */ public static Frame shuffleAndBalance( final Frame fr, int splits, long seed, final boolean local, final boolean shuffle) { if ((fr.vecs()[0].nChunks() < splits || shuffle) && fr.numRows() > splits) { Vec[] vecs = fr.vecs().clone(); Log.info("Load balancing dataset, splitting it into up to " + splits + " chunks."); long[] idx = null; if (shuffle) { idx = new long[splits]; for (int r = 0; r < idx.length; ++r) idx[r] = r; Utils.shuffleArray(idx, seed); } Key keys[] = new Vec.VectorGroup().addVecs(vecs.length); final long rows_per_new_chunk = (long) (Math.ceil((double) fr.numRows() / splits)); // loop over cols (same indexing for each column) Futures fs = new Futures(); for (int col = 0; col < vecs.length; col++) { AppendableVec vec = new AppendableVec(keys[col]); // create outgoing chunks for this col NewChunk[] outCkg = new NewChunk[splits]; for (int i = 0; i < splits; ++i) outCkg[i] = new NewChunk(vec, i); // loop over all incoming chunks for (int ckg = 0; ckg < vecs[col].nChunks(); ckg++) { final Chunk inCkg = vecs[col].chunkForChunkIdx(ckg); // loop over local rows of incoming chunks (fast path) for (int row = 0; row < inCkg._len; ++row) { int outCkgIdx = (int) ((inCkg._start + row) / rows_per_new_chunk); // destination chunk idx if (shuffle) outCkgIdx = (int) (idx[outCkgIdx]); // shuffle: choose a different output chunk assert (outCkgIdx >= 0 && outCkgIdx < splits); outCkg[outCkgIdx].addNum(inCkg.at0(row)); } } for (int i = 0; i < outCkg.length; ++i) outCkg[i].close(i, fs); Vec t = vec.close(fs); t._domain = vecs[col]._domain; vecs[col] = t; } fs.blockForPending(); Log.info("Load balancing done."); return new Frame(fr.names(), vecs); } return fr; }
// Do any final actions on a completed NewVector. Mostly: compress it, and // do a DKV put on an appropriate Key. The original NewVector goes dead // (does not live on inside the K/V store). public Chunk new_close(Futures fs) { Chunk chk = compress(); if (_vec instanceof AppendableVec) ((AppendableVec) _vec).closeChunk(this); return chk; }
public Frame deepSlice(Object orows, Object ocols) { // ocols is either a long[] or a Frame-of-1-Vec long[] cols; if (ocols == null) { cols = (long[]) ocols; assert cols == null; } else { if (ocols instanceof long[]) { cols = (long[]) ocols; } else if (ocols instanceof Frame) { Frame fr = (Frame) ocols; if (fr.numCols() != 1) { throw new IllegalArgumentException( "Columns Frame must have only one column (actually has " + fr.numCols() + " columns)"); } long n = fr.anyVec().length(); if (n > MAX_EQ2_COLS) { throw new IllegalArgumentException( "Too many requested columns (requested " + n + ", max " + MAX_EQ2_COLS + ")"); } cols = new long[(int) n]; Vec v = fr._vecs[0]; for (long i = 0; i < v.length(); i++) { cols[(int) i] = v.at8(i); } } else { throw new IllegalArgumentException( "Columns is specified by an unsupported data type (" + ocols.getClass().getName() + ")"); } } // Since cols is probably short convert to a positive list. int c2[] = null; if (cols == null) { c2 = new int[numCols()]; for (int i = 0; i < c2.length; i++) c2[i] = i; } else if (cols.length == 0) { c2 = new int[0]; } else if (cols[0] > 0) { c2 = new int[cols.length]; for (int i = 0; i < cols.length; i++) c2[i] = (int) cols[i] - 1; // Convert 1-based cols to zero-based } else { c2 = new int[numCols() - cols.length]; int j = 0; for (int i = 0; i < numCols(); i++) { if (j >= cols.length || i < (-cols[j] - 1)) c2[i - j] = i; else j++; } } for (int i = 0; i < c2.length; i++) if (c2[i] >= numCols()) throw new IllegalArgumentException( "Trying to select column " + c2[i] + " but only " + numCols() + " present."); if (c2.length == 0) throw new IllegalArgumentException( "No columns selected (did you try to select column 0 instead of column 1?)"); // Do Da Slice // orows is either a long[] or a Vec if (orows == null) return new DeepSlice((long[]) orows, c2) .doAll(c2.length, this) .outputFrame(names(c2), domains(c2)); else if (orows instanceof long[]) { final long CHK_ROWS = 1000000; long[] rows = (long[]) orows; if (rows.length == 0) return new DeepSlice(rows, c2).doAll(c2.length, this).outputFrame(names(c2), domains(c2)); if (rows[0] < 0) return new DeepSlice(rows, c2).doAll(c2.length, this).outputFrame(names(c2), domains(c2)); // Vec'ize the index array AppendableVec av = new AppendableVec("rownames"); int r = 0; int c = 0; while (r < rows.length) { NewChunk nc = new NewChunk(av, c); long end = Math.min(r + CHK_ROWS, rows.length); for (; r < end; r++) { nc.addNum(rows[r]); } nc.close(c++, null); } Vec c0 = av.close(null); // c0 is the row index vec Frame fr2 = new Slice(c2, this) .doAll(c2.length, new Frame(new String[] {"rownames"}, new Vec[] {c0})) .outputFrame(names(c2), domains(c2)); UKV.remove(c0._key); // Remove hidden vector return fr2; } Frame frows = (Frame) orows; Vec vrows = frows.anyVec(); // It's a compatible Vec; use it as boolean selector. // Build column names for the result. Vec[] vecs = new Vec[c2.length + 1]; String[] names = new String[c2.length + 1]; for (int i = 0; i < c2.length; ++i) { vecs[i] = _vecs[c2[i]]; names[i] = _names[c2[i]]; } vecs[c2.length] = vrows; names[c2.length] = "predicate"; return new DeepSelect() .doAll(c2.length, new Frame(names, vecs)) .outputFrame(names(c2), domains(c2)); }