@Test
public void testDomains() {
  Frame frame = parse_test_file("smalldata/junit/weather.csv");
  // Convert a few numeric columns to categoricals in place.
  for (String colName : new String[] {"MaxWindSpeed", "RelHumid9am", "Cloud9am"}) {
    Vec original = frame.vec(colName);
    Vec categorical = original.toCategoricalVec();
    frame.remove(colName);
    frame.add(colName, categorical);
    original.remove();
  }
  DKV.put(frame);

  AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
  parms._train = frame._key;
  parms._radius_scale = 10;
  AggregatorModel agg = new Aggregator(parms).trainModel().get();
  Frame output = agg._output._output_frame.get();

  // Aggregation should shrink the data set substantially.
  Assert.assertTrue(output.numRows() < 0.5 * frame.numRows());

  // Compare categorical domain sizes between input and aggregated output.
  boolean same = true;
  for (int col = 0; col < frame.numCols(); ++col) {
    if (!frame.vec(col).isCategorical()) continue;
    same = (frame.domains()[col].length == output.domains()[col].length);
    if (!same) break;
  }
  frame.remove();
  output.remove();
  agg.remove();
  // At least one categorical column is expected to have lost levels.
  Assert.assertFalse(same);
}
/**
 * Sample rows from a frame. Can be unlucky for small sampling fractions - will continue calling
 * itself until at least 1 row is returned.
 *
 * @param fr Input frame
 * @param rows Approximate number of rows to sample (across all chunks)
 * @param seed Seed for RNG
 * @return Sampled frame
 */
public static Frame sampleFrame(Frame fr, final long rows, final long seed) {
  if (fr == null) return null;
  final float fraction = rows > 0 ? (float) rows / fr.numRows() : 1.f;
  if (fraction >= 1.f) return fr; // asked for everything (or more) - no sampling needed
  Frame sampled =
      new MRTask2() {
        @Override
        public void map(Chunk[] cs, NewChunk[] ncs) {
          // Deterministic per-chunk RNG: same seed always yields the same sample.
          final Random rng = getDeterRNG(seed + cs[0].cidx());
          final int len = cs[0]._len;
          int kept = 0;
          for (int row = 0; row < len; row++) {
            // Keep each row with probability `fraction`; force-keep the chunk's last
            // row when nothing has been kept yet, so every chunk emits >= 1 row.
            if (rng.nextFloat() < fraction || (kept == 0 && row == len - 1)) {
              kept++;
              for (int c = 0; c < ncs.length; c++) ncs[c].addNum(cs[c].at0(row));
            }
          }
        }
      }.doAll(fr.numCols(), fr).outputFrame(fr.names(), fr.domains());
  if (sampled.numRows() != 0) return sampled;
  // Unlucky draw - log and retry with a bumped seed.
  Log.warn(
      "You asked for "
          + rows
          + " rows (out of "
          + fr.numRows()
          + "), but you got none (seed="
          + seed
          + ").");
  Log.warn("Let's try again. You've gotta ask yourself a question: \"Do I feel lucky?\"");
  return sampleFrame(fr, rows, seed + 1);
}
/**
 * Project each archetype into original feature space
 *
 * @param frame Original training data with m rows and n columns
 * @param destination_key Frame Id for output
 * @param reverse_transform if true, undo the training normalization on numeric columns
 * @return Frame containing k rows and n columns, where each row corresponds to an archetype
 */
public Frame scoreArchetypes(Frame frame, Key destination_key, boolean reverse_transform) {
  final int ncols = _output._names.length;
  // Adapt the incoming frame to the training frame's column layout/domains.
  Frame adaptedFr = new Frame(frame);
  adaptTestForTrain(adaptedFr, true, false);
  assert ncols == adaptedFr.numCols();
  String[][] adaptedDomme = adaptedFr.domains();
  // proj[k][col] holds archetype k's imputed value for each (permuted) output column.
  double[][] proj = new double[_parms._k][_output._nnums + _output._ncats];

  // Categorical columns: impute a level index from each archetype's categorical block.
  for (int d = 0; d < _output._ncats; d++) {
    double[][] block = _output._archetypes_raw.getCatBlock(d);
    for (int k = 0; k < _parms._k; k++)
      proj[k][_output._permutation[d]] = _parms.mimpute(block[k], _output._lossFunc[d]);
  }

  // Numeric columns: impute from the archetype's numeric entry; optionally reverse the
  // normalization (divide by _normMul, add back _normSub) to return to original units.
  for (int d = _output._ncats; d < (_output._ncats + _output._nnums); d++) {
    int ds = d - _output._ncats; // index within the numeric block
    for (int k = 0; k < _parms._k; k++) {
      double num = _output._archetypes_raw.getNum(ds, k);
      proj[k][_output._permutation[d]] = _parms.impute(num, _output._lossFunc[d]);
      if (reverse_transform)
        proj[k][_output._permutation[d]] =
            proj[k][_output._permutation[d]] / _output._normMul[ds] + _output._normSub[ds];
    }
  }

  // Convert projection of archetypes into a frame with correct domains
  Frame f =
      ArrayUtils.frame(
          (null == destination_key ? Key.make() : destination_key), adaptedFr.names(), proj);
  for (int i = 0; i < ncols; i++) f.vec(i).setDomain(adaptedDomme[i]);
  return f;
}
/**
 * Rebalances the input frame {@code _in} into {@code _nchunks} chunks of (nearly) equal row
 * counts, writing the result to {@code _out} under key {@code _okey}. The actual data movement
 * is done asynchronously by {@link RebalanceTask}.
 */
@Override
public void compute2() {
  _in.read_lock(_jobKey);
  // simply create a bogus new vector (don't even put it into KV) with appropriate number of lines
  // per chunk and then use it as a source to do multiple makeZero calls
  // to create empty vecs and than call RebalanceTask on each one of them.
  // RebalanceTask will fetch the appropriate src chunks and fetch the data from them.
  int rpc = (int) (_in.numRows() / _nchunks); // base rows per chunk
  int rem = (int) (_in.numRows() % _nchunks); // leftover rows
  long[] espc = new long[_nchunks + 1];
  Arrays.fill(espc, rpc);
  // Spread the remainder: the first `rem` chunks each get one extra row.
  for (int i = 0; i < rem; ++i) ++espc[i];
  // In-place prefix sum: convert per-chunk row counts into cumulative start offsets
  // (element-start-per-chunk layout); the final slot ends up holding the total row count.
  long sum = 0;
  for (int i = 0; i < espc.length; ++i) {
    long s = espc[i];
    espc[i] = sum;
    sum += s;
  }
  assert espc[espc.length - 1] == _in.numRows()
      : "unexpected number of rows, expected " + _in.numRows() + ", got " + espc[espc.length - 1];
  final Vec[] srcVecs = _in.vecs();
  // Zero-filled destination vecs with the new chunk layout and the source domains.
  _out =
      new Frame(
          _okey,
          _in.names(),
          new Vec(Vec.newKey(), espc).makeZeros(srcVecs.length, _in.domains()));
  _out.delete_and_lock(_jobKey);
  new RebalanceTask(this, srcVecs).asyncExec(_out);
}
@Override Val apply(Env env, Env.StackHelp stk, AST asts[]) { Frame fr = stk.track(asts[1].exec(env)).getFrame(); if (fr.numCols() == 1 && fr.numRows() == 1) { if (fr.anyVec().isNumeric() || fr.anyVec().isBad()) return new ValNum(fr.anyVec().at(0)); else if (fr.anyVec().isString()) return new ValStr(fr.anyVec().atStr(new BufferedString(), 0).toString()); return new ValStr(fr.domains()[0][(int) fr.anyVec().at8(0)]); } return new ValFrame(fr); // did not flatten }
// Make vector templates for all output frame vectors private Vec[][] makeTemplates(Frame dataset, float[] ratios) { Vec anyVec = dataset.anyVec(); final long[][] espcPerSplit = computeEspcPerSplit(anyVec._espc, anyVec.length(), ratios); final int num = dataset.numCols(); // number of columns in input frame final int nsplits = espcPerSplit.length; // number of splits final String[][] domains = dataset.domains(); // domains Vec[][] t = new Vec[nsplits][ /*num*/]; // resulting vectors for all for (int i = 0; i < nsplits; i++) { // vectors for j-th split t[i] = new Vec(Vec.newKey(), espcPerSplit[i /*-th split*/]).makeZeros(num, domains); } return t; }
/**
 * Shuffle the rows of each chunk independently (intra-chunk shuffle only; rows never cross
 * chunk boundaries), emitting the result as a new frame under the given key.
 */
public static Frame shuffleFramePerChunk(Key outputFrameKey, Frame fr, final long seed) {
  return new MRTask2() {
    @Override
    public void map(Chunk[] cs, NewChunk[] ncs) {
      final int len = cs[0]._len;
      // Identity permutation over this chunk's local row indices...
      long[] perm = new long[len];
      for (int row = 0; row < len; ++row) perm[row] = row;
      // ...shuffled deterministically by the shared seed.
      Utils.shuffleArray(perm, seed);
      // Emit every column of every row in permuted order.
      for (int row = 0; row < len; ++row)
        for (int col = 0; col < ncs.length; col++) ncs[col].addNum(cs[col].at0((int) perm[row]));
    }
  }.doAll(fr.numCols(), fr).outputFrame(outputFrameKey, fr.names(), fr.domains());
}
// GLRM scoring is data imputation based on feature domains using reconstructed XY (see Udell // (2015), Section 5.3) private Frame reconstruct( Frame orig, Frame adaptedFr, Key destination_key, boolean save_imputed, boolean reverse_transform) { final int ncols = _output._names.length; assert ncols == adaptedFr.numCols(); String prefix = "reconstr_"; // Need [A,X,P] where A = adaptedFr, X = loading frame, P = imputed frame // Note: A is adapted to original training frame, P has columns shuffled so cats come before // nums! Frame fullFrm = new Frame(adaptedFr); Frame loadingFrm = DKV.get(_output._representation_key).get(); fullFrm.add(loadingFrm); String[][] adaptedDomme = adaptedFr.domains(); for (int i = 0; i < ncols; i++) { Vec v = fullFrm.anyVec().makeZero(); v.setDomain(adaptedDomme[i]); fullFrm.add(prefix + _output._names[i], v); } GLRMScore gs = new GLRMScore(ncols, _parms._k, save_imputed, reverse_transform).doAll(fullFrm); // Return the imputed training frame int x = ncols + _parms._k, y = fullFrm.numCols(); Frame f = fullFrm.extractFrame( x, y); // this will call vec_impl() and we cannot call the delete() below just yet f = new Frame((null == destination_key ? Key.make() : destination_key), f.names(), f.vecs()); DKV.put(f); gs._mb.makeModelMetrics( GLRMModel.this, orig, null, null); // save error metrics based on imputed data return f; }
@Override Val apply(Env env, Env.StackHelp stk, AST asts[]) { Frame fr = stk.track(asts[1].exec(env)).getFrame(); Frame returningFrame; long nrows = fr.numRows(); if (asts[2] instanceof ASTNumList) { final ASTNumList nums = (ASTNumList) asts[2]; long[] rows = nums._isList ? nums.expand8Sort() : null; if (rows != null) { if (rows.length == 0) { // Empty inclusion list? } else if (rows[0] >= 0) { // Positive (inclusion) list if (rows[rows.length - 1] > nrows) throw new IllegalArgumentException("Row must be an integer from 0 to " + (nrows - 1)); } else { // Negative (exclusion) list // Invert the list to make a positive list, ignoring out-of-bounds values BitSet bs = new BitSet((int) nrows); for (int i = 0; i < rows.length; i++) { int idx = (int) (-rows[i] - 1); // The positive index if (idx >= 0 && idx < nrows) bs.set(idx); // Set column to EXCLUDE } rows = new long[(int) nrows - bs.cardinality()]; for (int i = bs.nextClearBit(0), j = 0; i < nrows; i = bs.nextClearBit(i + 1)) rows[j++] = i; } } final long[] ls = rows; returningFrame = new MRTask() { @Override public void map(Chunk[] cs, NewChunk[] ncs) { if (nums.cnt() == 0) return; long start = cs[0].start(); long end = start + cs[0]._len; long min = ls == null ? (long) nums.min() : ls[0], max = ls == null ? (long) nums.max() - 1 : ls[ls.length - 1]; // exclusive max to inclusive max when stride == 1 // [ start, ..., end ] the chunk // 1 [] nums out left: nums.max() < start // 2 [] nums out rite: nums.min() > end // 3 [ nums ] nums run left: nums.min() < start && nums.max() <= // end // 4 [ nums ] nums run in : start <= nums.min() && nums.max() <= // end // 5 [ nums ] nums run rite: start <= nums.min() && end < // nums.max() if (!(max < start || min > end)) { // not situation 1 or 2 above long startOffset = (min > start ? 
min : start); // situation 4 and 5 => min > start; for (int i = (int) (startOffset - start); i < cs[0]._len; ++i) { if ((ls == null && nums.has(start + i)) || (ls != null && Arrays.binarySearch(ls, start + i) >= 0)) { for (int c = 0; c < cs.length; ++c) { if (cs[c] instanceof CStrChunk) ncs[c].addStr(cs[c], i); else if (cs[c] instanceof C16Chunk) ncs[c].addUUID(cs[c], i); else if (cs[c].isNA(i)) ncs[c].addNA(); else ncs[c].addNum(cs[c].atd(i)); } } } } } }.doAll(fr.types(), fr).outputFrame(fr.names(), fr.domains()); } else if ((asts[2] instanceof ASTNum)) { long[] rows = new long[] {(long) (((ASTNum) asts[2])._v.getNum())}; returningFrame = fr.deepSlice(rows, null); } else if ((asts[2] instanceof ASTExec) || (asts[2] instanceof ASTId)) { Frame predVec = stk.track(asts[2].exec(env)).getFrame(); if (predVec.numCols() != 1) throw new IllegalArgumentException( "Conditional Row Slicing Expression evaluated to " + predVec.numCols() + " columns. Must be a boolean Vec."); returningFrame = fr.deepSlice(predVec, null); } else throw new IllegalArgumentException( "Row slicing requires a number-list as the last argument, but found a " + asts[2].getClass()); return new ValFrame(returningFrame); }
// internal version with repeat counter
// currently hardcoded to do up to 10 tries to get a row from each class, which can be impossible
// for certain wrong sampling ratios
/**
 * Stratified sampling: replicate each row a number of times driven by its class's sampling
 * ratio (floor of the ratio, plus one more with probability equal to the fractional part).
 * Retries with a bumped seed (up to 10 times total) if any class ends up empty, then shuffles
 * rows within each chunk before returning.
 *
 * @param fr input frame (may be null)
 * @param label categorical label column; must be a column of {@code fr}
 * @param sampling_ratios per-class sampling ratios, one entry per domain level
 * @param seed RNG seed (per-chunk offsets keep the result deterministic)
 * @param debug log the resulting class distribution when true
 * @param count recursion depth / retry counter
 * @return sampled and intra-chunk-shuffled frame; the input frame itself if no labels survive
 */
private static Frame sampleFrameStratified(
    final Frame fr,
    Vec label,
    final float[] sampling_ratios,
    final long seed,
    final boolean debug,
    int count) {
  if (fr == null) return null;
  assert (label.isEnum());
  assert (sampling_ratios != null && sampling_ratios.length == label.domain().length);
  final int labelidx = fr.find(label); // which column is the label?
  assert (labelidx >= 0);
  final boolean poisson = false; // beta feature
  Frame r =
      new MRTask2() {
        @Override
        public void map(Chunk[] cs, NewChunk[] ncs) {
          // Deterministic per-chunk RNG so the sample is reproducible for a given seed.
          final Random rng = getDeterRNG(seed + cs[0].cidx());
          for (int r = 0; r < cs[0]._len; r++) {
            if (cs[labelidx].isNA0(r)) continue; // skip missing labels
            final int label = (int) cs[labelidx].at80(r);
            assert (sampling_ratios.length > label && label >= 0);
            int sampling_reps;
            if (poisson) {
              sampling_reps = Utils.getPoisson(sampling_ratios[label], rng);
            } else {
              // floor(ratio) copies, plus one extra with probability frac(ratio).
              final float remainder = sampling_ratios[label] - (int) sampling_ratios[label];
              sampling_reps =
                  (int) sampling_ratios[label] + (rng.nextFloat() < remainder ? 1 : 0);
            }
            for (int i = 0; i < ncs.length; i++) {
              for (int j = 0; j < sampling_reps; ++j) {
                ncs[i].addNum(cs[i].at0(r));
              }
            }
          }
        }
      }.doAll(fr.numCols(), fr).outputFrame(fr.names(), fr.domains());
  // Confirm the validity of the distribution
  long[] dist = new ClassDist(r.vecs()[labelidx]).doAll(r.vecs()[labelidx]).dist();
  // if there are no training labels in the test set, then there is no point in sampling the test
  // set
  if (dist == null) return fr;
  if (debug) {
    long sumdist = Utils.sum(dist);
    Log.info("After stratified sampling: " + sumdist + " rows.");
    for (int i = 0; i < dist.length; ++i) {
      Log.info(
          "Class "
              + r.vecs()[labelidx].domain(i)
              + ": count: "
              + dist[i]
              + " sampling ratio: "
              + sampling_ratios[i]
              + " actual relative frequency: "
              + (float) dist[i] / sumdist * dist.length);
    }
  }
  // Re-try if we didn't get at least one example from each class
  if (Utils.minValue(dist) == 0 && count < 10) {
    Log.info(
        "Re-doing stratified sampling because not all classes were represented (unlucky draw).");
    r.delete();
    return sampleFrameStratified(fr, label, sampling_ratios, seed + 1, debug, ++count);
  }
  // shuffle intra-chunk
  Frame shuffled = shuffleFramePerChunk(r, seed + 0x580FF13);
  r.delete();
  return shuffled;
}