@Override
protected void checkMemoryFootPrint() {
  if (_model._output._ntrees == 0) return;
  int trees_so_far = _model._output._ntrees; // existing trees
  long model_mem_size =
      new ComputeModelSize(trees_so_far, _model._output._treeKeys).doAllNodes()._model_mem_size;
  _model._output._treeStats._byte_size = model_mem_size;
  double avg_tree_mem_size = (double) model_mem_size / trees_so_far;
  Log.debug("Average tree size (for all classes): " + PrettyPrint.bytes((long) avg_tree_mem_size));

  // all the compressed trees are stored on the driver node
  long max_mem = H2O.SELF.get_max_mem();
  if (_parms._ntrees * avg_tree_mem_size > max_mem) {
    String msg =
        "The tree model will not fit in the driver node's memory ("
            + PrettyPrint.bytes((long) avg_tree_mem_size)
            + " per tree x "
            + _parms._ntrees
            + " > "
            + PrettyPrint.bytes(max_mem)
            + ") - try decreasing ntrees and/or max_depth or increasing min_rows!";
    error("_ntrees", msg);
    cancel(msg);
  }
}
// Compute a compressed integer buffer
private byte[] bufX(long bias, int scale, int off, int log) {
  byte[] bs = new byte[(_len << log) + off];
  int j = 0;
  for (int i = 0; i < _len; i++) {
    long le = -bias;
    if (_id == null || _id.length == 0 || (j < _id.length && _id[j] == i)) {
      if (isNA2(j)) {
        le = NAS[log];
      } else {
        int x = (_xs[j] == Integer.MIN_VALUE + 1 ? 0 : _xs[j]) - scale;
        le += x >= 0 ? _ls[j] * PrettyPrint.pow10i(x) : _ls[j] / PrettyPrint.pow10i(-x);
      }
      ++j;
    }
    switch (log) {
      case 0: bs[i + off] = (byte) le; break;
      case 1: UnsafeUtils.set2(bs, (i << 1) + off, (short) le); break;
      case 2: UnsafeUtils.set4(bs, (i << 2) + off, (int) le); break;
      case 3: UnsafeUtils.set8(bs, (i << 3) + off, le); break;
      default: throw H2O.fail();
    }
  }
  assert j == sparseLen()
      : "j = " + j + ", len = " + sparseLen() + ", len2 = " + _len + ", id[j] = " + _id[j];
  return bs;
}
protected void checkMemoryFootPrint() {
  long mem_usage = 8 /*doubles*/ * _parms._k * _train.numCols() * (_parms._standardize ? 2 : 1);
  long max_mem = H2O.SELF._heartbeat.get_free_mem();
  if (mem_usage > max_mem) {
    String msg =
        "Centroids won't fit in the driver node's memory ("
            + PrettyPrint.bytes(mem_usage)
            + " > "
            + PrettyPrint.bytes(max_mem)
            + ") - try reducing the number of columns and/or the number of categorical factors.";
    error("_train", msg);
    cancel(msg);
  }
}
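To make the centroid-memory estimate above concrete, here is a minimal stand-alone sketch (hypothetical class and numbers, not part of the H2O sources): with k = 1,000 centroids, 500 columns and standardization enabled, the check budgets 8 bytes per double times two copies of the centroid matrix.

// Hypothetical illustration of the centroid-memory arithmetic used in the check above.
public class CentroidMemEstimate {
  public static void main(String[] args) {
    long k = 1_000;                   // number of centroids (assumed)
    long cols = 500;                  // number of (expanded) columns (assumed)
    boolean standardize = true;       // standardized runs keep a second copy of the centroids
    long memUsage = 8L /* bytes per double */ * k * cols * (standardize ? 2 : 1);
    // 8 * 1000 * 500 * 2 = 8,000,000 bytes, i.e. roughly 7.6 MiB of centroid storage
    System.out.println("Estimated centroid footprint: " + memUsage + " bytes");
  }
}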
// Compute a compressed double buffer
private Chunk chunkD() {
  HashMap<Long, Byte> hs = new HashMap<>(CUDChunk.MAX_UNIQUES);
  Byte dummy = 0;
  final byte[] bs = MemoryManager.malloc1(_len * 8, true);
  int j = 0;
  boolean fitsInUnique = true;
  for (int i = 0; i < _len; ++i) {
    double d = 0;
    if (_id == null || _id.length == 0 || (j < _id.length && _id[j] == i)) {
      d = _ds != null
          ? _ds[j]
          : (isNA2(j) || isCategorical(j)) ? Double.NaN : _ls[j] * PrettyPrint.pow10(_xs[j]);
      ++j;
    }
    if (fitsInUnique) {
      if (hs.size() < CUDChunk.MAX_UNIQUES) // still got space
        hs.put(Double.doubleToLongBits(d),
            dummy); // store doubles as longs to avoid NaN comparison issues during extraction
      else
        fitsInUnique = (hs.size() == CUDChunk.MAX_UNIQUES) // full, but might not need more space because of repeats
            && hs.containsKey(Double.doubleToLongBits(d));
    }
    UnsafeUtils.set8d(bs, 8 * i, d);
  }
  assert j == sparseLen() : "j = " + j + ", _len = " + sparseLen();
  if (fitsInUnique && CUDChunk.computeByteSize(hs.size(), len()) < 0.8 * bs.length)
    return new CUDChunk(bs, hs, len());
  else
    return new C8DChunk(bs);
}
// Compute a sparse float/double buffer (valsz = 4 or 8 bytes per value)
private byte[] bufD(final int valsz) {
  int log = 0;
  while ((1 << log) < valsz) ++log;
  assert (1 << log) == valsz;
  final int ridsz = _len >= 65535 ? 4 : 2;
  final int elmsz = ridsz + valsz;
  int off = CXDChunk._OFF;
  byte[] buf = MemoryManager.malloc1(off + sparseLen() * elmsz, true);
  for (int i = 0; i < sparseLen(); i++, off += elmsz) {
    if (ridsz == 2) UnsafeUtils.set2(buf, off, (short) _id[i]);
    else UnsafeUtils.set4(buf, off, _id[i]);
    final double dval =
        _ds == null ? isNA2(i) ? Double.NaN : _ls[i] * PrettyPrint.pow10(_xs[i]) : _ds[i];
    switch (valsz) {
      case 4: UnsafeUtils.set4f(buf, off + ridsz, (float) dval); break;
      case 8: UnsafeUtils.set8d(buf, off + ridsz, dval); break;
      default: throw H2O.fail();
    }
  }
  assert off == buf.length;
  return buf;
}
protected void switch_to_doubles() {
  assert _ds == null;
  double[] ds = MemoryManager.malloc8d(sparseLen());
  for (int i = 0; i < sparseLen(); ++i)
    if (isNA2(i) || isCategorical2(i)) ds[i] = Double.NaN;
    else ds[i] = _ls[i] * PrettyPrint.pow10(_xs[i]);
  _ls = null;
  _xs = null;
  _ds = ds;
}
@Override
public long at8_impl(int i) {
  if (_len != sparseLen()) {
    int idx = Arrays.binarySearch(_id, 0, sparseLen(), i);
    if (idx >= 0) i = idx;
    else return 0;
  }
  if (isNA2(i)) throw new RuntimeException("Attempting to access NA as integer value.");
  if (_ls == null) return (long) _ds[i];
  return _ls[i] * PrettyPrint.pow10i(_xs[i]);
}
@Override
public NewChunk inflate_impl(NewChunk nc) {
  double dx = Math.log10(_scale);
  assert water.util.PrettyPrint.fitsIntoInt(dx);
  nc.set_sparseLen(0);
  nc.set_len(0);
  final int len = _len;
  for (int i = 0; i < len; i++) {
    int res = 0xFF & _mem[i + _OFF];
    if (res == C1Chunk._NA) nc.addNA();
    else nc.addNum((res + _bias), (int) dx);
  }
  return nc;
}
public void addNum(long val, int exp) {
  if (isUUID() || isString()) addNA();
  else if (_ds != null) {
    assert _ls == null;
    addNum(val * PrettyPrint.pow10(exp));
  } else {
    if (val == 0) exp = 0; // Canonicalize zero
    long t; // Remove extra scaling
    while (exp < 0 && exp > -9999999 && (t = val / 10) * 10 == val) {
      val = t;
      exp++;
    }
    append2(val, exp);
  }
}
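To make the canonicalization in the loop above concrete, here is a small stand-alone sketch (hypothetical class and helper names, not part of NewChunk): trailing zeros are moved from the mantissa into the exponent, so 1.300 arriving as (1300, -3) is stored as (13, -1).

// Hypothetical stand-alone version of the (val, exp) canonicalization performed above.
public class CanonicalizeDemo {
  static long[] canonicalize(long val, int exp) {
    if (val == 0) exp = 0;   // canonicalize zero
    long t;                  // strip trailing zeros while the exponent is negative
    while (exp < 0 && exp > -9999999 && (t = val / 10) * 10 == val) {
      val = t;
      exp++;
    }
    return new long[] {val, exp};
  }

  public static void main(String[] args) {
    long[] r = canonicalize(1300, -3); // the value 1.300
    System.out.println(r[0] + "e" + r[1]); // prints 13e-1
  }
}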
@Override
protected void checkMemoryFootPrint() {
  HeartBeat hb = H2O.SELF._heartbeat;
  double p = _train.degreesOfFreedom();
  long mem_usage =
      (long) (hb._cpus_allowed * p * p * 8 /*doubles*/
          * Math.log((double) _train.lastVec().nChunks()) / Math.log(2.)); // one gram per core
  long max_mem = hb.get_max_mem();
  if (mem_usage > max_mem) {
    String msg =
        "Gram matrices (one per thread) won't fit in the driver node's memory ("
            + PrettyPrint.bytes(mem_usage)
            + " > "
            + PrettyPrint.bytes(max_mem)
            + ") - try reducing the number of columns and/or the number of categorical factors.";
    error("_train", msg);
    cancel(msg);
  }
}
private TwoDimTable createScoringHistoryTable(KMeansModel.KMeansOutput output) {
  List<String> colHeaders = new ArrayList<>();
  List<String> colTypes = new ArrayList<>();
  List<String> colFormat = new ArrayList<>();
  colHeaders.add("Timestamp"); colTypes.add("string"); colFormat.add("%s");
  colHeaders.add("Duration"); colTypes.add("string"); colFormat.add("%s");
  colHeaders.add("Iteration"); colTypes.add("long"); colFormat.add("%d");
  colHeaders.add("Avg. Change of Std. Centroids"); colTypes.add("double"); colFormat.add("%.5f");
  colHeaders.add("Within Cluster Sum Of Squares"); colTypes.add("double"); colFormat.add("%.5f");

  final int rows = output._avg_centroids_chg.length;
  TwoDimTable table =
      new TwoDimTable(
          "Scoring History",
          null,
          new String[rows],
          colHeaders.toArray(new String[0]),
          colTypes.toArray(new String[0]),
          colFormat.toArray(new String[0]),
          "");
  int row = 0;
  for (int i = 0; i < rows; i++) {
    int col = 0;
    assert (row < table.getRowDim());
    assert (col < table.getColDim());
    DateTimeFormatter fmt = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss");
    table.set(row, col++, fmt.print(output._training_time_ms[i]));
    table.set(row, col++, PrettyPrint.msecs(output._training_time_ms[i] - _start_time, true));
    table.set(row, col++, i);
    table.set(row, col++, output._avg_centroids_chg[i]);
    table.set(row, col++, output._history_withinss[i]);
    row++;
  }
  return table;
}
// Compute a sparse integer buffer
private byte[] bufS(final int valsz) {
  int log = 0;
  while ((1 << log) < valsz) ++log;
  assert valsz == 0 || (1 << log) == valsz;
  final int ridsz = _len >= 65535 ? 4 : 2;
  final int elmsz = ridsz + valsz;
  int off = CXIChunk._OFF;
  byte[] buf = MemoryManager.malloc1(off + sparseLen() * elmsz, true);
  for (int i = 0; i < sparseLen(); i++, off += elmsz) {
    if (ridsz == 2) UnsafeUtils.set2(buf, off, (short) _id[i]);
    else UnsafeUtils.set4(buf, off, _id[i]);
    if (valsz == 0) {
      assert _xs[i] == 0 && _ls[i] == 1;
      continue;
    }
    assert _xs[i] == Integer.MIN_VALUE || _xs[i] >= 0
        : "unexpected exponent " + _xs[i]; // assert we have int or NA
    final long lval = _xs[i] == Integer.MIN_VALUE ? NAS[log] : _ls[i] * PrettyPrint.pow10i(_xs[i]);
    switch (valsz) {
      case 1:
        buf[off + ridsz] = (byte) lval;
        break;
      case 2:
        short sval = (short) lval;
        UnsafeUtils.set2(buf, off + ridsz, sval);
        break;
      case 4:
        int ival = (int) lval;
        UnsafeUtils.set4(buf, off + ridsz, ival);
        break;
      case 8:
        UnsafeUtils.set8(buf, off + ridsz, lval);
        break;
      default:
        throw H2O.fail();
    }
  }
  assert off == buf.length;
  return buf;
}
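The sparse buffers built by bufS and bufD above share a simple layout: a fixed chunk header, then one record per stored non-zero consisting of a 2- or 4-byte row id followed by the value. A hedged sketch of the size arithmetic (the header size and all counts below are illustrative stand-ins, not the real constants):

// Hypothetical illustration of the sparse-buffer size arithmetic used by bufS/bufD.
public class SparseBufferLayout {
  public static void main(String[] args) {
    int off = 8;                       // stand-in for the CXIChunk._OFF header size (assumed)
    int len = 100_000;                 // logical chunk length (assumed)
    int sparseLen = 1_200;             // number of stored (non-zero) elements (assumed)
    int valsz = 4;                     // bytes per value (1, 2, 4 or 8)
    int ridsz = len >= 65535 ? 4 : 2;  // row ids need 4 bytes once rows no longer fit a short
    int elmsz = ridsz + valsz;         // bytes per stored record
    System.out.println("buffer size = " + (off + sparseLen * elmsz) + " bytes"); // 8 + 1200 * 8
  }
}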
public Job<Frame> execImpl() {
  if (integer_fraction + binary_fraction + categorical_fraction > 1)
    throw new IllegalArgumentException(
        "Integer, binary and categorical fractions must add up to <= 1.");
  if (Math.abs(missing_fraction) > 1)
    throw new IllegalArgumentException("Missing fraction must be between 0 and 1.");
  if (Math.abs(integer_fraction) > 1)
    throw new IllegalArgumentException("Integer fraction must be between 0 and 1.");
  if (Math.abs(binary_fraction) > 1)
    throw new IllegalArgumentException("Binary fraction must be between 0 and 1.");
  if (Math.abs(binary_ones_fraction) > 1)
    throw new IllegalArgumentException("Binary ones fraction must be between 0 and 1.");
  if (Math.abs(categorical_fraction) > 1)
    throw new IllegalArgumentException("Categorical fraction must be between 0 and 1.");
  if (categorical_fraction > 0 && factors <= 1)
    throw new IllegalArgumentException("Factors must be at least 2 for categorical data.");
  if (response_factors < 1)
    throw new IllegalArgumentException(
        "Response factors must be either 1 (real-valued response), or >=2 (factor levels).");
  if (response_factors > 1024)
    throw new IllegalArgumentException("Response factors must be <= 1024.");
  if (factors > 1000000)
    throw new IllegalArgumentException("Number of factors must be <= 1,000,000.");
  if (cols <= 0 || rows <= 0)
    throw new IllegalArgumentException("Must have number of rows > 0 and number of columns > 0.");

  // estimate byte size of the frame
  double byte_estimate =
      randomize
          ? rows * cols
                  * (binary_fraction * 1. / 8 // bits
                      + categorical_fraction * (factors < 128 ? 1 : factors < 32768 ? 2 : 4)
                      + integer_fraction
                          * (integer_range < 128
                              ? 1
                              : integer_range < 32768 ? 2 : integer_range < (1 << 31) ? 4 : 8)
                      + (1 - integer_fraction - binary_fraction - categorical_fraction) * 8) // reals
              + rows * 1 // response
          : 0; // all constants - should be small
  if (byte_estimate > H2O.CLOUD._memary[0].get_max_mem() * H2O.CLOUD.size())
    throw new IllegalArgumentException(
        "Frame is expected to require "
            + PrettyPrint.bytes((long) byte_estimate)
            + ", won't fit into H2O's memory.");
  if (!randomize) {
    if (integer_fraction != 0 || categorical_fraction != 0)
      throw new IllegalArgumentException(
          "Cannot have integer or categorical fractions > 0 unless randomize=true.");
  } else {
    if (value != 0)
      throw new IllegalArgumentException("Cannot set data to a constant value if randomize=true.");
  }
  if (_dest == null) throw new IllegalArgumentException("Destination key cannot be null.");
  FrameCreator fc = new FrameCreator(this, this._key);
  start(fc, fc.nChunks() * 5);
  return this;
}
@Ignore
@Test
public void matrixVecTest() {
  int rows = 2048;
  int cols = 8192;
  int loops = 5;
  int warmup_loops = 5;
  long seed = 0x533D;
  float nnz_ratio_vec = 0.01f; // fraction of non-zeroes for vector
  float nnz_ratio_mat = 0.1f; // fraction of non-zeroes for matrix

  float[] a = new float[rows * cols];
  float[] x = new float[cols];
  float[] y = new float[rows];
  float[] res = new float[rows];
  byte[] bits = new byte[rows];
  for (int row = 0; row < rows; ++row) {
    y[row] = 0;
    res[row] = 0;
    bits[row] = (byte) ("abcdefghijklmnopqrstuvwxyz".toCharArray()[row % 26]);
  }
  Random rng = new Random(seed);
  for (int col = 0; col < cols; ++col)
    if (rng.nextFloat() < nnz_ratio_vec) x[col] = ((float) col) / cols;
  for (int row = 0; row < rows; ++row) {
    int off = row * cols;
    for (int col = 0; col < cols; ++col) {
      if (rng.nextFloat() < nnz_ratio_mat) a[off + col] = ((float) (row + col)) / cols;
    }
  }

  Storage.DenseRowMatrix dra = new Storage.DenseRowMatrix(a, rows, cols);
  Storage.DenseColMatrix dca = new Storage.DenseColMatrix(dra, rows, cols);
  Storage.SparseRowMatrix sra = new Storage.SparseRowMatrix(dra, rows, cols);
  Storage.SparseColMatrix sca = new Storage.SparseColMatrix(dca, rows, cols);
  Storage.DenseVector dx = new Storage.DenseVector(x);
  Storage.DenseVector dy = new Storage.DenseVector(y);
  Storage.DenseVector dres = new Storage.DenseVector(res);
  Storage.SparseVector sx = new Storage.SparseVector(x);

  /* warmup */
  System.out.println("warming up.");
  float sum = 0;
  for (int l = 0; l < warmup_loops; ++l) {
    gemv_naive(res, a, x, y, bits);
    sum += res[rows / 2];
  }
  for (int l = 0; l < warmup_loops; ++l) {
    gemv_naive(dres, dra, dx, dy, bits);
    sum += res[rows / 2];
  }
  for (int l = 0; l < warmup_loops; ++l) {
    gemv_row_optimized(res, a, x, y, bits);
    sum += res[rows / 2];
  }
  for (int l = 0; l < warmup_loops; ++l) {
    gemv(dres, dca, dx, dy, bits);
    sum += res[rows / 2];
  }
  for (int l = 0; l < warmup_loops; ++l) {
    gemv(dres, dra, sx, dy, bits);
    sum += res[rows / 2];
  }
  for (int l = 0; l < warmup_loops; ++l) {
    gemv(dres, dca, sx, dy, bits);
    sum += res[rows / 2];
  }
  for (int l = 0; l < warmup_loops; ++l) {
    gemv(dres, sra, sx, dy, bits);
    sum += res[rows / 2];
  }
  for (int l = 0; l < warmup_loops; ++l) {
    gemv(dres, sca, sx, dy, bits);
    sum += res[rows / 2];
  }

  /* naive version */
  System.out.println("\nstarting naive.");
  sum = 0;
  long start = System.currentTimeMillis();
  for (int l = 0; l < loops; ++l) {
    gemv_naive(res, a, x, y, bits);
    sum += res[rows / 2]; // do something useful
  }
  System.out.println("result: " + sum + " and " + ArrayUtils.sum(res));
  System.out.println("naive time: " + PrettyPrint.msecs(System.currentTimeMillis() - start, true));

  System.out.println("\nstarting dense row * dense.");
  sum = 0;
  start = System.currentTimeMillis();
  for (int l = 0; l < loops; ++l) {
    gemv_naive(dres, dra, dx, dy, bits);
    sum += res[rows / 2]; // do something useful
  }
  System.out.println("result: " + sum + " and " + ArrayUtils.sum(res));
  System.out.println(
      "dense row * dense time: " + PrettyPrint.msecs(System.currentTimeMillis() - start, true));

  System.out.println("\nstarting optimized dense row * dense.");
  sum = 0;
  start = System.currentTimeMillis();
  for (int l = 0; l < loops; ++l) {
    gemv_row_optimized(res, a, x, y, bits);
    sum += res[rows / 2]; // do something useful
  }
  System.out.println("result: " + sum + " and " + ArrayUtils.sum(res));
  System.out.println(
      "optimized dense row * dense time: "
          + PrettyPrint.msecs(System.currentTimeMillis() - start, true));

  System.out.println("\nstarting dense col * dense.");
  sum = 0;
  start = System.currentTimeMillis();
  for (int l = 0; l < loops; ++l) {
    gemv(dres, dca, dx, dy, bits);
    sum += res[rows / 2]; // do something useful
  }
  System.out.println("result: " + sum + " and " + ArrayUtils.sum(res));
  System.out.println(
      "dense col * dense time: " + PrettyPrint.msecs(System.currentTimeMillis() - start, true));

  System.out.println("\nstarting dense row * sparse.");
  sum = 0;
  start = System.currentTimeMillis();
  for (int l = 0; l < loops; ++l) {
    gemv(dres, dra, sx, dy, bits);
    sum += res[rows / 2]; // do something useful
  }
  System.out.println("result: " + sum + " and " + ArrayUtils.sum(res));
  System.out.println(
      "dense row * sparse time: " + PrettyPrint.msecs(System.currentTimeMillis() - start, true));

  System.out.println("\nstarting dense col * sparse.");
  sum = 0;
  start = System.currentTimeMillis();
  for (int l = 0; l < loops; ++l) {
    gemv(dres, dca, sx, dy, bits);
    sum += res[rows / 2]; // do something useful
  }
  System.out.println("result: " + sum + " and " + ArrayUtils.sum(res));
  System.out.println(
      "dense col * sparse time: " + PrettyPrint.msecs(System.currentTimeMillis() - start, true));

  System.out.println("\nstarting sparse row * sparse.");
  sum = 0;
  start = System.currentTimeMillis();
  for (int l = 0; l < loops; ++l) {
    gemv(dres, sra, sx, dy, bits);
    sum += res[rows / 2]; // do something useful
  }
  System.out.println("result: " + sum + " and " + ArrayUtils.sum(res));
  System.out.println(
      "sparse row * sparse time: " + PrettyPrint.msecs(System.currentTimeMillis() - start, true));

  System.out.println("\nstarting sparse col * sparse.");
  sum = 0;
  start = System.currentTimeMillis();
  for (int l = 0; l < loops; ++l) {
    gemv(dres, sca, sx, dy, bits);
    sum += res[rows / 2]; // do something useful
  }
  System.out.println("result: " + sum + " and " + ArrayUtils.sum(res));
  System.out.println(
      "sparse col * sparse time: " + PrettyPrint.msecs(System.currentTimeMillis() - start, true));
}
/**
 * Compute the actual train_samples_per_iteration size from the user-given parameter
 *
 * @param mp Model parameters (DeepLearning object)
 * @param numRows number of training rows
 * @param model DL model
 * @return The total number of training rows to be processed per iteration (summed over all nodes)
 */
private long computeTrainSamplesPerIteration(
    final DeepLearningParameters mp, final long numRows, final DeepLearningModel model) {
  long tspi = mp._train_samples_per_iteration;
  assert (tspi == 0 || tspi == -1 || tspi == -2 || tspi >= 1);
  if (tspi == 0 || (!mp._replicate_training_data && tspi == -1)) {
    tspi = numRows;
    if (!mp._quiet_mode)
      Log.info(
          "Setting train_samples_per_iteration ("
              + mp._train_samples_per_iteration
              + ") to one epoch: #rows ("
              + tspi
              + ").");
  } else if (tspi == -1) {
    tspi = (mp._single_node_mode ? 1 : H2O.CLOUD.size()) * numRows;
    if (!mp._quiet_mode)
      Log.info(
          "Setting train_samples_per_iteration ("
              + mp._train_samples_per_iteration
              + ") to #nodes x #rows ("
              + tspi
              + ").");
  } else if (tspi == -2) {
    // automatic tuning based on CPU speed, network speed and model size

    // measure cpu speed
    double total_gflops = 0;
    for (H2ONode h2o : H2O.CLOUD._memary) {
      HeartBeat hb = h2o._heartbeat;
      total_gflops += hb._gflops; // can be NaN if not yet run
    }
    if (mp._single_node_mode) total_gflops /= H2O.CLOUD.size();
    if (Double.isNaN(total_gflops)) {
      total_gflops =
          Linpack.run(H2O.SELF._heartbeat._cpus_allowed)
              * (mp._single_node_mode ? 1 : H2O.CLOUD.size());
    }
    assert (!Double.isNaN(total_gflops));

    final long model_size = model.model_info().size();
    int[] msg_sizes =
        new int[] {
          1,
          (int) (model_size * 4) == (model_size * 4) ? (int) (model_size * 4) : Integer.MAX_VALUE
        };
    double[] microseconds_collective = new double[msg_sizes.length];
    NetworkTest.NetworkTester nt =
        new NetworkTest.NetworkTester(
            msg_sizes,
            null,
            microseconds_collective,
            model_size > 1e6 ? 1 : 5 /*repeats*/,
            false,
            true /*only collectives*/);
    nt.compute2();

    // length of the network traffic queue based on log-tree rollup (2 log(nodes))
    int network_queue_length =
        mp._single_node_mode || H2O.CLOUD.size() == 1
            ? 1
            : 2 * (int) Math.floor(Math.log(H2O.CLOUD.size()) / Math.log(2));

    // heuristics
    double flops_overhead_per_row = 50;
    if (mp._activation == DeepLearningParameters.Activation.Maxout
        || mp._activation == DeepLearningParameters.Activation.MaxoutWithDropout) {
      flops_overhead_per_row *= 8;
    } else if (mp._activation == DeepLearningParameters.Activation.Tanh
        || mp._activation == DeepLearningParameters.Activation.TanhWithDropout) {
      flops_overhead_per_row *= 5;
    }

    // target fraction of comm vs cpu time: 5%
    // in single-node mode, there's no model averaging effect, so less need to shorten the M/R iteration
    double fraction =
        mp._single_node_mode || H2O.CLOUD.size() == 1 ? 1e-3 : mp._target_ratio_comm_to_comp;

    // estimate the time for communication (network) and training (compute)
    model.time_for_communication_us =
        (H2O.CLOUD.size() == 1
                ? 1e4 /* add 10ms for single-node */
                : 1e5 /* add 100ms for multi-node MR overhead */)
            + network_queue_length * microseconds_collective[1];
    double time_per_row_us =
        (flops_overhead_per_row * model_size + 10000 * model.model_info().units[0])
            / (total_gflops * 1e9)
            / H2O.SELF._heartbeat._cpus_allowed
            * 1e6;
    assert (!Double.isNaN(time_per_row_us));

    // compute the optimal number of training rows per iteration
    // fraction := time_comm_us / (time_comm_us + tspi * time_per_row_us)
    //   ==> tspi = (time_comm_us/fraction - time_comm_us) / time_per_row_us
    tspi =
        (long)
            ((model.time_for_communication_us / fraction - model.time_for_communication_us)
                / time_per_row_us);

    tspi =
        Math.min(
            tspi,
            (mp._single_node_mode ? 1 : H2O.CLOUD.size())
                * numRows
                * 10); // not more than 10x of what train_samples_per_iteration=-1 would do

    // If the number is close to a multiple of epochs, use that -> prettier scoring
    if (tspi > numRows && Math.abs(tspi % numRows) / (double) numRows < 0.2) tspi -= tspi % numRows;
    tspi =
        Math.min(
            tspi,
            (long) (mp._epochs * numRows / 10)); // limit to number of epochs desired, but at least 10 iterations total
    if (H2O.CLOUD.size() == 1 || mp._single_node_mode) {
      tspi =
          Math.min(
              tspi,
              10 * (int) (1e6 / time_per_row_us)); // in single-node mode, only run for at most 10 seconds
    }
    tspi = Math.max(1, tspi); // at least 1 row
    tspi =
        Math.min(
            100000 * H2O.CLOUD.size(),
            tspi); // at most 100k rows per node for initial guess - can always relax later on

    if (!mp._quiet_mode) {
      Log.info("Auto-tuning parameter 'train_samples_per_iteration':");
      Log.info("Estimated compute power : " + Math.round(total_gflops * 100) / 100 + " GFlops");
      Log.info(
          "Estimated time for comm : " + PrettyPrint.usecs((long) model.time_for_communication_us));
      Log.info(
          "Estimated time per row : "
              + ((long) time_per_row_us > 0
                  ? PrettyPrint.usecs((long) time_per_row_us)
                  : time_per_row_us + " usecs"));
      Log.info("Estimated training speed: " + (int) (1e6 / time_per_row_us) + " rows/sec");
      Log.info(
          "Setting train_samples_per_iteration ("
              + mp._train_samples_per_iteration
              + ") to auto-tuned value: "
              + tspi);
    }
  } else {
    // limit user-given value to number of epochs desired
    tspi = Math.max(1, Math.min(tspi, (long) (mp._epochs * numRows)));
  }
  assert (tspi != 0 && tspi != -1 && tspi != -2 && tspi >= 1);
  model.tspiGuess = tspi;
  return tspi;
}
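The auto-tuning formula in the comment above can be checked with a quick calculation (illustrative numbers only, not measured values): with a 5% target communication fraction, 100 ms of communication per iteration and 20 microseconds of compute per row, the tuner asks for roughly 95,000 rows per iteration.

// Hypothetical numbers plugged into the auto-tuning formula:
//   fraction = time_comm / (time_comm + tspi * time_per_row)
//   ==> tspi = (time_comm / fraction - time_comm) / time_per_row
public class TspiFormulaDemo {
  public static void main(String[] args) {
    double time_comm_us = 100_000;  // 100 ms of communication per iteration (assumed)
    double time_per_row_us = 20;    // 20 microseconds of compute per training row (assumed)
    double fraction = 0.05;         // target ratio of comm time to total time
    long tspi = (long) ((time_comm_us / fraction - time_comm_us) / time_per_row_us);
    System.out.println("train_samples_per_iteration ~= " + tspi); // prints 95000
  }
}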
/**
 * Train a Deep Learning neural net model
 *
 * @param model Input model (e.g., from initModel(), or from a previous training run)
 * @return Trained model
 */
public final DeepLearningModel trainModel(DeepLearningModel model) {
  Frame validScoreFrame = null;
  Frame train, trainScoreFrame;
  try {
    // if (checkpoint == null && !quiet_mode) logStart(); // if checkpoint is given, some
    // Job's params might be uninitialized (but the restarted model's parameters are correct)
    if (model == null) {
      model = DKV.get(dest()).get();
    }
    Log.info(
        "Model category: "
            + (_parms._autoencoder
                ? "Auto-Encoder"
                : isClassifier() ? "Classification" : "Regression"));
    final long model_size = model.model_info().size();
    Log.info("Number of model parameters (weights/biases): " + String.format("%,d", model_size));
    model.write_lock(_job);
    _job.update(0, "Setting up training data...");
    final DeepLearningParameters mp = model.model_info().get_params();

    // temporary frames of the same "name" as the orig _train/_valid (asking the parameter's
    // Key, not the actual frame)
    // Note: don't put into DKV or they would overwrite the _train/_valid frames!
    Frame tra_fr = new Frame(mp._train, _train.names(), _train.vecs());
    Frame val_fr = _valid != null ? new Frame(mp._valid, _valid.names(), _valid.vecs()) : null;

    train = tra_fr;
    if (model._output.isClassifier() && mp._balance_classes) {
      _job.update(0, "Balancing class distribution of training data...");
      float[] trainSamplingFactors =
          new float[train.lastVec().domain().length]; // leave initialized to 0 -> will be filled up below
      if (mp._class_sampling_factors != null) {
        if (mp._class_sampling_factors.length != train.lastVec().domain().length)
          throw new IllegalArgumentException(
              "class_sampling_factors must have " + train.lastVec().domain().length + " elements");
        trainSamplingFactors = mp._class_sampling_factors.clone(); // clone: don't modify the original
      }
      train =
          sampleFrameStratified(
              train,
              train.lastVec(),
              train.vec(model._output.weightsName()),
              trainSamplingFactors,
              (long) (mp._max_after_balance_size * train.numRows()),
              mp._seed,
              true,
              false);
      Vec l = train.lastVec();
      Vec w = train.vec(model._output.weightsName());
      MRUtils.ClassDist cd = new MRUtils.ClassDist(l);
      model._output._modelClassDist =
          _weights != null ? cd.doAll(l, w).rel_dist() : cd.doAll(l).rel_dist();
    }
    model.training_rows = train.numRows();
    if (_weights != null && _weights.min() == 0 && _weights.max() == 1 && _weights.isInt()) {
      model.training_rows = Math.round(train.numRows() * _weights.mean());
      Log.warn(
          "Not counting "
              + (train.numRows() - model.training_rows)
              + " rows with weight=0 towards an epoch.");
    }
    Log.info("One epoch corresponds to " + model.training_rows + " training data rows.");
    trainScoreFrame =
        sampleFrame(
            train,
            mp._score_training_samples,
            mp._seed); // training scoring dataset is always sampled uniformly from the training dataset
    if (trainScoreFrame != train) Scope.track(trainScoreFrame);

    if (!_parms._quiet_mode)
      Log.info("Number of chunks of the training data: " + train.anyVec().nChunks());
    if (val_fr != null) {
      model.validation_rows = val_fr.numRows();
      // validation scoring dataset can be sampled in multiple ways from the given validation dataset
      if (model._output.isClassifier()
          && mp._balance_classes
          && mp._score_validation_sampling
              == DeepLearningParameters.ClassSamplingMethod.Stratified) {
        _job.update(0, "Sampling validation data (stratified)...");
        validScoreFrame =
            sampleFrameStratified(
                val_fr,
                val_fr.lastVec(),
                val_fr.vec(model._output.weightsName()),
                null,
                mp._score_validation_samples > 0 ? mp._score_validation_samples : val_fr.numRows(),
                mp._seed + 1,
                false /* no oversampling */,
                false);
      } else {
        _job.update(0, "Sampling validation data...");
        validScoreFrame = sampleFrame(val_fr, mp._score_validation_samples, mp._seed + 1);
        if (validScoreFrame != val_fr) Scope.track(validScoreFrame);
      }
      if (!_parms._quiet_mode)
        Log.info("Number of chunks of the validation data: " + validScoreFrame.anyVec().nChunks());
    }

    // Set train_samples_per_iteration size (cannot be done earlier since this depends on
    // whether stratified sampling is done)
    model.actual_train_samples_per_iteration =
        computeTrainSamplesPerIteration(mp, model.training_rows, model);

    // Determine whether shuffling is enforced
    if (mp._replicate_training_data
        && (model.actual_train_samples_per_iteration
            == model.training_rows * (mp._single_node_mode ? 1 : H2O.CLOUD.size()))
        && !mp._shuffle_training_data
        && H2O.CLOUD.size() > 1
        && !mp._reproducible) {
      if (!mp._quiet_mode)
        Log.info(
            "Enabling training data shuffling, because all nodes train on the full dataset (replicated training data).");
      mp._shuffle_training_data = true;
    }
    if (!mp._shuffle_training_data
        && model.actual_train_samples_per_iteration == model.training_rows
        && train.anyVec().nChunks() == 1) {
      if (!mp._quiet_mode)
        Log.info(
            "Enabling training data shuffling to avoid training rows in the same order over and over (no Hogwild since there's only 1 chunk).");
      mp._shuffle_training_data = true;
    }

    // if (!mp._quiet_mode) Log.info("Initial model:\n" + model.model_info());
    long now = System.currentTimeMillis();
    model._timeLastIterationEnter = now;
    if (_parms._autoencoder) {
      _job.update(0, "Scoring null model of autoencoder...");
      if (!mp._quiet_mode) Log.info("Scoring the null model of the autoencoder.");
      model.doScoring(
          trainScoreFrame, validScoreFrame, _job._key, 0, false); // get the null model reconstruction error
    }
    // put the initial version of the model into DKV
    model.update(_job);
    model.total_setup_time_ms += now - _job.start_time();
    Log.info("Total setup time: " + PrettyPrint.msecs(model.total_setup_time_ms, true));
    Log.info("Starting to train the Deep Learning model.");
    _job.update(0, "Training...");

    // main loop
    for (; ; ) {
      model.iterations++;
      model.set_model_info(
          mp._epochs == 0
              ? model.model_info()
              : H2O.CLOUD.size() > 1 && mp._replicate_training_data
                  ? (mp._single_node_mode
                      ? new DeepLearningTask2(
                              _job._key,
                              train,
                              model.model_info(),
                              rowFraction(train, mp, model),
                              model.iterations)
                          .doAll(Key.make(H2O.SELF))
                          .model_info() // replicated data + single node mode
                      : new DeepLearningTask2(
                              _job._key,
                              train,
                              model.model_info(),
                              rowFraction(train, mp, model),
                              model.iterations)
                          .doAllNodes()
                          .model_info()) // replicated data + multi-node mode
                  : new DeepLearningTask(
                          _job._key,
                          model.model_info(),
                          rowFraction(train, mp, model),
                          model.iterations)
                      .doAll(train)
                      .model_info()); // distributed data (always in multi-node mode)
      if (stop_requested() && !timeout()) break; // cancellation
      if (!model.doScoring(trainScoreFrame, validScoreFrame, _job._key, model.iterations, false))
        break; // finished training (or early stopping or convergence)
      if (timeout()) break; // stop after scoring
    }

    // replace the model with the best model so far (if it's better)
    if (!stop_requested()
        && _parms._overwrite_with_best_model
        && model.actual_best_model_key != null
        && _parms._nfolds == 0) {
      DeepLearningModel best_model = DKV.getGet(model.actual_best_model_key);
      if (best_model != null
          && best_model.loss() < model.loss()
          && Arrays.equals(best_model.model_info().units, model.model_info().units)) {
        if (!_parms._quiet_mode)
          Log.info("Setting the model to be the best model so far (based on scoring history).");
        DeepLearningModelInfo mi = best_model.model_info().deep_clone();
        // Don't cheat - count full amount of training samples, since that's the amount of
        // training it took to train (without finding anything better)
        mi.set_processed_global(model.model_info().get_processed_global());
        mi.set_processed_local(model.model_info().get_processed_local());
        model.set_model_info(mi);
        model.update(_job);
        model.doScoring(trainScoreFrame, validScoreFrame, _job._key, model.iterations, true);
        assert (best_model.loss() == model.loss());
      }
    }
    // store coefficient names for future use
    // possibly change
    model.model_info().data_info().coefNames();

    if (!_parms._quiet_mode) {
      Log.info(
          "==============================================================================================================================================================================");
      if (stop_requested()) {
        Log.info("Deep Learning model training was interrupted.");
      } else {
        Log.info("Finished training the Deep Learning model.");
        Log.info(model);
      }
      Log.info(
          "==============================================================================================================================================================================");
    }
  } finally {
    if (model != null) {
      model.deleteElasticAverageModels();
      model.unlock(_job);
      if (model.actual_best_model_key != null) {
        assert (model.actual_best_model_key != model._key);
        DKV.remove(model.actual_best_model_key);
      }
    }
  }
  return model;
}
private Chunk compress2() {
  // Check for basic mode info: all missing or all strings or mixed stuff
  byte mode = type();
  if (mode == Vec.T_BAD) // ALL NAs, nothing to do
    return new C0DChunk(Double.NaN, sparseLen());
  if (mode == Vec.T_STR) return new CStrChunk(_sslen, _ss, sparseLen(), _len, _is, _isAllASCII);
  boolean rerun = false;
  if (mode == Vec.T_CAT) {
    // Smack any mismatched string/numbers
    for (int i = 0; i < sparseLen(); i++)
      if (isCategorical2(i)) _xs[i] = 0;
      else if (!isNA2(i)) {
        setNA_impl2(i);
        ++_naCnt;
      }
  } else if (mode == Vec.T_NUM) {
    for (int i = 0; i < sparseLen(); i++)
      if (isCategorical2(i)) {
        setNA_impl2(i);
        rerun = true;
      }
  }
  if (rerun) {
    _naCnt = -1;
    type();
  } // Re-run rollups after dropping all numbers/categoricals

  boolean sparse = false;
  // sparse? treat as sparse iff we have at least MIN_SPARSE_RATIOx more zeros than nonzeros
  if (_sparseRatio * (_naCnt + _nzCnt) < _len) {
    set_sparse(_naCnt + _nzCnt);
    sparse = true;
  } else if (sparseLen() != _len) cancel_sparse();

  // If the data is UUIDs there's not much compression going on
  if (_ds != null && _ls != null) return chunkUUID();
  // cut out the easy all NaNs case
  if (_naCnt == _len) return new C0DChunk(Double.NaN, _len);

  // If the data was set8 as doubles, we do a quick check to see if it's
  // plain longs. If not, we give up and use doubles.
  if (_ds != null) {
    int i; // check if we can flip to ints
    for (i = 0; i < sparseLen(); ++i)
      if (!Double.isNaN(_ds[i]) && (double) (long) _ds[i] != _ds[i]) break;
    boolean isInteger = i == sparseLen();
    boolean isConstant = !sparse || sparseLen() == 0;
    double constVal = 0;
    if (!sparse) {
      // check the values, sparse with some nonzeros can not be constant - has 0s and (at least 1) nonzero
      constVal = _ds[0];
      for (int j = 1; j < _len; ++j)
        if (_ds[j] != constVal) {
          isConstant = false;
          break;
        }
    }
    if (isConstant)
      return isInteger ? new C0LChunk((long) constVal, _len) : new C0DChunk(constVal, _len);
    if (!isInteger) return sparse ? new CXDChunk(_len, sparseLen(), 8, bufD(8)) : chunkD();
    // Else flip to longs
    _ls = new long[_ds.length];
    _xs = new int[_ds.length];
    double[] ds = _ds;
    _ds = null;
    final int naCnt = _naCnt;
    for (i = 0; i < sparseLen(); i++) // Inject all doubles into longs
      if (Double.isNaN(ds[i])) setNA_impl2(i);
      else _ls[i] = (long) ds[i];
    // setNA_impl2 will set _naCnt to -1!
    // we already know what the naCnt is (it did not change!) so set it back to correct value
    _naCnt = naCnt;
  }

  // IF (_len > _sparseLen) THEN Sparse
  // Check for compressed *during appends*. Here we know:
  // - No specials; _xs[]==0.
  // - No floats; _ds==null
  // - NZ length in _sparseLen, actual length in _len.
  // - Huge ratio between _len and _sparseLen, and we do NOT want to inflate to
  //   the larger size; we need to keep it all small all the time.
  // - Rows in _xs

  // Data in some fixed-point format, not doubles
  // See if we can sanely normalize all the data to the same fixed-point.
  int xmin = Integer.MAX_VALUE; // min exponent found
  boolean floatOverflow = false;
  double min = Double.POSITIVE_INFINITY;
  double max = Double.NEGATIVE_INFINITY;
  int p10iLength = PrettyPrint.powers10i.length;
  long llo = Long.MAX_VALUE, lhi = Long.MIN_VALUE;
  int xlo = Integer.MAX_VALUE, xhi = Integer.MIN_VALUE;

  for (int i = 0; i < sparseLen(); i++) {
    if (isNA2(i)) continue;
    long l = _ls[i];
    int x = _xs[i];
    assert x != Integer.MIN_VALUE : "l = " + l + ", x = " + x;
    if (x == Integer.MIN_VALUE + 1) x = 0; // Replace categorical flag with no scaling
    assert l != 0 || x == 0 : "l == 0 while x = " + x + " ls = " + Arrays.toString(_ls);
    // Exponent of zero is always zero
    long t; // Remove extra scaling
    while (l != 0 && (t = l / 10) * 10 == l) {
      l = t;
      x++;
    }
    // Compute per-chunk min/max
    double d = l * PrettyPrint.pow10(x);
    if (d < min) {
      min = d;
      llo = l;
      xlo = x;
    }
    if (d > max) {
      max = d;
      lhi = l;
      xhi = x;
    }
    floatOverflow = l < Integer.MIN_VALUE + 1 || l > Integer.MAX_VALUE;
    xmin = Math.min(xmin, x);
  }
  if (sparse) { // sparse? then compare vs implied 0s
    if (min > 0) {
      min = 0;
      llo = 0;
      xlo = 0;
    }
    if (max < 0) {
      max = 0;
      lhi = 0;
      xhi = 0;
    }
    xmin = Math.min(xmin, 0);
  }

  // Constant column?
  if (_naCnt == 0 && (min == max)) {
    if (llo == lhi && xlo == 0 && xhi == 0) return new C0LChunk(llo, _len);
    else if ((long) min == min) return new C0LChunk((long) min, _len);
    else return new C0DChunk(min, _len);
  }

  // Compute min & max, as scaled integers in the xmin scale.
  // Check for overflow along the way
  boolean overflow = ((xhi - xmin) >= p10iLength) || ((xlo - xmin) >= p10iLength);
  long lemax = 0, lemin = 0;
  if (!overflow) { // Can at least get the power-of-10 without overflow
    long pow10 = PrettyPrint.pow10i(xhi - xmin);
    lemax = lhi * pow10;
    // Hacker's Delight, Section 2-13, checking overflow.
    // Note that the power-10 is always positive, so the test devolves this:
    if ((lemax / pow10) != lhi) overflow = true;
    // Note that xlo might be > xmin; e.g. { 101e-49 , 1e-48}.
    long pow10lo = PrettyPrint.pow10i(xlo - xmin);
    lemin = llo * pow10lo;
    if ((lemin / pow10lo) != llo) overflow = true;
  }

  // Boolean column?
  if (max == 1 && min == 0 && xmin == 0 && !overflow) {
    if (sparse) { // Very sparse?
      return _naCnt == 0
          ? new CX0Chunk(_len, sparseLen(), bufS(0)) // No NAs, can store as sparse bitvector
          : new CXIChunk(_len, sparseLen(), 1, bufS(1)); // have NAs, store as sparse 1byte values
    }
    int bpv = _catCnt + _naCnt > 0 ? 2 : 1; // Bit-vector
    byte[] cbuf = bufB(bpv);
    return new CBSChunk(cbuf, cbuf[0], cbuf[1]);
  }

  final boolean fpoint = xmin < 0 || min < Long.MIN_VALUE || max > Long.MAX_VALUE;
  if (sparse) {
    if (fpoint) return new CXDChunk(_len, sparseLen(), 8, bufD(8));
    int sz = 8;
    if (Short.MIN_VALUE <= min && max <= Short.MAX_VALUE) sz = 2;
    else if (Integer.MIN_VALUE <= min && max <= Integer.MAX_VALUE) sz = 4;
    return new CXIChunk(_len, sparseLen(), sz, bufS(sz));
  }

  // Exponent scaling: replacing numbers like 1.3 with 13e-1. '13' fits in a
  // byte and we scale the column by 0.1. A set of numbers like
  // {1.2,23,0.34} then is normalized to always be represented with 2 digits
  // to the right: {1.20,23.00,0.34} and we scale by 100: {120,2300,34}.
  // This set fits in a 2-byte short.
  //
  // We use exponent-scaling for bytes & shorts only; it's uncommon (and not
  // worth it) for larger numbers. We need to get the exponents to be
  // uniform, so we scale up the largest lmax by the largest scale we need
  // and if that fits in a byte/short - then it's worth compressing. Other
  // wise we just flip to a float or double representation.
  if (overflow || (fpoint && floatOverflow) || -35 > xmin || xmin > 35) return chunkD();
  final long leRange = leRange(lemin, lemax);
  if (fpoint) {
    if ((int) lemin == lemin && (int) lemax == lemax) {
      if (leRange < 255) // Fits in scaled biased byte?
        return new C1SChunk(bufX(lemin, xmin, C1SChunk._OFF, 0), lemin, PrettyPrint.pow10(xmin));
      if (leRange < 65535) { // we use signed 2B short, add -32k to the bias!
        long bias = 32767 + lemin;
        return new C2SChunk(bufX(bias, xmin, C2SChunk._OFF, 1), bias, PrettyPrint.pow10(xmin));
      }
    }
    if (leRange < 4294967295L) {
      long bias = 2147483647L + lemin;
      return new C4SChunk(bufX(bias, xmin, C4SChunk._OFF, 2), bias, PrettyPrint.pow10(xmin));
    }
    return chunkD();
  } // else an integer column

  // Compress column into a byte
  if (xmin == 0 && 0 <= lemin && lemax <= 255 && ((_naCnt + _catCnt) == 0))
    return new C1NChunk(bufX(0, 0, C1NChunk._OFF, 0));
  if (lemin < Integer.MIN_VALUE) return new C8Chunk(bufX(0, 0, 0, 3));
  if (leRange < 255) { // Span fits in a byte?
    if (0 <= min && max < 255) // Span fits in an unbiased byte?
      return new C1Chunk(bufX(0, 0, C1Chunk._OFF, 0));
    return new C1SChunk(bufX(lemin, xmin, C1SChunk._OFF, 0), lemin, PrettyPrint.pow10i(xmin));
  }

  // Compress column into a short
  if (leRange < 65535) { // Span fits in a biased short?
    if (xmin == 0 && Short.MIN_VALUE < lemin && lemax <= Short.MAX_VALUE) // Span fits in an unbiased short?
      return new C2Chunk(bufX(0, 0, C2Chunk._OFF, 1));
    long bias = (lemin - (Short.MIN_VALUE + 1));
    return new C2SChunk(bufX(bias, xmin, C2SChunk._OFF, 1), bias, PrettyPrint.pow10i(xmin));
  }
  // Compress column into ints
  if (Integer.MIN_VALUE < min && max <= Integer.MAX_VALUE) return new C4Chunk(bufX(0, 0, 0, 2));
  return new C8Chunk(bufX(0, 0, 0, 3));
}
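The exponent-scaling comment above maps directly to the chunk choice: the column {1.2, 23, 0.34} normalizes to mantissas {120, 2300, 34} with a common exponent of -2 and a scaled range of 2266, so it lands in a 2-byte-per-value scaled chunk rather than an 8-byte double chunk. A hypothetical stand-alone sketch of that normalization (illustration only, not the NewChunk code itself):

// Hypothetical sketch of the exponent-scaling idea: bring all values to the
// smallest exponent seen, then check whether the scaled integer range fits a short.
public class ExponentScalingDemo {
  public static void main(String[] args) {
    long[] mantissas = {12, 23, 34}; // 1.2, 23, 0.34 as (mantissa, exponent) pairs
    int[] exponents = {-1, 0, -2};
    int xmin = Integer.MAX_VALUE;
    for (int x : exponents) xmin = Math.min(xmin, x); // common exponent = -2
    long lemin = Long.MAX_VALUE, lemax = Long.MIN_VALUE;
    for (int i = 0; i < mantissas.length; i++) {
      long scaled = mantissas[i] * (long) Math.pow(10, exponents[i] - xmin); // 120, 2300, 34
      lemin = Math.min(lemin, scaled);
      lemax = Math.max(lemax, scaled);
    }
    long range = lemax - lemin; // 2266 < 65535, so 2 bytes per value suffice
    System.out.println("scale 10^" + xmin + ", range " + range + " -> fits a short");
  }
}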
private TwoDimTable createScoringHistoryTable(SharedTreeModel.SharedTreeOutput _output) {
  List<String> colHeaders = new ArrayList<>();
  List<String> colTypes = new ArrayList<>();
  List<String> colFormat = new ArrayList<>();
  colHeaders.add("Timestamp"); colTypes.add("string"); colFormat.add("%s");
  colHeaders.add("Duration"); colTypes.add("string"); colFormat.add("%s");
  colHeaders.add("Number of Trees"); colTypes.add("long"); colFormat.add("%d");
  colHeaders.add("Training MSE"); colTypes.add("double"); colFormat.add("%.5f");
  if (_output.isClassifier()) {
    colHeaders.add("Training LogLoss"); colTypes.add("double"); colFormat.add("%.5f");
  }
  if (_output.getModelCategory() == ModelCategory.Binomial) {
    colHeaders.add("Training AUC"); colTypes.add("double"); colFormat.add("%.5f");
  }
  if (_output.getModelCategory() == ModelCategory.Binomial
      || _output.getModelCategory() == ModelCategory.Multinomial) {
    colHeaders.add("Training Classification Error"); colTypes.add("double"); colFormat.add("%.5f");
  }
  if (valid() != null) {
    colHeaders.add("Validation MSE"); colTypes.add("double"); colFormat.add("%.5f");
    if (_output.isClassifier()) {
      colHeaders.add("Validation LogLoss"); colTypes.add("double"); colFormat.add("%.5f");
    }
    if (_output.getModelCategory() == ModelCategory.Binomial) {
      colHeaders.add("Validation AUC"); colTypes.add("double"); colFormat.add("%.5f");
    }
    if (_output.isClassifier()) {
      colHeaders.add("Validation Classification Error"); colTypes.add("double"); colFormat.add("%.5f");
    }
  }

  int rows = 0;
  for (int i = 1; i < _output._scored_train.length; i++) {
    if (!Double.isNaN(_output._scored_train[i]._mse)) ++rows;
  }
  TwoDimTable table =
      new TwoDimTable(
          "Scoring History",
          null,
          new String[rows],
          colHeaders.toArray(new String[0]),
          colTypes.toArray(new String[0]),
          colFormat.toArray(new String[0]),
          "");
  int row = 0;
  for (int i = 1; i < _output._scored_train.length; i++) {
    if (Double.isNaN(_output._scored_train[i]._mse)) continue;
    int col = 0;
    assert (row < table.getRowDim());
    assert (col < table.getColDim());
    DateTimeFormatter fmt = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss");
    table.set(row, col++, fmt.print(_output._training_time_ms[i]));
    table.set(row, col++, PrettyPrint.msecs(_output._training_time_ms[i] - _start_time, true));
    table.set(row, col++, i);
    ScoreKeeper st = _output._scored_train[i];
    table.set(row, col++, st._mse);
    if (_output.isClassifier()) table.set(row, col++, st._logloss);
    if (_output.getModelCategory() == ModelCategory.Binomial) table.set(row, col++, st._AUC);
    if (_output.isClassifier()) table.set(row, col++, st._classError);
    if (_valid != null) {
      st = _output._scored_valid[i];
      table.set(row, col++, st._mse);
      if (_output.isClassifier()) table.set(row, col++, st._logloss);
      if (_output.getModelCategory() == ModelCategory.Binomial) table.set(row, col++, st._AUC);
      if (_output.isClassifier()) table.set(row, col++, st._classError);
    }
    row++;
  }
  return table;
}
/**
 * Create a summary table
 *
 * @return a TwoDimTable describing the status of each neuron layer
 */
TwoDimTable createSummaryTable() {
  Neurons[] neurons = DeepLearningTask.makeNeuronsForTesting(this);
  long byte_size = new AutoBuffer().put(this).buf().length;
  TwoDimTable table =
      new TwoDimTable(
          "Status of Neuron Layers",
          (get_params()._diagnostics ? "" : "diagnostics disabled, ")
              + (!get_params()._autoencoder ? ("predicting " + _train.lastVecName() + ", ") : "")
              + (get_params()._autoencoder
                  ? "auto-encoder"
                  : _classification
                      ? (units[units.length - 1] + "-class classification")
                      : "regression")
              + ", "
              + get_params()._distribution
              + " distribution, "
              + get_params()._loss
              + " loss, "
              + String.format("%,d", size())
              + " weights/biases, "
              + PrettyPrint.bytes(byte_size)
              + ", "
              + String.format("%,d", get_processed_global())
              + " training samples, "
              + "mini-batch size "
              + String.format("%,d", get_params()._mini_batch_size),
          new String[neurons.length],
          new String[] {
            "Layer", "Units", "Type", "Dropout", "L1", "L2", "Mean Rate", "Rate RMS", "Momentum",
            "Mean Weight", "Weight RMS", "Mean Bias", "Bias RMS"
          },
          new String[] {
            "int", "int", "string", "double", "double", "double", "double", "double", "double",
            "double", "double", "double", "double"
          },
          new String[] {
            "%d", "%d", "%s", "%2.2f %%", "%5f", "%5f", "%5f", "%5f", "%5f", "%5f", "%5f", "%5f",
            "%5f"
          },
          "");
  for (int i = 0; i < neurons.length; ++i) {
    table.set(i, 0, i + 1);
    table.set(i, 1, neurons[i].units);
    table.set(i, 2, neurons[i].getClass().getSimpleName());
    if (i == 0) {
      table.set(i, 3, neurons[i].params._input_dropout_ratio * 100);
      continue;
    } else if (i < neurons.length - 1) {
      if (neurons[i].params._hidden_dropout_ratios == null) {
        table.set(i, 3, 0);
      } else {
        table.set(i, 3, neurons[i].params._hidden_dropout_ratios[i - 1] * 100);
      }
    }
    table.set(i, 4, neurons[i].params._l1);
    table.set(i, 5, neurons[i].params._l2);
    table.set(
        i, 6,
        (get_params()._adaptive_rate ? mean_rate[i] : neurons[i].rate(get_processed_total())));
    table.set(i, 7, (get_params()._adaptive_rate ? rms_rate[i] : 0));
    table.set(i, 8, get_params()._adaptive_rate ? 0 : neurons[i].momentum(get_processed_total()));
    table.set(i, 9, mean_weight[i]);
    table.set(i, 10, rms_weight[i]);
    table.set(i, 11, mean_bias[i]);
    table.set(i, 12, rms_bias[i]);
  }
  summaryTable = table;
  return summaryTable;
}