/**
 * Create a subframe from the given interval of columns.
 *
 * @param startIdx index of the first column (inclusive)
 * @param endIdx   end index of the column range (exclusive)
 * @return a new frame containing the specified interval of columns
 */
public Frame subframe(int startIdx, int endIdx) {
  Frame result = new Frame(
      Arrays.copyOfRange(_names, startIdx, endIdx),
      Arrays.copyOfRange(vecs(), startIdx, endIdx));
  return result;
}
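// Usage sketch (illustrative only; assumes a running H2O instance and a
// parsed Frame "fr" with columns [a, b, c, d]):
//   Frame mid = fr.subframe(1, 3); // new frame with columns [b, c]:
//                                  // start is inclusive, end is exclusive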
/** Appends a named column, keeping the last Vec as the response. */
public void add(String name, Vec vec) {
  assert _vecs.length == 0 || anyVec().group().equals(vec.group());
  final int len = _names.length;
  _names = Arrays.copyOf(_names, len + 1);
  _vecs  = Arrays.copyOf(_vecs,  len + 1);
  _keys  = Arrays.copyOf(_keys,  len + 1);
  _names[len] = name;
  _vecs [len] = vec;
  _keys [len] = vec._key;
}
/** Removes a numbered column. */
public Vec remove(int idx) {
  int len = _names.length;
  if (idx < 0 || idx >= len) return null;
  Vec v = _vecs[idx];
  System.arraycopy(_names, idx + 1, _names, idx, len - idx - 1);
  System.arraycopy(_vecs,  idx + 1, _vecs,  idx, len - idx - 1);
  System.arraycopy(_keys,  idx + 1, _keys,  idx, len - idx - 1);
  _names = Arrays.copyOf(_names, len - 1);
  _vecs  = Arrays.copyOf(_vecs,  len - 1);
  _keys  = Arrays.copyOf(_keys,  len - 1);
  if (v == _col0) _col0 = null;
  return v;
}
/** Appends an entire Frame. */
public Frame add(Frame fr) {
  assert anyVec().group().equals(fr.anyVec().group());
  final int len0 = _names.length;
  final int len1 = fr._names.length;
  final int len  = len0 + len1;
  _names = Arrays.copyOf(_names, len);
  _vecs  = Arrays.copyOf(_vecs,  len);
  _keys  = Arrays.copyOf(_keys,  len);
  System.arraycopy(fr._names, 0, _names, len0, len1);
  System.arraycopy(fr._vecs,  0, _vecs,  len0, len1);
  System.arraycopy(fr._keys,  0, _keys,  len0, len1);
  return this;
}
@Override public byte[] getChunkData(int cidx) {
  if (cidx == _cidx0) return _bits0;
  if (cidx == _cidx1) return _bits1;
  assert cidx == _cidx0 + 1 || cidx == _cidx1 + 1;
  byte[] bits = _cidx0 < _cidx1 ? _bits0 : _bits1;
  if (_cidx0 < _cidx1) { _cidx0 = cidx; _coff0 = -1; }
  else                 { _cidx1 = cidx; _coff1 = -1; }
  // Read as much as the buffer will hold
  int off = 0;
  try {
    while (off < bits.length) {
      int len = _is.read(bits, off, bits.length - off);
      if (len == -1) break;
      off += len;
    }
    assert off == bits.length || _is.available() <= 0;
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
  if (off == bits.length) return bits;
  // Final read is short; cache the short-read
  byte[] bits2 = (off == 0) ? null : Arrays.copyOf(bits, off);
  if (_cidx0 == cidx) _bits0 = bits2;
  else                _bits1 = bits2;
  return bits2;
}
/**
 * Remove the given interval of columns from the frame. Motivated by R intervals.
 *
 * @param startIdx start index of the column range (inclusive)
 * @param endIdx   end index of the column range (exclusive)
 * @return an array of the removed columns
 */
public Vec[] remove(int startIdx, int endIdx) {
  int len = _names.length;
  int nlen = len - (endIdx - startIdx);
  String[] names = new String[nlen];
  Key[] keys = new Key[nlen];
  Vec[] vecs = new Vec[nlen];
  if (startIdx > 0) {
    System.arraycopy(_names, 0, names, 0, startIdx);
    System.arraycopy(_vecs,  0, vecs,  0, startIdx);
    System.arraycopy(_keys,  0, keys,  0, startIdx);
  }
  nlen -= startIdx;
  if (endIdx < _names.length + 1) {
    System.arraycopy(_names, endIdx, names, startIdx, nlen);
    System.arraycopy(_vecs,  endIdx, vecs,  startIdx, nlen);
    System.arraycopy(_keys,  endIdx, keys,  startIdx, nlen);
  }
  Vec[] vec = Arrays.copyOfRange(vecs(), startIdx, endIdx); // the removed columns
  _names = names;
  _vecs  = vecs;   // keep the surviving columns, not the removed ones
  _keys  = keys;
  _col0  = null;
  return vec;
}
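// Usage sketch (illustrative only; assumes a Frame "fr" with columns
// [a, b, c, d] in a running H2O instance):
//   Vec[] dropped = fr.remove(1, 3); // removes b and c, returns them
//   // fr now holds columns [a, d]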
@Override Val apply(Env env, Env.StackHelp stk, AST asts[]) {
  Val v = stk.track(asts[1].exec(env));
  if (v instanceof ValRow) {
    ValRow vv = (ValRow) v;
    return vv.slice(asts[2].columns(vv._names));
  }
  Frame fr = v.getFrame();
  int[] cols = asts[2].columns(fr.names());

  Frame fr2 = new Frame();
  if (cols.length == 0) {        // Empty inclusion list?
  } else if (cols[0] >= 0) {     // Positive (inclusion) list
    if (cols[cols.length - 1] >= fr.numCols())
      throw new IllegalArgumentException(
          "Column must be an integer from 0 to " + (fr.numCols() - 1));
    for (int col : cols)
      fr2.add(fr.names()[col], fr.vecs()[col]);
  } else {                       // Negative (exclusion) list
    fr2 = new Frame(fr);         // All of them at first
    Arrays.sort(cols);           // This loop depends on the values in sorted order
    for (int col : cols)
      if (0 <= -col - 1 && -col - 1 < fr.numCols())
        fr2.remove(-col - 1);    // Remove named column
  }
  return new ValFrame(fr2);
}
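// Note on the exclusion encoding (illustrative): a negative index col maps to
// column (-col - 1), so -1 excludes column 0 and -3 excludes column 2.
// Sorting ascending puts the most-negative values first, so the highest column
// indices are removed first and earlier removals never shift later targets:
//   cols = {-3, -1}  ->  remove column 2, then column 0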
// Pretty-print a matrix (2-D array of doubles)
public static String pprint(double[][] arr, DecimalFormat dformat) {
  int colDim = 0;
  for (double[] line : arr)
    colDim = Math.max(colDim, line.length);
  StringBuilder sb = new StringBuilder();
  int max_width = 0;
  int[] ilengths = new int[colDim];
  Arrays.fill(ilengths, -1);
  // First pass: find the integer-part width of every column
  for (double[] line : arr) {
    for (int c = 0; c < line.length; ++c) {
      double d = line[c];
      String dStr = dformat.format(d);
      if (dStr.indexOf('.') == -1) dStr += ".0";
      ilengths[c] = Math.max(ilengths[c], dStr.indexOf('.'));
      int prefix = (d >= 0 ? 1 : 2);
      max_width = Math.max(dStr.length() + prefix, max_width);
    }
  }
  // Second pass: emit the values, padded so the decimal points line up
  for (double[] line : arr) {
    for (int c = 0; c < line.length; ++c) {
      double d = line[c];
      String dStr = dformat.format(d);
      if (dStr.indexOf('.') == -1) dStr += ".0";
      for (int x = dStr.indexOf('.'); x < ilengths[c] + 1; ++x) sb.append(' ');
      sb.append(dStr);
      if (dStr.indexOf('.') == -1) sb.append('.');
      for (int i = dStr.length() - Math.max(0, dStr.indexOf('.')); i <= 5; ++i)
        sb.append('0');
    }
    sb.append("\n");
  }
  return sb.toString();
}
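// Minimal runnable sketch (hypothetical demo method, assumes pprint above is
// in scope; java.text.DecimalFormat is standard JDK):
public static void pprintDemo() {
  double[][] m = { { 1.5, -22.25 }, { 310.0, 0.125 } };
  // Prints the matrix with decimal points aligned per column
  System.out.print(pprint(m, new java.text.DecimalFormat("#.####")));
}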
/** Removes the numbered columns. */
public Vec[] remove(int[] idxs) {
  for (int i : idxs)
    if (i < 0 || i >= _vecs.length)
      throw new ArrayIndexOutOfBoundsException();
  Arrays.sort(idxs);
  Vec[] res = new Vec[idxs.length];
  Vec[] rem = new Vec[_vecs.length - idxs.length];
  String[] names = new String[rem.length];
  Key[] keys = new Key[rem.length];
  int j = 0;
  int k = 0;
  int l = 0;
  for (int i = 0; i < _vecs.length; ++i) {
    if (j < idxs.length && i == idxs[j]) {
      ++j;
      res[k++] = _vecs[i];
    } else {
      rem[l] = _vecs[i];
      names[l] = _names[i];
      keys[l] = _keys[i];
      ++l;
    }
  }
  _vecs = rem;
  _names = names;
  _keys = keys;
  assert l == rem.length && k == idxs.length;
  return res;
}
static public int[] difference(int a[], int b[]) {
  int[] r = new int[a.length];
  int cnt = 0;
  for (int i = 0; i < a.length; i++) {
    if (!contains(b, a[i])) r[cnt++] = a[i];
  }
  return Arrays.copyOf(r, cnt);
}
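// Minimal runnable sketch (hypothetical demo method; assumes the
// contains(int[], int) helper used above lives in the same class):
public static void differenceDemo() {
  int[] kept = difference(new int[] { 1, 2, 3, 4 }, new int[] { 2, 4 });
  System.out.println(Arrays.toString(kept)); // prints [1, 3]
}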
@Override ValFrame apply(Env env, Env.StackHelp stk, AST asts[]) {
  Frame fr = stk.track(asts[1].exec(env)).getFrame();
  double frac = asts[2].exec(env).getNum();
  double nrow = fr.numRows() * frac;
  Vec vecs[] = fr.vecs();
  long[] idxs = new long[fr.numCols()];
  int j = 0;
  for (int i = 0; i < idxs.length; i++)
    if (vecs[i].naCnt() < nrow) idxs[j++] = i;
  Vec vec = Vec.makeVec(Arrays.copyOf(idxs, j), null, Vec.VectorGroup.VG_LEN1.addVec());
  return new ValFrame(new Frame(vec));
}
public static byte[] unzipBytes(byte[] bs, Compression cmp) {
  InputStream is = null;
  int off = 0;
  try {
    switch (cmp) {
    case NONE: // No compression
      return bs;
    case ZIP: {
      ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(bs));
      ZipEntry ze = zis.getNextEntry(); // Get the *FIRST* entry
      // There is at least one entry in the zip file and it is not a directory.
      if (ze != null && !ze.isDirectory()) {
        is = zis;
        break;
      }
      zis.close();
      return bs; // Don't crash; ignore the file if it cannot be unzipped
    }
    case GZIP:
      is = new GZIPInputStream(new ByteArrayInputStream(bs));
      break;
    default:
      assert false : "cmp = " + cmp;
    }
    // If reading from a compressed stream, estimate we can read 2x uncompressed
    assert is != null : "is is NULL, cmp = " + cmp;
    bs = new byte[bs.length * 2];
    // Now read from the (possibly compressed) stream
    while (off < bs.length) {
      int len = is.read(bs, off, bs.length - off);
      if (len < 0) break;
      off += len;
      if (off == bs.length) { // Dataset is uncompressing a lot! Need more space...
        if (bs.length >= ValueArray.CHUNK_SZ) break; // Already got enough
        bs = Arrays.copyOf(bs, bs.length * 2);
      }
    }
  } catch (IOException ioe) { // Stop at any I/O error
    Log.err(ioe);
  } finally {
    Utils.close(is);
  }
  return bs;
}
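// Usage sketch (hypothetical demo method; assumes the enclosing class with its
// Compression enum and the java.io/java.util.zip imports above). Note the
// returned buffer is sized by the doubling heuristic, so it may be longer than
// the payload; compare only the prefix.
public static void unzipBytesDemo() throws IOException {
  byte[] raw = "hello, h2o".getBytes();
  ByteArrayOutputStream bos = new ByteArrayOutputStream();
  GZIPOutputStream gz = new GZIPOutputStream(bos);
  gz.write(raw);
  gz.close();
  byte[] out = unzipBytes(bos.toByteArray(), Compression.GZIP);
  assert new String(out, 0, raw.length).equals("hello, h2o");
}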
public static String sampleToString(int[] val, int max) {
  if (val == null || val.length < max) return Arrays.toString(val);
  StringBuilder b = new StringBuilder();
  b.append('[');
  max -= 10;
  int valMax = val.length - 1;
  for (int i = 0; ; i++) {
    b.append(val[i]);
    if (i == max) { // Elide the middle: print the head, ", ...", then jump near the end
      b.append(", ...");
      i = val.length - 10;
    }
    if (i == valMax) return b.append(']').toString();
    b.append(", ");
  }
}
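// Minimal runnable sketch of the elision behavior (hypothetical demo method):
public static void sampleToStringDemo() {
  int[] v = new int[30];
  for (int i = 0; i < v.length; i++) v[i] = i;
  // Prints a head sample, ", ...", then a tail sample of the array
  System.out.println(sampleToString(v, 15));
}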
public static String join(char sep, Object[] array) { return join(sep, Arrays.asList(array)); }
public static double[] append(double[] a, double e) {
  a = Arrays.copyOf(a, a.length + 1);
  a[a.length - 1] = e;
  return a;
}
public static int[] remove(int[] a, int i) {
  int[] tmp = Arrays.copyOf(a, a.length - 1);   // keeps a[0..i-1], truncated by one
  System.arraycopy(a, i + 1, tmp, i, tmp.length - i); // shift the tail left over slot i
  return tmp;
}
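// Minimal runnable sketch exercising the append/remove pair above
// (hypothetical demo method):
public static void appendRemoveDemo() {
  double[] d = append(new double[] { 1.0, 2.0 }, 3.0);
  System.out.println(Arrays.toString(d)); // [1.0, 2.0, 3.0]
  int[] r = remove(new int[] { 10, 20, 30 }, 1);
  System.out.println(Arrays.toString(r)); // [10, 30]
}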
/**
 * Train a Deep Learning neural net model
 *
 * @param model Input model (e.g., from initModel(), or from a previous training run)
 * @return Trained model
 */
public final DeepLearningModel trainModel(DeepLearningModel model) {
  Frame validScoreFrame = null;
  Frame train, trainScoreFrame;
  try {
    // if (checkpoint == null && !quiet_mode) logStart();
    // if a checkpoint is given, some Job params might be uninitialized (but the restarted
    // model's parameters are correct)
    if (model == null) {
      model = DKV.get(dest()).get();
    }
    Log.info("Model category: "
        + (_parms._autoencoder ? "Auto-Encoder" : isClassifier() ? "Classification" : "Regression"));
    final long model_size = model.model_info().size();
    Log.info("Number of model parameters (weights/biases): " + String.format("%,d", model_size));
    model.write_lock(_job);
    _job.update(0, "Setting up training data...");
    final DeepLearningParameters mp = model.model_info().get_params();

    // Temporary frames of the same "name" as the orig _train/_valid (asking the parameter's
    // Key, not the actual frame).
    // Note: don't put into DKV or they would overwrite the _train/_valid frames!
    Frame tra_fr = new Frame(mp._train, _train.names(), _train.vecs());
    Frame val_fr = _valid != null ? new Frame(mp._valid, _valid.names(), _valid.vecs()) : null;

    train = tra_fr;
    if (model._output.isClassifier() && mp._balance_classes) {
      _job.update(0, "Balancing class distribution of training data...");
      float[] trainSamplingFactors =
          new float[train.lastVec().domain().length]; // leave initialized to 0 -> will be filled up below
      if (mp._class_sampling_factors != null) {
        if (mp._class_sampling_factors.length != train.lastVec().domain().length)
          throw new IllegalArgumentException("class_sampling_factors must have "
              + train.lastVec().domain().length + " elements");
        trainSamplingFactors = mp._class_sampling_factors.clone(); // clone: don't modify the original
      }
      train = sampleFrameStratified(
          train,
          train.lastVec(),
          train.vec(model._output.weightsName()),
          trainSamplingFactors,
          (long) (mp._max_after_balance_size * train.numRows()),
          mp._seed,
          true,
          false);
      Vec l = train.lastVec();
      Vec w = train.vec(model._output.weightsName());
      MRUtils.ClassDist cd = new MRUtils.ClassDist(l);
      model._output._modelClassDist = _weights != null ? cd.doAll(l, w).rel_dist() : cd.doAll(l).rel_dist();
    }
    model.training_rows = train.numRows();
    if (_weights != null && _weights.min() == 0 && _weights.max() == 1 && _weights.isInt()) {
      model.training_rows = Math.round(train.numRows() * _weights.mean());
      Log.warn("Not counting " + (train.numRows() - model.training_rows)
          + " rows with weight=0 towards an epoch.");
    }
    Log.info("One epoch corresponds to " + model.training_rows + " training data rows.");
    // The training scoring dataset is always sampled uniformly from the training dataset.
    trainScoreFrame = sampleFrame(train, mp._score_training_samples, mp._seed);
    if (trainScoreFrame != train) Scope.track(trainScoreFrame);

    if (!_parms._quiet_mode)
      Log.info("Number of chunks of the training data: " + train.anyVec().nChunks());
    if (val_fr != null) {
      model.validation_rows = val_fr.numRows();
      // The validation scoring dataset can be sampled in multiple ways from the given
      // validation dataset.
      if (model._output.isClassifier()
          && mp._balance_classes
          && mp._score_validation_sampling == DeepLearningParameters.ClassSamplingMethod.Stratified) {
        _job.update(0, "Sampling validation data (stratified)...");
        validScoreFrame = sampleFrameStratified(
            val_fr,
            val_fr.lastVec(),
            val_fr.vec(model._output.weightsName()),
            null,
            mp._score_validation_samples > 0 ? mp._score_validation_samples : val_fr.numRows(),
            mp._seed + 1,
            false /* no oversampling */,
            false);
      } else {
        _job.update(0, "Sampling validation data...");
        validScoreFrame = sampleFrame(val_fr, mp._score_validation_samples, mp._seed + 1);
        if (validScoreFrame != val_fr) Scope.track(validScoreFrame);
      }
      if (!_parms._quiet_mode)
        Log.info("Number of chunks of the validation data: " + validScoreFrame.anyVec().nChunks());
    }

    // Set train_samples_per_iteration size (cannot be done earlier since this depends on
    // whether stratified sampling is done).
    model.actual_train_samples_per_iteration =
        computeTrainSamplesPerIteration(mp, model.training_rows, model);
    // Determine whether shuffling is enforced.
    if (mp._replicate_training_data
        && (model.actual_train_samples_per_iteration
            == model.training_rows * (mp._single_node_mode ? 1 : H2O.CLOUD.size()))
        && !mp._shuffle_training_data
        && H2O.CLOUD.size() > 1
        && !mp._reproducible) {
      if (!mp._quiet_mode)
        Log.info("Enabling training data shuffling, because all nodes train on the full dataset (replicated training data).");
      mp._shuffle_training_data = true;
    }
    if (!mp._shuffle_training_data
        && model.actual_train_samples_per_iteration == model.training_rows
        && train.anyVec().nChunks() == 1) {
      if (!mp._quiet_mode)
        Log.info("Enabling training data shuffling to avoid training rows in the same order over and over (no Hogwild since there's only 1 chunk).");
      mp._shuffle_training_data = true;
    }

    // if (!mp._quiet_mode) Log.info("Initial model:\n" + model.model_info());
    long now = System.currentTimeMillis();
    model._timeLastIterationEnter = now;
    if (_parms._autoencoder) {
      _job.update(0, "Scoring null model of autoencoder...");
      if (!mp._quiet_mode) Log.info("Scoring the null model of the autoencoder.");
      // get the null model reconstruction error
      model.doScoring(trainScoreFrame, validScoreFrame, _job._key, 0, false);
    }
    // Put the initial version of the model into DKV.
    model.update(_job);
    model.total_setup_time_ms += now - _job.start_time();
    Log.info("Total setup time: " + PrettyPrint.msecs(model.total_setup_time_ms, true));
    Log.info("Starting to train the Deep Learning model.");
    _job.update(0, "Training...");

    // main loop
    for (;;) {
      model.iterations++;
      model.set_model_info(
          mp._epochs == 0
              ? model.model_info()
              : H2O.CLOUD.size() > 1 && mp._replicate_training_data
                  ? (mp._single_node_mode
                      ? new DeepLearningTask2(_job._key, train, model.model_info(),
                              rowFraction(train, mp, model), model.iterations)
                          .doAll(Key.make(H2O.SELF))
                          .model_info()  // replicated data + single node mode
                      : new DeepLearningTask2(_job._key, train, model.model_info(),
                              rowFraction(train, mp, model), model.iterations)
                          .doAllNodes()
                          .model_info()) // replicated data + multi-node mode
                  : new DeepLearningTask(_job._key, model.model_info(),
                          rowFraction(train, mp, model), model.iterations)
                      .doAll(train)
                      .model_info());    // distributed data (always in multi-node mode)
      if (stop_requested() && !timeout()) break; // cancellation
      if (!model.doScoring(trainScoreFrame, validScoreFrame, _job._key, model.iterations, false))
        break; // finished training (or early stopping or convergence)
      if (timeout()) break; // stop after scoring
    }

    // Replace the model with the best model so far (if it's better).
    if (!stop_requested()
        && _parms._overwrite_with_best_model
        && model.actual_best_model_key != null
        && _parms._nfolds == 0) {
      DeepLearningModel best_model = DKV.getGet(model.actual_best_model_key);
      if (best_model != null
          && best_model.loss() < model.loss()
          && Arrays.equals(best_model.model_info().units, model.model_info().units)) {
        if (!_parms._quiet_mode)
          Log.info("Setting the model to be the best model so far (based on scoring history).");
        DeepLearningModelInfo mi = best_model.model_info().deep_clone();
        // Don't cheat - count the full amount of training samples, since that's the amount of
        // training it took to train (without finding anything better).
        mi.set_processed_global(model.model_info().get_processed_global());
        mi.set_processed_local(model.model_info().get_processed_local());
        model.set_model_info(mi);
        model.update(_job);
        model.doScoring(trainScoreFrame, validScoreFrame, _job._key, model.iterations, true);
        assert (best_model.loss() == model.loss());
      }
    }
    // store coefficient names for future use
    // possibly change
    model.model_info().data_info().coefNames();
    if (!_parms._quiet_mode) {
      Log.info("==============================================================================================================================================================================");
      if (stop_requested()) {
        Log.info("Deep Learning model training was interrupted.");
      } else {
        Log.info("Finished training the Deep Learning model.");
        Log.info(model);
      }
      Log.info("==============================================================================================================================================================================");
    }
  } finally {
    if (model != null) {
      model.deleteElasticAverageModels();
      model.unlock(_job);
      if (model.actual_best_model_key != null) {
        assert (model.actual_best_model_key != model._key);
        DKV.remove(model.actual_best_model_key);
      }
    }
  }
  return model;
}
public static float[] div(float[] nums, float n) {
  assert !Float.isInfinite(n) : "Trying to divide " + Arrays.toString(nums) + " by " + n; // Almost surely not what you want
  for (int i = 0; i < nums.length; i++) nums[i] = nums[i] / n;
  return nums;
}
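// Minimal runnable sketch (hypothetical demo method): div mutates and
// returns its input array rather than copying it.
public static void divDemo() {
  float[] xs = { 2f, 4f, 6f };
  System.out.println(Arrays.toString(div(xs, 2f))); // [1.0, 2.0, 3.0]
  System.out.println(Arrays.toString(xs));          // same array: [1.0, 2.0, 3.0]
}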
public static <T> T[] subarray(T[] a, int off, int len) { return Arrays.copyOfRange(a,off,off+len); }
@Override Val apply(Env env, Env.StackHelp stk, AST asts[]) {
  // Execute all args. Find a canonical frame; all Frames must look like this one.
  // Each argument turns into either a Frame (whose rows are entirely
  // inlined) or a scalar (which is replicated across as a single row).
  Frame fr = null; // Canonical Frame; all frames have the same column count, types and names
  int nchks = 0;   // Total chunks
  Val vals[] = new Val[asts.length]; // Computed AST results
  for (int i = 1; i < asts.length; i++) {
    vals[i] = stk.track(asts[i].exec(env));
    if (vals[i].isFrame()) {
      fr = vals[i].getFrame();
      nchks += fr.anyVec().nChunks(); // Total chunks
    } else nchks++;                   // One chunk per scalar
  }
  // No Frame, just a pile-o-scalars?
  Vec zz = null; // The zero-length vec for the zero-frame frame
  if (fr == null) { // Zero-length, 1-column, default name
    fr = new Frame(new String[] {Frame.defaultColName(0)}, new Vec[] {zz = Vec.makeZero(0)});
    if (asts.length == 1) return new ValFrame(fr);
  }

  // Verify all Frames are the same columns, names, and types. Domains can vary, and will be
  // the union.
  final Frame frs[] = new Frame[asts.length]; // Input frames
  final byte[] types = fr.types();            // Column types
  final int ncols = fr.numCols();
  final long[] espc = new long[nchks + 1];    // Compute a new layout!
  int coffset = 0;
  for (int i = 1; i < asts.length; i++) {
    Val val = vals[i]; // Save values computed for pass 2
    Frame fr0 = val.isFrame()
        ? val.getFrame()
        // Scalar: auto-expand into a 1-row frame
        : stk.track(new Frame(fr._names, Vec.makeCons(val.getNum(), 1L, fr.numCols())));
    // Check that all frames are compatible
    if (fr.numCols() != fr0.numCols())
      throw new IllegalArgumentException("rbind frames must have all the same columns, found "
          + fr.numCols() + " and " + fr0.numCols() + " columns.");
    if (!Arrays.deepEquals(fr._names, fr0._names))
      throw new IllegalArgumentException("rbind frames must have all the same column names, found "
          + Arrays.toString(fr._names) + " and " + Arrays.toString(fr0._names));
    if (!Arrays.equals(types, fr0.types()))
      throw new IllegalArgumentException("rbind frames must have all the same column types, found "
          + Arrays.toString(types) + " and " + Arrays.toString(fr0.types()));
    frs[i] = fr0; // Save frame
    // Roll up the ESPC row counts
    long roffset = espc[coffset];
    long[] espc2 = fr0.anyVec().espc();
    for (int j = 1; j < espc2.length; j++) // Roll up the row counts
      espc[coffset + j] = (roffset + espc2[j]);
    coffset += espc2.length - 1; // Chunk offset
  }
  if (zz != null) zz.remove();

  // Build up the new domains for each vec
  HashMap<String, Integer>[] dmap = new HashMap[types.length];
  String[][] domains = new String[types.length][];
  int[][][] cmaps = new int[types.length][][];
  for (int k = 0; k < types.length; ++k) {
    dmap[k] = new HashMap<>();
    int c = 0;
    byte t = types[k];
    if (t == Vec.T_CAT) {
      int[][] maps = new int[frs.length][];
      for (int i = 1; i < frs.length; i++) {
        maps[i] = new int[frs[i].vec(k).domain().length];
        for (int j = 0; j < maps[i].length; j++) {
          String s = frs[i].vec(k).domain()[j];
          if (!dmap[k].containsKey(s)) dmap[k].put(s, maps[i][j] = c++);
          else maps[i][j] = dmap[k].get(s);
        }
      }
      cmaps[k] = maps;
    } else {
      cmaps[k] = new int[frs.length][];
    }
    domains[k] = c == 0 ? null : new String[c];
    for (Map.Entry<String, Integer> e : dmap[k].entrySet())
      domains[k][e.getValue()] = e.getKey();
  }

  // Now make Keys for the new Vecs
  Key<Vec>[] keys = fr.anyVec().group().addVecs(fr.numCols());
  Vec[] vecs = new Vec[fr.numCols()];
  int rowLayout = Vec.ESPC.rowLayout(keys[0], espc);
  for (int i = 0; i < vecs.length; i++)
    vecs[i] = new Vec(keys[i], rowLayout, domains[i], types[i]);

  // Do the row-binds column-by-column.
  // Switch to F/J thread for continuations
  ParallelRbinds t;
  H2O.submitTask(t = new ParallelRbinds(frs, espc, vecs, cmaps)).join();
  return new ValFrame(new Frame(fr.names(), t._vecs));
}
/**
 * Train a Deep Learning model; assumes that all members are populated. If checkpoint == null,
 * then start training a new model, otherwise continue from a checkpoint.
 */
public final void buildModel() {
  DeepLearningModel cp = null;
  if (_parms._checkpoint == null) {
    cp = new DeepLearningModel(
        dest(),
        _parms,
        new DeepLearningModel.DeepLearningModelOutput(DeepLearning.this),
        _train,
        _valid,
        nclasses());
    cp.model_info().initializeMembers();
  } else {
    final DeepLearningModel previous = DKV.getGet(_parms._checkpoint);
    if (previous == null) throw new IllegalArgumentException("Checkpoint not found.");
    Log.info("Resuming from checkpoint.");
    _job.update(0, "Resuming from checkpoint");
    if (isClassifier() != previous._output.isClassifier())
      throw new H2OIllegalArgumentException("Response type must be the same as for the checkpointed model.");
    if (isSupervised() != previous._output.isSupervised())
      throw new H2OIllegalArgumentException("Model type must be the same as for the checkpointed model.");

    // Check the user-given arguments for consistency
    DeepLearningParameters oldP = previous._parms; // sanitized parameters for checkpointed model
    DeepLearningParameters newP = _parms;          // user-given parameters for restart
    DeepLearningParameters oldP2 = (DeepLearningParameters) oldP.clone();
    DeepLearningParameters newP2 = (DeepLearningParameters) newP.clone();
    DeepLearningParameters.Sanity.modifyParms(oldP, oldP2, nclasses()); // sanitize the user-given parameters
    DeepLearningParameters.Sanity.modifyParms(newP, newP2, nclasses()); // sanitize the user-given parameters
    DeepLearningParameters.Sanity.checkpoint(oldP2, newP2);

    DataInfo dinfo;
    try {
      // PUBDEV-2513: Adapt _train and _valid (in-place) to match the frames that were used
      // for the previous model.
      // This can add or remove dummy columns (can happen if the dataset is sparse and
      // datasets have different non-const columns).
      for (String st : previous.adaptTestForTrain(_train, true, false)) Log.warn(st);
      for (String st : previous.adaptTestForTrain(_valid, true, false)) Log.warn(st);
      dinfo = makeDataInfo(_train, _valid, _parms, nclasses());
      DKV.put(dinfo);
      cp = new DeepLearningModel(dest(), _parms, previous, false, dinfo);
      cp.write_lock(_job);

      if (!Arrays.equals(cp._output._names, previous._output._names)) {
        throw new H2OIllegalArgumentException(
            "The columns of the training data must be the same as for the checkpointed model. Check ignored columns (or disable ignore_const_cols).");
      }
      if (!Arrays.deepEquals(cp._output._domains, previous._output._domains)) {
        throw new H2OIllegalArgumentException(
            "Categorical factor levels of the training data must be the same as for the checkpointed model.");
      }
      if (dinfo.fullN() != previous.model_info().data_info().fullN()) {
        throw new H2OIllegalArgumentException(
            "Total number of predictors is different than for the checkpointed model.");
      }
      if (_parms._epochs <= previous.epoch_counter) {
        throw new H2OIllegalArgumentException(
            "Total number of epochs must be larger than the number of epochs already trained for the checkpointed model ("
                + previous.epoch_counter + ").");
      }

      // These are the mutable parameters that are to be used by the model (stored in
      // model_info._parms).
      final DeepLearningParameters actualNewP =
          cp.model_info().get_params(); // actually used parameters for model building (defaults filled in, etc.)
      assert (actualNewP != previous.model_info().get_params());
      assert (actualNewP != newP);
      assert (actualNewP != oldP);
      DeepLearningParameters.Sanity.update(actualNewP, newP, nclasses());
      Log.info("Continuing training after " + String.format("%.3f", previous.epoch_counter)
          + " epochs from the checkpointed model.");
      cp.update(_job);
    } catch (H2OIllegalArgumentException ex) {
      if (cp != null) {
        cp.unlock(_job);
        cp.delete();
        cp = null;
      }
      throw ex;
    } finally {
      if (cp != null) cp.unlock(_job);
    }
  }
  trainModel(cp);

  // Clean up, but don't delete weights and biases if the user asked for export
  List<Key> keep = new ArrayList<>();
  try {
    if (_parms._export_weights_and_biases
        && cp._output.weights != null
        && cp._output.biases != null) {
      for (Key k : Arrays.asList(cp._output.weights)) {
        keep.add(k);
        for (Vec vk : ((Frame) DKV.getGet(k)).vecs()) keep.add(vk._key);
      }
      for (Key k : Arrays.asList(cp._output.biases)) {
        keep.add(k);
        for (Vec vk : ((Frame) DKV.getGet(k)).vecs()) keep.add(vk._key);
      }
    }
  } finally {
    Scope.exit(keep.toArray(new Key[keep.size()]));
  }
}
public static long[][][] append(long[][][] a, long[][] e) {
  a = Arrays.copyOf(a, a.length + 1);
  a[a.length - 1] = e;
  return a;
}
@SuppressWarnings("fallthrough") @Override public final DataOut parallelParse( int cidx, final CustomParser.DataIn din, final CustomParser.DataOut dout) { ValueString _str = new ValueString(); byte[] bits = din.getChunkData(cidx); if (bits == null) return dout; int offset = din.getChunkDataStart(cidx); // General cursor into the giant array of bytes final byte[] bits0 = bits; // Bits for chunk0 boolean firstChunk = true; // Have not rolled into the 2nd chunk byte[] bits1 = null; // Bits for chunk1, loaded lazily. // Starting state. Are we skipping the first (partial) line, or not? Skip // a header line, or a partial line if we're in the 2nd and later chunks. int state = (_setup._header || cidx > 0) ? SKIP_LINE : WHITESPACE_BEFORE_TOKEN; // If handed a skipping offset, then it points just past the prior partial line. if (offset >= 0) state = WHITESPACE_BEFORE_TOKEN; else offset = 0; // Else start skipping at the start int quotes = 0; long number = 0; int exp = 0; int sgn_exp = 1; boolean decimal = false; int fractionDigits = 0; int numStart = 0; int tokenStart = 0; // used for numeric token to backtrace if not successful int colIdx = 0; byte c = bits[offset]; // skip comments for the first chunk (or if not a chunk) if (cidx == 0) { while (c == '#' || c == '@' /*also treat as comments leading '@' from ARFF format*/) { while ((offset < bits.length) && (bits[offset] != CHAR_CR) && (bits[offset] != CHAR_LF)) ++offset; if ((offset + 1 < bits.length) && (bits[offset] == CHAR_CR) && (bits[offset + 1] == CHAR_LF)) ++offset; ++offset; if (offset >= bits.length) return dout; c = bits[offset]; } } dout.newLine(); MAIN_LOOP: while (true) { NEXT_CHAR: switch (state) { // --------------------------------------------------------------------- case SKIP_LINE: if (isEOL(c)) { state = EOL; } else { break NEXT_CHAR; } continue MAIN_LOOP; // --------------------------------------------------------------------- case EXPECT_COND_LF: state = POSSIBLE_EMPTY_LINE; if (c == CHAR_LF) break NEXT_CHAR; continue MAIN_LOOP; // --------------------------------------------------------------------- case STRING: if (c == quotes) { state = COND_QUOTE; break NEXT_CHAR; } if (!isEOL(c) && ((quotes != 0) || (c != CHAR_SEPARATOR))) { _str.addChar(); break NEXT_CHAR; } // fallthrough to STRING_END // --------------------------------------------------------------------- case STRING_END: if ((c != CHAR_SEPARATOR) && (c == CHAR_SPACE)) break NEXT_CHAR; // we have parsed the string enum correctly if ((_str.get_off() + _str.get_length()) > _str.get_buf().length) { // crossing chunk boundary assert _str.get_buf() != bits; _str.addBuff(bits); } dout.addStrCol(colIdx, _str); _str.set(null, 0, 0); ++colIdx; state = SEPARATOR_OR_EOL; // fallthrough to SEPARATOR_OR_EOL // --------------------------------------------------------------------- case SEPARATOR_OR_EOL: if (c == CHAR_SEPARATOR) { state = WHITESPACE_BEFORE_TOKEN; break NEXT_CHAR; } if (c == CHAR_SPACE) break NEXT_CHAR; // fallthrough to EOL // --------------------------------------------------------------------- case EOL: if (quotes != 0) { System.err.println( "Unmatched quote char " + ((char) quotes) + " " + (((_str.get_length() + 1) < offset && _str.get_off() > 0) ? new String(Arrays.copyOfRange(bits, _str.get_off() - 1, offset)) : "")); dout.invalidLine("Unmatched quote char " + ((char) quotes)); colIdx = 0; quotes = 0; } else if (colIdx != 0) { dout.newLine(); colIdx = 0; } state = (c == CHAR_CR) ? 
EXPECT_COND_LF : POSSIBLE_EMPTY_LINE; if (!firstChunk) break MAIN_LOOP; // second chunk only does the first row break NEXT_CHAR; // --------------------------------------------------------------------- case POSSIBLE_CURRENCY: if (((c >= '0') && (c <= '9')) || (c == '-') || (c == CHAR_DECIMAL_SEPARATOR) || (c == '+')) { state = TOKEN; } else { _str.set(bits, offset - 1, 0); _str.addChar(); if (c == quotes) { state = COND_QUOTE; break NEXT_CHAR; } if ((quotes != 0) || ((!isEOL(c) && (c != CHAR_SEPARATOR)))) { state = STRING; } else { state = STRING_END; } } continue MAIN_LOOP; // --------------------------------------------------------------------- case POSSIBLE_EMPTY_LINE: if (isEOL(c)) { if (c == CHAR_CR) state = EXPECT_COND_LF; break NEXT_CHAR; } state = WHITESPACE_BEFORE_TOKEN; // fallthrough to WHITESPACE_BEFORE_TOKEN // --------------------------------------------------------------------- case WHITESPACE_BEFORE_TOKEN: if (c == CHAR_SPACE || (c == CHAR_TAB && CHAR_TAB != CHAR_SEPARATOR)) { break NEXT_CHAR; } else if (c == CHAR_SEPARATOR) { // we have empty token, store as NaN dout.addInvalidCol(colIdx++); break NEXT_CHAR; } else if (isEOL(c)) { dout.addInvalidCol(colIdx++); state = EOL; continue MAIN_LOOP; } // fallthrough to COND_QUOTED_TOKEN // --------------------------------------------------------------------- case COND_QUOTED_TOKEN: state = TOKEN; if (CHAR_SEPARATOR != HIVE_SEP && // Only allow quoting in CSV not Hive files ((_setup._singleQuotes && c == CHAR_SINGLE_QUOTE) || (c == CHAR_DOUBLE_QUOTE))) { assert (quotes == 0); quotes = c; break NEXT_CHAR; } // fallthrough to TOKEN // --------------------------------------------------------------------- case TOKEN: if (dout.isString(colIdx)) { // Forced already to a string col? state = STRING; // Do not attempt a number parse, just do a string parse _str.set(bits, offset, 0); continue MAIN_LOOP; } else if (((c >= '0') && (c <= '9')) || (c == '-') || (c == CHAR_DECIMAL_SEPARATOR) || (c == '+')) { state = NUMBER; number = 0; fractionDigits = 0; decimal = false; numStart = offset; tokenStart = offset; if (c == '-') { exp = -1; ++numStart; break NEXT_CHAR; } else if (c == '+') { exp = 1; ++numStart; break NEXT_CHAR; } else { exp = 1; } // fallthrough } else if (c == '$') { state = POSSIBLE_CURRENCY; break NEXT_CHAR; } else { state = STRING; _str.set(bits, offset, 0); continue MAIN_LOOP; } // fallthrough to NUMBER // --------------------------------------------------------------------- case NUMBER: if ((c >= '0') && (c <= '9')) { number = (number * 10) + (c - '0'); if (number >= LARGEST_DIGIT_NUMBER) state = NUMBER_SKIP; break NEXT_CHAR; } else if (c == CHAR_DECIMAL_SEPARATOR) { ++numStart; state = NUMBER_FRACTION; fractionDigits = offset; decimal = true; break NEXT_CHAR; } else if ((c == 'e') || (c == 'E')) { ++numStart; state = NUMBER_EXP_START; sgn_exp = 1; break NEXT_CHAR; } if (exp == -1) { number = -number; } exp = 0; // fallthrough to COND_QUOTED_NUMBER_END // --------------------------------------------------------------------- case COND_QUOTED_NUMBER_END: if (c == quotes) { state = NUMBER_END; quotes = 0; break NEXT_CHAR; } // fallthrough NUMBER_END case NUMBER_END: if (c == CHAR_SEPARATOR && quotes == 0) { exp = exp - fractionDigits; dout.addNumCol(colIdx, number, exp); ++colIdx; // do separator state here too state = WHITESPACE_BEFORE_TOKEN; break NEXT_CHAR; } else if (isEOL(c)) { exp = exp - fractionDigits; dout.addNumCol(colIdx, number, exp); // do EOL here for speedup reasons colIdx = 0; dout.newLine(); state = (c == 
CHAR_CR) ? EXPECT_COND_LF : POSSIBLE_EMPTY_LINE; if (!firstChunk) break MAIN_LOOP; // second chunk only does the first row break NEXT_CHAR; } else if ((c == '%')) { state = NUMBER_END; exp -= 2; break NEXT_CHAR; } else if ((c != CHAR_SEPARATOR) && ((c == CHAR_SPACE) || (c == CHAR_TAB))) { state = NUMBER_END; break NEXT_CHAR; } else { state = STRING; offset = tokenStart - 1; _str.set(bits, tokenStart, 0); break NEXT_CHAR; // parse as String token now } // --------------------------------------------------------------------- case NUMBER_SKIP: ++numStart; if ((c >= '0') && (c <= '9')) { break NEXT_CHAR; } else if (c == CHAR_DECIMAL_SEPARATOR) { state = NUMBER_SKIP_NO_DOT; break NEXT_CHAR; } else if ((c == 'e') || (c == 'E')) { state = NUMBER_EXP_START; sgn_exp = 1; break NEXT_CHAR; } state = COND_QUOTED_NUMBER_END; continue MAIN_LOOP; // --------------------------------------------------------------------- case NUMBER_SKIP_NO_DOT: ++numStart; if ((c >= '0') && (c <= '9')) { break NEXT_CHAR; } else if ((c == 'e') || (c == 'E')) { state = NUMBER_EXP_START; sgn_exp = 1; break NEXT_CHAR; } state = COND_QUOTED_NUMBER_END; continue MAIN_LOOP; // --------------------------------------------------------------------- case NUMBER_FRACTION: if ((c >= '0') && (c <= '9')) { if (number >= LARGEST_DIGIT_NUMBER) { if (decimal) fractionDigits = offset - 1 - fractionDigits; state = NUMBER_SKIP; } else { number = (number * 10) + (c - '0'); } break NEXT_CHAR; } else if ((c == 'e') || (c == 'E')) { ++numStart; if (decimal) fractionDigits = offset - 1 - fractionDigits; state = NUMBER_EXP_START; sgn_exp = 1; break NEXT_CHAR; } state = COND_QUOTED_NUMBER_END; if (decimal) fractionDigits = offset - fractionDigits - 1; if (exp == -1) { number = -number; } exp = 0; continue MAIN_LOOP; // --------------------------------------------------------------------- case NUMBER_EXP_START: if (exp == -1) { number = -number; } exp = 0; if (c == '-') { ++numStart; sgn_exp *= -1; break NEXT_CHAR; } else if (c == '+') { ++numStart; break NEXT_CHAR; } if ((c < '0') || (c > '9')) { state = STRING; offset = tokenStart - 1; _str.set(bits, tokenStart, 0); break NEXT_CHAR; // parse as String token now } state = NUMBER_EXP; // fall through to NUMBER_EXP // --------------------------------------------------------------------- case NUMBER_EXP: if ((c >= '0') && (c <= '9')) { ++numStart; exp = (exp * 10) + (c - '0'); break NEXT_CHAR; } exp *= sgn_exp; state = COND_QUOTED_NUMBER_END; continue MAIN_LOOP; // --------------------------------------------------------------------- case COND_QUOTE: if (c == quotes) { _str.addChar(); _str.skipChar(); state = STRING; break NEXT_CHAR; } else { quotes = 0; state = STRING_END; continue MAIN_LOOP; } // --------------------------------------------------------------------- default: assert (false) : " We have wrong state " + state; } // end NEXT_CHAR ++offset; // do not need to adjust for offset increase here - the offset is set to // tokenStart-1! if (offset < 0) { // Offset is negative? assert !firstChunk; // Caused by backing up from 2nd chunk into 1st chunk firstChunk = true; bits = bits0; offset += bits.length; _str.set(bits, offset, 0); } else if (offset >= bits.length) { // Off end of 1st chunk? Parse into 2nd chunk // Attempt to get more data. if (firstChunk && bits1 == null) bits1 = din.getChunkData(cidx + 1); // if we can't get further we might have been the last one and we must // commit the latest guy if we had one. 
if (!firstChunk || bits1 == null) { // No more data available or allowed // If we are mid-parse of something, act like we saw a LF to end the // current token. if ((state != EXPECT_COND_LF) && (state != POSSIBLE_EMPTY_LINE)) { c = CHAR_LF; continue MAIN_LOOP; } break MAIN_LOOP; // Else we are just done } // Now parsing in the 2nd chunk. All offsets relative to the 2nd chunk start. firstChunk = false; numStart -= bits.length; if (state == NUMBER_FRACTION) fractionDigits -= bits.length; offset -= bits.length; tokenStart -= bits.length; bits = bits1; // Set main parsing loop bits if (bits[0] == CHAR_LF && state == EXPECT_COND_LF) break MAIN_LOOP; // when the first character we see is a line end } c = bits[offset]; if (isEOL(c) && state != COND_QUOTE && quotes != 0) // quoted string having newline character => fail the line! state = EOL; } // end MAIN_LOOP if (colIdx == 0) dout.rollbackLine(); // If offset is still validly within the buffer, save it so the next pass // can start from there. if (offset + 1 < bits.length) { if (state == EXPECT_COND_LF && bits[offset + 1] == CHAR_LF) offset++; if (offset + 1 < bits.length) din.setChunkDataStart(cidx + 1, offset + 1); } return dout; }
public static <T> T[] append(T[] a, T... b) {
  if (a == null) return b;
  T[] tmp = Arrays.copyOf(a, a.length + b.length);
  System.arraycopy(b, 0, tmp, a.length, b.length);
  return tmp;
}
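// Minimal runnable sketch of the generic varargs append (hypothetical demo method):
public static void appendDemo() {
  String[] xs = append(new String[] { "a", "b" }, "c", "d");
  System.out.println(Arrays.toString(xs)); // [a, b, c, d]
}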