Example #1
 /**
  * Create a subframe from given interval of columns.
  *
  * @param startIdx index of first column (inclusive)
  * @param endIdx index of the last column (exclusive)
  * @return a new frame containing specified interval of columns
  */
 public Frame subframe(int startIdx, int endIdx) {
   Frame result =
       new Frame(
           Arrays.copyOfRange(_names, startIdx, endIdx),
           Arrays.copyOfRange(vecs(), startIdx, endIdx));
   return result;
 }
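A minimal usage sketch (assumes an existing Frame `fr` with at least three columns; `fr` is hypothetical, not part of the snippet above):

 Frame first3 = fr.subframe(0, 3); // copies columns 0, 1, 2; endIdx 3 is exclusive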
Example #2
 /** Appends a named column, keeping the last Vec as the response */
 public void add(String name, Vec vec) {
   assert _vecs.length == 0 || anyVec().group().equals(vec.group());
   final int len = _names.length;
   _names = Arrays.copyOf(_names, len + 1);
   _vecs = Arrays.copyOf(_vecs, len + 1);
   _keys = Arrays.copyOf(_keys, len + 1);
   _names[len] = name;
   _vecs[len] = vec;
   _keys[len] = vec._key;
 }
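A hedged usage sketch (assumes a Frame `fr` and a Vec `responseVec` built in the same VectorGroup, as the assert requires; both names are hypothetical):

 fr.add("response", responseVec); // appended last, so it becomes the response column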
Example #3
 /** Removes a numbered column. */
 public Vec remove(int idx) {
   int len = _names.length;
   if (idx < 0 || idx >= len) return null;
   Vec v = _vecs[idx];
   System.arraycopy(_names, idx + 1, _names, idx, len - idx - 1);
   System.arraycopy(_vecs, idx + 1, _vecs, idx, len - idx - 1);
   System.arraycopy(_keys, idx + 1, _keys, idx, len - idx - 1);
   _names = Arrays.copyOf(_names, len - 1);
   _vecs = Arrays.copyOf(_vecs, len - 1);
   _keys = Arrays.copyOf(_keys, len - 1);
   if (v == _col0) _col0 = null;
   return v;
 }
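Usage sketch (hypothetical `fr`): the removed column is returned, while an out-of-range index returns null instead of throwing:

 Vec dropped = fr.remove(0); // drop the first column but keep a handle to its Vec
 Vec none = fr.remove(-1);   // out of range: returns null, frame left unchanged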
Example #4
 /** Appends an entire Frame */
 public Frame add(Frame fr) {
   assert anyVec().group().equals(fr.anyVec().group());
   final int len0 = _names.length;
   final int len1 = fr._names.length;
   final int len = len0 + len1;
   _names = Arrays.copyOf(_names, len);
   _vecs = Arrays.copyOf(_vecs, len);
   _keys = Arrays.copyOf(_keys, len);
   System.arraycopy(fr._names, 0, _names, len0, len1);
   System.arraycopy(fr._vecs, 0, _vecs, len0, len1);
   System.arraycopy(fr._keys, 0, _keys, len0, len1);
   return this;
 }
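Because add(Frame) returns this, appends can be chained; a sketch with hypothetical frames sharing one VectorGroup:

 fr.add(extraCols).add(moreCols); // append all columns of both frames, in order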
Example #5
 @Override
 public byte[] getChunkData(int cidx) {
   if (cidx == _cidx0) return _bits0;
   if (cidx == _cidx1) return _bits1;
   assert cidx == _cidx0 + 1 || cidx == _cidx1 + 1;
   byte[] bits = _cidx0 < _cidx1 ? _bits0 : _bits1;
   if (_cidx0 < _cidx1) {
     _cidx0 = cidx;
     _coff0 = -1;
   } else {
     _cidx1 = cidx;
     _coff1 = -1;
   }
   // Read as much as the buffer will hold
   int off = 0;
   try {
     while (off < bits.length) {
       int len = _is.read(bits, off, bits.length - off);
       if (len == -1) break;
       off += len;
     }
     assert off == bits.length || _is.available() <= 0;
   } catch (IOException ioe) {
     throw new RuntimeException(ioe);
   }
   if (off == bits.length) return bits;
   // Final read is short; cache the short-read
   byte[] bits2 = (off == 0) ? null : Arrays.copyOf(bits, off);
   if (_cidx0 == cidx) _bits0 = bits2;
   else _bits1 = bits2;
   return bits2;
 }
Example #6
  /**
   * Remove given interval of columns from frame. Motivated by R intervals.
   *
   * @param startIdx index of the first column (inclusive)
   * @param endIdx index of the last column (exclusive)
   * @return an array of the removed columns
   */
  public Vec[] remove(int startIdx, int endIdx) {
    int len = _names.length;
    int nlen = len - (endIdx - startIdx);
    String[] names = new String[nlen];
    Key[] keys = new Key[nlen];
    Vec[] vecs = new Vec[nlen];
    if (startIdx > 0) {
      System.arraycopy(_names, 0, names, 0, startIdx);
      System.arraycopy(_vecs, 0, vecs, 0, startIdx);
      System.arraycopy(_keys, 0, keys, 0, startIdx);
    }
    nlen -= startIdx;
    if (endIdx < len) {
      System.arraycopy(_names, endIdx, names, startIdx, nlen);
      System.arraycopy(_vecs, endIdx, vecs, startIdx, nlen);
      System.arraycopy(_keys, endIdx, keys, startIdx, nlen);
    }

    Vec[] vec = Arrays.copyOfRange(vecs(), startIdx, endIdx);
    _names = names;
    _vecs = vecs; // keep the remaining columns; the removed interval is returned below
    _keys = keys;
    _col0 = null;
    return vec;
  }
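Usage sketch (hypothetical `fr`): the frame keeps the surviving columns and the removed interval is handed back:

 Vec[] dropped = fr.remove(1, 3); // returns columns 1 and 2; fr retains the rest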
Example #7
  @Override
  Val apply(Env env, Env.StackHelp stk, AST asts[]) {
    Val v = stk.track(asts[1].exec(env));
    if (v instanceof ValRow) {
      ValRow vv = (ValRow) v;
      return vv.slice(asts[2].columns(vv._names));
    }
    Frame fr = v.getFrame();
    int[] cols = asts[2].columns(fr.names());

    Frame fr2 = new Frame();
    if (cols.length == 0) { // Empty inclusion list?
    } else if (cols[0] >= 0) { // Positive (inclusion) list
      if (cols[cols.length - 1] >= fr.numCols())
        throw new IllegalArgumentException(
            "Column must be an integer from 0 to " + (fr.numCols() - 1));
      for (int col : cols) fr2.add(fr.names()[col], fr.vecs()[col]);
    } else { // Negative (exclusion) list
      fr2 = new Frame(fr); // All of them at first
      Arrays.sort(cols); // This loop depends on the values in sorted order
      for (int col : cols)
        if (0 <= -col - 1 && -col - 1 < fr.numCols()) fr2.remove(-col - 1); // Remove named column
    }

    return new ValFrame(fr2);
  }
Example #8
 // Pretty-print a matrix (2D array of doubles)
 public static String pprint(double[][] arr, DecimalFormat dformat) {
   int colDim = 0;
   for( double[] line : arr )
     colDim = Math.max(colDim, line.length);
   StringBuilder sb = new StringBuilder();
   int max_width = 0;
   int[] ilengths = new int[colDim];
   Arrays.fill(ilengths, -1);
   for( double[] line : arr ) {
     for( int c = 0; c < line.length; ++c ) {
       double d = line[c];
       String dStr = dformat.format(d);
       if( dStr.indexOf('.') == -1 ) dStr += ".0";
       ilengths[c] = Math.max(ilengths[c], dStr.indexOf('.'));
       int prefix = (d >= 0 ? 1 : 2);
       max_width = Math.max(dStr.length() + prefix, max_width);
     }
   }
   for( double[] line : arr ) {
     for( int c = 0; c < line.length; ++c ) {
       double d = line[c];
       String dStr = dformat.format(d);
       if( dStr.indexOf('.') == -1 ) dStr += ".0";
       for( int x = dStr.indexOf('.'); x < ilengths[c] + 1; ++x )
         sb.append(' ');
       sb.append(dStr);
       if( dStr.indexOf('.') == -1 ) sb.append('.');
       for( int i = dStr.length() - Math.max(0, dStr.indexOf('.')); i <= 5; ++i )
         sb.append('0');
     }
     sb.append("\n");
   }
   return sb.toString();
 }
Example #9
 /** Removes the given numbered columns. */
 public Vec[] remove(int[] idxs) {
    for (int i : idxs) if (i < 0 || i >= _vecs.length) throw new ArrayIndexOutOfBoundsException();
   Arrays.sort(idxs);
   Vec[] res = new Vec[idxs.length];
   Vec[] rem = new Vec[_vecs.length - idxs.length];
   String[] names = new String[rem.length];
   Key[] keys = new Key[rem.length];
   int j = 0;
   int k = 0;
   int l = 0;
   for (int i = 0; i < _vecs.length; ++i)
     if (j < idxs.length && i == idxs[j]) {
       ++j;
       res[k++] = _vecs[i];
     } else {
       rem[l] = _vecs[i];
       names[l] = _names[i];
       keys[l] = _keys[i];
       ++l;
     }
   _vecs = rem;
   _names = names;
   _keys = keys;
   assert l == rem.length && k == idxs.length;
   return res;
 }
Example #10
  public static int[] difference(int[] a, int[] b) {
   int[] r = new int[a.length];
   int cnt = 0;
   for (int i=0; i<a.length; i++) {
     if (!contains(b, a[i])) r[cnt++] = a[i];
   }
   return Arrays.copyOf(r, cnt);
 }
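A quick sketch of the set-difference semantics (relies on the companion contains(int[], int) helper referenced above):

 int[] a = {1, 2, 3, 4};
 int[] b = {2, 4};
 int[] d = difference(a, b); // {1, 3}: elements of a absent from b, order preserved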
Example #11
 @Override
 ValFrame apply(Env env, Env.StackHelp stk, AST asts[]) {
   Frame fr = stk.track(asts[1].exec(env)).getFrame();
   double frac = asts[2].exec(env).getNum();
   double nrow = fr.numRows() * frac;
   Vec vecs[] = fr.vecs();
   long[] idxs = new long[fr.numCols()];
   int j = 0;
   for (int i = 0; i < idxs.length; i++) if (vecs[i].naCnt() < nrow) idxs[j++] = i;
   Vec vec = Vec.makeVec(Arrays.copyOf(idxs, j), null, Vec.VectorGroup.VG_LEN1.addVec());
   return new ValFrame(new Frame(vec));
 }
Example #12
 public static byte [] unzipBytes(byte [] bs, Compression cmp) {
   InputStream is = null;
   int off = 0;
   try {
     switch(cmp) {
     case NONE: // No compression
       return bs;
     case ZIP: {
       ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(bs));
       ZipEntry ze = zis.getNextEntry(); // Get the *FIRST* entry
       // There is at least one entry in zip file and it is not a directory.
       if( ze != null && !ze.isDirectory() ) {
         is = zis;
         break;
       }
       zis.close();
       return bs; // Don't crash, ignore file if cannot unzip
     }
     case GZIP:
       is = new GZIPInputStream(new ByteArrayInputStream(bs));
       break;
     default:
       assert false:"cmp = " + cmp;
     }
     // If reading from a compressed stream, estimate we can read 2x uncompressed
     assert( is != null ):"is is NULL, cmp = " + cmp;
     bs = new byte[bs.length * 2];
     // Now read from the (possibly compressed) stream
     while( off < bs.length ) {
       int len = is.read(bs, off, bs.length - off);
       if( len < 0 )
         break;
       off += len;
        if( off == bs.length ) { // Dataset is uncompressing a lot! Need more space...
         if( bs.length >= ValueArray.CHUNK_SZ )
           break; // Already got enough
         bs = Arrays.copyOf(bs, bs.length * 2);
       }
     }
   } catch( IOException ioe ) { // Stop at any io error
     Log.err(ioe);
   } finally {
     Utils.close(is);
   }
   return bs;
 }
Example #13
  public static String sampleToString(int[] val, int max) {
    if (val == null || val.length < max) return Arrays.toString(val);

    StringBuilder b = new StringBuilder();
    b.append('[');
    max -= 10;
    int valMax = val.length -1;
    for (int i = 0; ; i++) {
        b.append(val[i]);
        if (i == max) {
          b.append(", ...");
          i = val.length - 10;
        }
        if ( i == valMax) {
          return b.append(']').toString();
        }
        b.append(", ");
    }
  }
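A sketch of the truncation behavior for an array at least max elements long (values illustrative):

 int[] big = new int[100];
 for (int i = 0; i < big.length; i++) big[i] = i;
 String s = sampleToString(big, 20);
 // Prints a head (indices 0..10 here), ", ...", then the tail (indices 91..99),
 // instead of all 100 values.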
Example #14
 public static String join(char sep, Object[] array) {
   return join(sep, Arrays.asList(array));
 }
Example #15
 public static double[] append(double[] a, double e) {
   a = Arrays.copyOf(a,a.length+1);
   a[a.length-1] = e;
   return a;
 }
Example #16
 public static int[] remove(int[] a, int i) {
   int[] tmp = Arrays.copyOf(a,a.length-1);
   System.arraycopy(a,i+1,tmp,i,tmp.length-i);
   return tmp;
 }
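Usage sketch; a new array is returned and the input is left unmodified:

 int[] a = {10, 20, 30};
 int[] r = remove(a, 1); // {10, 30}; a itself is untouched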
Example #17
    /**
     * Train a Deep Learning neural net model
     *
     * @param model Input model (e.g., from initModel(), or from a previous training run)
     * @return Trained model
     */
    public final DeepLearningModel trainModel(DeepLearningModel model) {
      Frame validScoreFrame = null;
      Frame train, trainScoreFrame;
      try {
        //      if (checkpoint == null && !quiet_mode) logStart(); //if checkpoint is given, some
        // Job's params might be uninitialized (but the restarted model's parameters are correct)
        if (model == null) {
          model = DKV.get(dest()).get();
        }
        Log.info(
            "Model category: "
                + (_parms._autoencoder
                    ? "Auto-Encoder"
                    : isClassifier() ? "Classification" : "Regression"));
        final long model_size = model.model_info().size();
        Log.info(
            "Number of model parameters (weights/biases): " + String.format("%,d", model_size));
        model.write_lock(_job);
        _job.update(0, "Setting up training data...");
        final DeepLearningParameters mp = model.model_info().get_params();

        // temporary frames of the same "name" as the orig _train/_valid (asking the parameter's
        // Key, not the actual frame)
        // Note: don't put into DKV or they would overwrite the _train/_valid frames!
        Frame tra_fr = new Frame(mp._train, _train.names(), _train.vecs());
        Frame val_fr = _valid != null ? new Frame(mp._valid, _valid.names(), _valid.vecs()) : null;

        train = tra_fr;
        if (model._output.isClassifier() && mp._balance_classes) {
          _job.update(0, "Balancing class distribution of training data...");
          float[] trainSamplingFactors =
              new float
                  [train
                      .lastVec()
                      .domain()
                      .length]; // leave initialized to 0 -> will be filled up below
          if (mp._class_sampling_factors != null) {
            if (mp._class_sampling_factors.length != train.lastVec().domain().length)
              throw new IllegalArgumentException(
                  "class_sampling_factors must have "
                      + train.lastVec().domain().length
                      + " elements");
            trainSamplingFactors =
                mp._class_sampling_factors.clone(); // clone: don't modify the original
          }
          train =
              sampleFrameStratified(
                  train,
                  train.lastVec(),
                  train.vec(model._output.weightsName()),
                  trainSamplingFactors,
                  (long) (mp._max_after_balance_size * train.numRows()),
                  mp._seed,
                  true,
                  false);
          Vec l = train.lastVec();
          Vec w = train.vec(model._output.weightsName());
          MRUtils.ClassDist cd = new MRUtils.ClassDist(l);
          model._output._modelClassDist =
              _weights != null ? cd.doAll(l, w).rel_dist() : cd.doAll(l).rel_dist();
        }
        model.training_rows = train.numRows();
        if (_weights != null && _weights.min() == 0 && _weights.max() == 1 && _weights.isInt()) {
          model.training_rows = Math.round(train.numRows() * _weights.mean());
          Log.warn(
              "Not counting "
                  + (train.numRows() - model.training_rows)
                  + " rows with weight=0 towards an epoch.");
        }
        Log.info("One epoch corresponds to " + model.training_rows + " training data rows.");
        trainScoreFrame =
            sampleFrame(
                train,
                mp._score_training_samples,
                mp._seed); // training scoring dataset is always sampled uniformly from the training
                           // dataset
        if (trainScoreFrame != train) Scope.track(trainScoreFrame);

        if (!_parms._quiet_mode)
          Log.info("Number of chunks of the training data: " + train.anyVec().nChunks());
        if (val_fr != null) {
          model.validation_rows = val_fr.numRows();
          // validation scoring dataset can be sampled in multiple ways from the given validation
          // dataset
          if (model._output.isClassifier()
              && mp._balance_classes
              && mp._score_validation_sampling
                  == DeepLearningParameters.ClassSamplingMethod.Stratified) {
            _job.update(0, "Sampling validation data (stratified)...");
            validScoreFrame =
                sampleFrameStratified(
                    val_fr,
                    val_fr.lastVec(),
                    val_fr.vec(model._output.weightsName()),
                    null,
                    mp._score_validation_samples > 0
                        ? mp._score_validation_samples
                        : val_fr.numRows(),
                    mp._seed + 1,
                    false /* no oversampling */,
                    false);
          } else {
            _job.update(0, "Sampling validation data...");
            validScoreFrame = sampleFrame(val_fr, mp._score_validation_samples, mp._seed + 1);
            if (validScoreFrame != val_fr) Scope.track(validScoreFrame);
          }
          if (!_parms._quiet_mode)
            Log.info(
                "Number of chunks of the validation data: " + validScoreFrame.anyVec().nChunks());
        }

        // Set train_samples_per_iteration size (cannot be done earlier since this depends on
        // whether stratified sampling is done)
        model.actual_train_samples_per_iteration =
            computeTrainSamplesPerIteration(mp, model.training_rows, model);
        // Determine whether shuffling is enforced
        if (mp._replicate_training_data
            && (model.actual_train_samples_per_iteration
                == model.training_rows * (mp._single_node_mode ? 1 : H2O.CLOUD.size()))
            && !mp._shuffle_training_data
            && H2O.CLOUD.size() > 1
            && !mp._reproducible) {
          if (!mp._quiet_mode)
            Log.info(
                "Enabling training data shuffling, because all nodes train on the full dataset (replicated training data).");
          mp._shuffle_training_data = true;
        }
        if (!mp._shuffle_training_data
            && model.actual_train_samples_per_iteration == model.training_rows
            && train.anyVec().nChunks() == 1) {
          if (!mp._quiet_mode)
            Log.info(
                "Enabling training data shuffling to avoid training rows in the same order over and over (no Hogwild since there's only 1 chunk).");
          mp._shuffle_training_data = true;
        }

        //        if (!mp._quiet_mode) Log.info("Initial model:\n" + model.model_info());
        long now = System.currentTimeMillis();
        model._timeLastIterationEnter = now;
        if (_parms._autoencoder) {
          _job.update(0, "Scoring null model of autoencoder...");
          if (!mp._quiet_mode) Log.info("Scoring the null model of the autoencoder.");
          model.doScoring(
              trainScoreFrame,
              validScoreFrame,
              _job._key,
              0,
              false); // get the null model reconstruction error
        }
        // put the initial version of the model into DKV
        model.update(_job);
        model.total_setup_time_ms += now - _job.start_time();
        Log.info("Total setup time: " + PrettyPrint.msecs(model.total_setup_time_ms, true));
        Log.info("Starting to train the Deep Learning model.");
        _job.update(0, "Training...");

        // main loop
        for (; ; ) {
          model.iterations++;
          model.set_model_info(
              mp._epochs == 0
                  ? model.model_info()
                  : H2O.CLOUD.size() > 1 && mp._replicate_training_data
                      ? (mp._single_node_mode
                          ? new DeepLearningTask2(
                                  _job._key,
                                  train,
                                  model.model_info(),
                                  rowFraction(train, mp, model),
                                  model.iterations)
                              .doAll(Key.make(H2O.SELF))
                              .model_info()
                          : // replicated data + single node mode
                          new DeepLearningTask2(
                                  _job._key,
                                  train,
                                  model.model_info(),
                                  rowFraction(train, mp, model),
                                  model.iterations)
                              .doAllNodes()
                              .model_info())
                      : // replicated data + multi-node mode
                      new DeepLearningTask(
                              _job._key,
                              model.model_info(),
                              rowFraction(train, mp, model),
                              model.iterations)
                          .doAll(train)
                          .model_info()); // distributed data (always in multi-node mode)
          if (stop_requested() && !timeout()) break; // cancellation
          if (!model.doScoring(
              trainScoreFrame, validScoreFrame, _job._key, model.iterations, false))
            break; // finished training (or early stopping or convergence)
          if (timeout()) break; // stop after scoring
        }

        // replace the model with the best model so far (if it's better)
        if (!stop_requested()
            && _parms._overwrite_with_best_model
            && model.actual_best_model_key != null
            && _parms._nfolds == 0) {
          DeepLearningModel best_model = DKV.getGet(model.actual_best_model_key);
          if (best_model != null
              && best_model.loss() < model.loss()
              && Arrays.equals(best_model.model_info().units, model.model_info().units)) {
            if (!_parms._quiet_mode)
              Log.info("Setting the model to be the best model so far (based on scoring history).");
            DeepLearningModelInfo mi = best_model.model_info().deep_clone();
            // Don't cheat - count full amount of training samples, since that's the amount of
            // training it took to train (without finding anything better)
            mi.set_processed_global(model.model_info().get_processed_global());
            mi.set_processed_local(model.model_info().get_processed_local());
            model.set_model_info(mi);
            model.update(_job);
            model.doScoring(trainScoreFrame, validScoreFrame, _job._key, model.iterations, true);
            assert (best_model.loss() == model.loss());
          }
        }
        // store coefficient names for future use
        // possibly change
        model.model_info().data_info().coefNames();
        if (!_parms._quiet_mode) {
          Log.info(
              "==============================================================================================================================================================================");
          if (stop_requested()) {
            Log.info("Deep Learning model training was interrupted.");
          } else {
            Log.info("Finished training the Deep Learning model.");
            Log.info(model);
          }
          Log.info(
              "==============================================================================================================================================================================");
        }
      } finally {
        if (model != null) {
          model.deleteElasticAverageModels();
          model.unlock(_job);
          if (model.actual_best_model_key != null) {
            assert (model.actual_best_model_key != model._key);
            DKV.remove(model.actual_best_model_key);
          }
        }
      }
      return model;
    }
Example #18
 public static float[] div(float[] nums, float n) {
    assert !Float.isInfinite(n) : "Trying to divide " + Arrays.toString(nums) + " by " + n; // dividing by infinity is almost surely not what you want
   for (int i=0; i<nums.length; i++) nums[i] = nums[i] / n;
   return nums;
 }
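Note that div mutates its argument and returns the same array; a sketch:

 float[] xs = {2f, 4f, 6f};
 float[] ys = div(xs, 2f); // {1, 2, 3}; ys == xs, modified in place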
Example #19
 public static <T> T[] subarray(T[] a, int off, int len) {
   return Arrays.copyOfRange(a,off,off+len);
 }
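A one-line sketch of the offset/length convention:

 Integer[] a = {1, 2, 3, 4};
 Integer[] s = subarray(a, 1, 2); // {2, 3}: len elements starting at off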
Example #20
  @Override
  Val apply(Env env, Env.StackHelp stk, AST asts[]) {

    // Execute all args.  Find a canonical frame; all Frames must look like this one.
    // Each argument turns into either a Frame (whose rows are entirely
    // inlined) or a scalar (which is replicated across as a single row).
    Frame fr = null; // Canonical Frame; all frames have the same column count, types and names
    int nchks = 0; // Total chunks
    Val vals[] = new Val[asts.length]; // Computed AST results
    for (int i = 1; i < asts.length; i++) {
      vals[i] = stk.track(asts[i].exec(env));
      if (vals[i].isFrame()) {
        fr = vals[i].getFrame();
        nchks += fr.anyVec().nChunks(); // Total chunks
      } else nchks++; // One chunk per scalar
    }
    // No Frame, just a pile-o-scalars?
    Vec zz = null; // The zero-length vec for the zero-frame frame
    if (fr == null) { // Zero-length, 1-column, default name
      fr = new Frame(new String[] {Frame.defaultColName(0)}, new Vec[] {zz = Vec.makeZero(0)});
      if (asts.length == 1) return new ValFrame(fr);
    }

    // Verify all Frames are the same columns, names, and types.  Domains can vary, and will be the
    // union
    final Frame frs[] = new Frame[asts.length]; // Input frame
    final byte[] types = fr.types(); // Column types
    final int ncols = fr.numCols();
    final long[] espc = new long[nchks + 1]; // Compute a new layout!
    int coffset = 0;

    for (int i = 1; i < asts.length; i++) {
      Val val = vals[i]; // Save values computed for pass 2
      Frame fr0 =
          val.isFrame()
              ? val.getFrame()
              // Scalar: auto-expand into a 1-row frame
              : stk.track(new Frame(fr._names, Vec.makeCons(val.getNum(), 1L, fr.numCols())));

      // Check that all frames are compatible
      if (fr.numCols() != fr0.numCols())
        throw new IllegalArgumentException(
            "rbind frames must have all the same columns, found "
                + fr.numCols()
                + " and "
                + fr0.numCols()
                + " columns.");
      if (!Arrays.deepEquals(fr._names, fr0._names))
        throw new IllegalArgumentException(
            "rbind frames must have all the same column names, found "
                + Arrays.toString(fr._names)
                + " and "
                + Arrays.toString(fr0._names));
      if (!Arrays.equals(types, fr0.types()))
        throw new IllegalArgumentException(
            "rbind frames must have all the same column types, found "
                + Arrays.toString(types)
                + " and "
                + Arrays.toString(fr0.types()));

      frs[i] = fr0; // Save frame

      // Roll up the ESPC row counts
      long roffset = espc[coffset];
      long[] espc2 = fr0.anyVec().espc();
      for (int j = 1; j < espc2.length; j++) // Roll up the row counts
        espc[coffset + j] = roffset + espc2[j];
      coffset += espc2.length - 1; // Chunk offset
    }
    if (zz != null) zz.remove();

    // build up the new domains for each vec
    HashMap<String, Integer>[] dmap = new HashMap[types.length];
    String[][] domains = new String[types.length][];
    int[][][] cmaps = new int[types.length][][];
    for (int k = 0; k < types.length; ++k) {
      dmap[k] = new HashMap<>();
      int c = 0;
      byte t = types[k];
      if (t == Vec.T_CAT) {
        int[][] maps = new int[frs.length][];
        for (int i = 1; i < frs.length; i++) {
          maps[i] = new int[frs[i].vec(k).domain().length];
          for (int j = 0; j < maps[i].length; j++) {
            String s = frs[i].vec(k).domain()[j];
            if (!dmap[k].containsKey(s)) dmap[k].put(s, maps[i][j] = c++);
            else maps[i][j] = dmap[k].get(s);
          }
        }
        cmaps[k] = maps;
      } else {
        cmaps[k] = new int[frs.length][];
      }
      domains[k] = c == 0 ? null : new String[c];
      for (Map.Entry<String, Integer> e : dmap[k].entrySet()) domains[k][e.getValue()] = e.getKey();
    }

    // Now make Keys for the new Vecs
    Key<Vec>[] keys = fr.anyVec().group().addVecs(fr.numCols());
    Vec[] vecs = new Vec[fr.numCols()];
    int rowLayout = Vec.ESPC.rowLayout(keys[0], espc);
    for (int i = 0; i < vecs.length; i++)
      vecs[i] = new Vec(keys[i], rowLayout, domains[i], types[i]);

    // Do the row-binds column-by-column.
    // Switch to F/J thread for continuations
    ParallelRbinds t;
    H2O.submitTask(t = new ParallelRbinds(frs, espc, vecs, cmaps)).join();
    return new ValFrame(new Frame(fr.names(), t._vecs));
  }
Example #21
    /**
     * Train a Deep Learning model, assumes that all members are populated If checkpoint == null,
     * then start training a new model, otherwise continue from a checkpoint
     */
    public final void buildModel() {
      DeepLearningModel cp = null;
      if (_parms._checkpoint == null) {
        cp =
            new DeepLearningModel(
                dest(),
                _parms,
                new DeepLearningModel.DeepLearningModelOutput(DeepLearning.this),
                _train,
                _valid,
                nclasses());
        cp.model_info().initializeMembers();
      } else {
        final DeepLearningModel previous = DKV.getGet(_parms._checkpoint);
        if (previous == null) throw new IllegalArgumentException("Checkpoint not found.");
        Log.info("Resuming from checkpoint.");
        _job.update(0, "Resuming from checkpoint");

        if (isClassifier() != previous._output.isClassifier())
          throw new H2OIllegalArgumentException(
              "Response type must be the same as for the checkpointed model.");
        if (isSupervised() != previous._output.isSupervised())
          throw new H2OIllegalArgumentException(
              "Model type must be the same as for the checkpointed model.");

        // check the user-given arguments for consistency
        DeepLearningParameters oldP =
            previous._parms; // sanitized parameters for checkpointed model
        DeepLearningParameters newP = _parms; // user-given parameters for restart

        DeepLearningParameters oldP2 = (DeepLearningParameters) oldP.clone();
        DeepLearningParameters newP2 = (DeepLearningParameters) newP.clone();
        DeepLearningParameters.Sanity.modifyParms(
            oldP, oldP2, nclasses()); // sanitize the user-given parameters
        DeepLearningParameters.Sanity.modifyParms(
            newP, newP2, nclasses()); // sanitize the user-given parameters
        DeepLearningParameters.Sanity.checkpoint(oldP2, newP2);

        DataInfo dinfo;
        try {
          // PUBDEV-2513: Adapt _train and _valid (in-place) to match the frames that were used for
          // the previous model
          // This can add or remove dummy columns (can happen if the dataset is sparse and datasets
          // have different non-const columns)
          for (String st : previous.adaptTestForTrain(_train, true, false)) Log.warn(st);
          for (String st : previous.adaptTestForTrain(_valid, true, false)) Log.warn(st);
          dinfo = makeDataInfo(_train, _valid, _parms, nclasses());
          DKV.put(dinfo);
          cp = new DeepLearningModel(dest(), _parms, previous, false, dinfo);
          cp.write_lock(_job);

          if (!Arrays.equals(cp._output._names, previous._output._names)) {
            throw new H2OIllegalArgumentException(
                "The columns of the training data must be the same as for the checkpointed model. Check ignored columns (or disable ignore_const_cols).");
          }
          if (!Arrays.deepEquals(cp._output._domains, previous._output._domains)) {
            throw new H2OIllegalArgumentException(
                "Categorical factor levels of the training data must be the same as for the checkpointed model.");
          }
          if (dinfo.fullN() != previous.model_info().data_info().fullN()) {
            throw new H2OIllegalArgumentException(
                "Total number of predictors is different than for the checkpointed model.");
          }
          if (_parms._epochs <= previous.epoch_counter) {
            throw new H2OIllegalArgumentException(
                "Total number of epochs must be larger than the number of epochs already trained for the checkpointed model ("
                    + previous.epoch_counter
                    + ").");
          }

          // these are the mutable parameters that are to be used by the model (stored in
          // model_info._parms)
          final DeepLearningParameters actualNewP =
              cp.model_info()
                  .get_params(); // actually used parameters for model building (defaults filled in,
                                 // etc.)
          assert (actualNewP != previous.model_info().get_params());
          assert (actualNewP != newP);
          assert (actualNewP != oldP);
          DeepLearningParameters.Sanity.update(actualNewP, newP, nclasses());

          Log.info(
              "Continuing training after "
                  + String.format("%.3f", previous.epoch_counter)
                  + " epochs from the checkpointed model.");
          cp.update(_job);
        } catch (H2OIllegalArgumentException ex) {
          if (cp != null) {
            cp.unlock(_job);
            cp.delete();
            cp = null;
          }
          throw ex;
        } finally {
          if (cp != null) cp.unlock(_job);
        }
      }
      trainModel(cp);

      // clean up, but don't delete weights and biases if user asked for export
      List<Key> keep = new ArrayList<>();
      try {
        if (_parms._export_weights_and_biases
            && cp._output.weights != null
            && cp._output.biases != null) {
          for (Key k : Arrays.asList(cp._output.weights)) {
            keep.add(k);
            for (Vec vk : ((Frame) DKV.getGet(k)).vecs()) {
              keep.add(vk._key);
            }
          }
          for (Key k : Arrays.asList(cp._output.biases)) {
            keep.add(k);
            for (Vec vk : ((Frame) DKV.getGet(k)).vecs()) {
              keep.add(vk._key);
            }
          }
        }
      } finally {
        Scope.exit(keep.toArray(new Key[keep.size()]));
      }
    }
Example #22
 public static long[][][] append(long[][][] a, long[][] e) {
   a = Arrays.copyOf(a,a.length+1);
   a[a.length-1] = e;
   return a;
 }
Example #23
  @SuppressWarnings("fallthrough")
  @Override
  public final DataOut parallelParse(
      int cidx, final CustomParser.DataIn din, final CustomParser.DataOut dout) {
    ValueString _str = new ValueString();
    byte[] bits = din.getChunkData(cidx);
    if (bits == null) return dout;
    int offset = din.getChunkDataStart(cidx); // General cursor into the giant array of bytes
    final byte[] bits0 = bits; // Bits for chunk0
    boolean firstChunk = true; // Have not rolled into the 2nd chunk
    byte[] bits1 = null; // Bits for chunk1, loaded lazily.
    // Starting state.  Are we skipping the first (partial) line, or not?  Skip
    // a header line, or a partial line if we're in the 2nd and later chunks.
    int state = (_setup._header || cidx > 0) ? SKIP_LINE : WHITESPACE_BEFORE_TOKEN;
    // If handed a skipping offset, then it points just past the prior partial line.
    if (offset >= 0) state = WHITESPACE_BEFORE_TOKEN;
    else offset = 0; // Else start skipping at the start
    int quotes = 0;
    long number = 0;
    int exp = 0;
    int sgn_exp = 1;
    boolean decimal = false;
    int fractionDigits = 0;
    int numStart = 0;
    int tokenStart = 0; // used for numeric token to backtrace if not successful
    int colIdx = 0;
    byte c = bits[offset];
    // skip comments for the first chunk (or if not a chunk)
    if (cidx == 0) {
      while (c == '#' || c == '@' /*also treat as comments leading '@' from ARFF format*/) {
        while ((offset < bits.length) && (bits[offset] != CHAR_CR) && (bits[offset] != CHAR_LF))
          ++offset;
        if ((offset + 1 < bits.length)
            && (bits[offset] == CHAR_CR)
            && (bits[offset + 1] == CHAR_LF)) ++offset;
        ++offset;
        if (offset >= bits.length) return dout;
        c = bits[offset];
      }
    }
    dout.newLine();

    MAIN_LOOP:
    while (true) {
      NEXT_CHAR:
      switch (state) {
          // ---------------------------------------------------------------------
        case SKIP_LINE:
          if (isEOL(c)) {
            state = EOL;
          } else {
            break NEXT_CHAR;
          }
          continue MAIN_LOOP;
          // ---------------------------------------------------------------------
        case EXPECT_COND_LF:
          state = POSSIBLE_EMPTY_LINE;
          if (c == CHAR_LF) break NEXT_CHAR;
          continue MAIN_LOOP;
          // ---------------------------------------------------------------------
        case STRING:
          if (c == quotes) {
            state = COND_QUOTE;
            break NEXT_CHAR;
          }
          if (!isEOL(c) && ((quotes != 0) || (c != CHAR_SEPARATOR))) {
            _str.addChar();
            break NEXT_CHAR;
          }
          // fallthrough to STRING_END
          // ---------------------------------------------------------------------
        case STRING_END:
          if ((c != CHAR_SEPARATOR) && (c == CHAR_SPACE)) break NEXT_CHAR;
          // we have parsed the string enum correctly
          if ((_str.get_off() + _str.get_length())
              > _str.get_buf().length) { // crossing chunk boundary
            assert _str.get_buf() != bits;
            _str.addBuff(bits);
          }
          dout.addStrCol(colIdx, _str);
          _str.set(null, 0, 0);
          ++colIdx;
          state = SEPARATOR_OR_EOL;
          // fallthrough to SEPARATOR_OR_EOL
          // ---------------------------------------------------------------------
        case SEPARATOR_OR_EOL:
          if (c == CHAR_SEPARATOR) {
            state = WHITESPACE_BEFORE_TOKEN;
            break NEXT_CHAR;
          }
          if (c == CHAR_SPACE) break NEXT_CHAR;
          // fallthrough to EOL
          // ---------------------------------------------------------------------
        case EOL:
          if (quotes != 0) {
            System.err.println(
                "Unmatched quote char "
                    + ((char) quotes)
                    + " "
                    + (((_str.get_length() + 1) < offset && _str.get_off() > 0)
                        ? new String(Arrays.copyOfRange(bits, _str.get_off() - 1, offset))
                        : ""));
            dout.invalidLine("Unmatched quote char " + ((char) quotes));
            colIdx = 0;
            quotes = 0;
          } else if (colIdx != 0) {
            dout.newLine();
            colIdx = 0;
          }
          state = (c == CHAR_CR) ? EXPECT_COND_LF : POSSIBLE_EMPTY_LINE;
          if (!firstChunk) break MAIN_LOOP; // second chunk only does the first row
          break NEXT_CHAR;
          // ---------------------------------------------------------------------
        case POSSIBLE_CURRENCY:
          if (((c >= '0') && (c <= '9'))
              || (c == '-')
              || (c == CHAR_DECIMAL_SEPARATOR)
              || (c == '+')) {
            state = TOKEN;
          } else {
            _str.set(bits, offset - 1, 0);
            _str.addChar();
            if (c == quotes) {
              state = COND_QUOTE;
              break NEXT_CHAR;
            }
            if ((quotes != 0) || ((!isEOL(c) && (c != CHAR_SEPARATOR)))) {
              state = STRING;
            } else {
              state = STRING_END;
            }
          }
          continue MAIN_LOOP;
          // ---------------------------------------------------------------------
        case POSSIBLE_EMPTY_LINE:
          if (isEOL(c)) {
            if (c == CHAR_CR) state = EXPECT_COND_LF;
            break NEXT_CHAR;
          }
          state = WHITESPACE_BEFORE_TOKEN;
          // fallthrough to WHITESPACE_BEFORE_TOKEN
          // ---------------------------------------------------------------------
        case WHITESPACE_BEFORE_TOKEN:
          if (c == CHAR_SPACE || (c == CHAR_TAB && CHAR_TAB != CHAR_SEPARATOR)) {
            break NEXT_CHAR;
          } else if (c == CHAR_SEPARATOR) {
            // we have empty token, store as NaN
            dout.addInvalidCol(colIdx++);
            break NEXT_CHAR;
          } else if (isEOL(c)) {
            dout.addInvalidCol(colIdx++);
            state = EOL;
            continue MAIN_LOOP;
          }
          // fallthrough to COND_QUOTED_TOKEN
          // ---------------------------------------------------------------------
        case COND_QUOTED_TOKEN:
          state = TOKEN;
          if (CHAR_SEPARATOR != HIVE_SEP
              && // Only allow quoting in CSV not Hive files
              ((_setup._singleQuotes && c == CHAR_SINGLE_QUOTE) || (c == CHAR_DOUBLE_QUOTE))) {
            assert (quotes == 0);
            quotes = c;
            break NEXT_CHAR;
          }
          // fallthrough to TOKEN
          // ---------------------------------------------------------------------
        case TOKEN:
          if (dout.isString(colIdx)) { // Forced already to a string col?
            state = STRING; // Do not attempt a number parse, just do a string parse
            _str.set(bits, offset, 0);
            continue MAIN_LOOP;
          } else if (((c >= '0') && (c <= '9'))
              || (c == '-')
              || (c == CHAR_DECIMAL_SEPARATOR)
              || (c == '+')) {
            state = NUMBER;
            number = 0;
            fractionDigits = 0;
            decimal = false;
            numStart = offset;
            tokenStart = offset;
            if (c == '-') {
              exp = -1;
              ++numStart;
              break NEXT_CHAR;
            } else if (c == '+') {
              exp = 1;
              ++numStart;
              break NEXT_CHAR;
            } else {
              exp = 1;
            }
            // fallthrough
          } else if (c == '$') {
            state = POSSIBLE_CURRENCY;
            break NEXT_CHAR;
          } else {
            state = STRING;
            _str.set(bits, offset, 0);
            continue MAIN_LOOP;
          }
          // fallthrough to NUMBER
          // ---------------------------------------------------------------------
        case NUMBER:
          if ((c >= '0') && (c <= '9')) {
            number = (number * 10) + (c - '0');
            if (number >= LARGEST_DIGIT_NUMBER) state = NUMBER_SKIP;
            break NEXT_CHAR;
          } else if (c == CHAR_DECIMAL_SEPARATOR) {
            ++numStart;
            state = NUMBER_FRACTION;
            fractionDigits = offset;
            decimal = true;
            break NEXT_CHAR;
          } else if ((c == 'e') || (c == 'E')) {
            ++numStart;
            state = NUMBER_EXP_START;
            sgn_exp = 1;
            break NEXT_CHAR;
          }
          if (exp == -1) {
            number = -number;
          }
          exp = 0;
          // fallthrough to COND_QUOTED_NUMBER_END
          // ---------------------------------------------------------------------
        case COND_QUOTED_NUMBER_END:
          if (c == quotes) {
            state = NUMBER_END;
            quotes = 0;
            break NEXT_CHAR;
          }
          // fallthrough NUMBER_END
        case NUMBER_END:
          if (c == CHAR_SEPARATOR && quotes == 0) {
            exp = exp - fractionDigits;
            dout.addNumCol(colIdx, number, exp);
            ++colIdx;
            // do separator state here too
            state = WHITESPACE_BEFORE_TOKEN;
            break NEXT_CHAR;
          } else if (isEOL(c)) {
            exp = exp - fractionDigits;
            dout.addNumCol(colIdx, number, exp);
            // do EOL here for speedup reasons
            colIdx = 0;
            dout.newLine();
            state = (c == CHAR_CR) ? EXPECT_COND_LF : POSSIBLE_EMPTY_LINE;
            if (!firstChunk) break MAIN_LOOP; // second chunk only does the first row
            break NEXT_CHAR;
          } else if ((c == '%')) {
            state = NUMBER_END;
            exp -= 2;
            break NEXT_CHAR;
          } else if ((c != CHAR_SEPARATOR) && ((c == CHAR_SPACE) || (c == CHAR_TAB))) {
            state = NUMBER_END;
            break NEXT_CHAR;
          } else {
            state = STRING;
            offset = tokenStart - 1;
            _str.set(bits, tokenStart, 0);
            break NEXT_CHAR; // parse as String token now
          }
          // ---------------------------------------------------------------------
        case NUMBER_SKIP:
          ++numStart;
          if ((c >= '0') && (c <= '9')) {
            break NEXT_CHAR;
          } else if (c == CHAR_DECIMAL_SEPARATOR) {
            state = NUMBER_SKIP_NO_DOT;
            break NEXT_CHAR;
          } else if ((c == 'e') || (c == 'E')) {
            state = NUMBER_EXP_START;
            sgn_exp = 1;
            break NEXT_CHAR;
          }
          state = COND_QUOTED_NUMBER_END;
          continue MAIN_LOOP;
          // ---------------------------------------------------------------------
        case NUMBER_SKIP_NO_DOT:
          ++numStart;
          if ((c >= '0') && (c <= '9')) {
            break NEXT_CHAR;
          } else if ((c == 'e') || (c == 'E')) {
            state = NUMBER_EXP_START;
            sgn_exp = 1;
            break NEXT_CHAR;
          }
          state = COND_QUOTED_NUMBER_END;
          continue MAIN_LOOP;
          // ---------------------------------------------------------------------
        case NUMBER_FRACTION:
          if ((c >= '0') && (c <= '9')) {
            if (number >= LARGEST_DIGIT_NUMBER) {
              if (decimal) fractionDigits = offset - 1 - fractionDigits;
              state = NUMBER_SKIP;
            } else {
              number = (number * 10) + (c - '0');
            }
            break NEXT_CHAR;
          } else if ((c == 'e') || (c == 'E')) {
            ++numStart;
            if (decimal) fractionDigits = offset - 1 - fractionDigits;
            state = NUMBER_EXP_START;
            sgn_exp = 1;
            break NEXT_CHAR;
          }
          state = COND_QUOTED_NUMBER_END;
          if (decimal) fractionDigits = offset - fractionDigits - 1;
          if (exp == -1) {
            number = -number;
          }
          exp = 0;
          continue MAIN_LOOP;
          // ---------------------------------------------------------------------
        case NUMBER_EXP_START:
          if (exp == -1) {
            number = -number;
          }
          exp = 0;
          if (c == '-') {
            ++numStart;
            sgn_exp *= -1;
            break NEXT_CHAR;
          } else if (c == '+') {
            ++numStart;
            break NEXT_CHAR;
          }
          if ((c < '0') || (c > '9')) {
            state = STRING;
            offset = tokenStart - 1;
            _str.set(bits, tokenStart, 0);
            break NEXT_CHAR; // parse as String token now
          }
          state = NUMBER_EXP; // fall through to NUMBER_EXP
          // ---------------------------------------------------------------------
        case NUMBER_EXP:
          if ((c >= '0') && (c <= '9')) {
            ++numStart;
            exp = (exp * 10) + (c - '0');
            break NEXT_CHAR;
          }
          exp *= sgn_exp;
          state = COND_QUOTED_NUMBER_END;
          continue MAIN_LOOP;

          // ---------------------------------------------------------------------
        case COND_QUOTE:
          if (c == quotes) {
            _str.addChar();
            _str.skipChar();
            state = STRING;
            break NEXT_CHAR;
          } else {
            quotes = 0;
            state = STRING_END;
            continue MAIN_LOOP;
          }
          // ---------------------------------------------------------------------
        default:
          assert (false) : " We have wrong state " + state;
      } // end NEXT_CHAR
      ++offset; // no need to adjust for the offset increase here - the offset is set to tokenStart-1!
      if (offset < 0) { // Offset is negative?
        assert !firstChunk; // Caused by backing up from 2nd chunk into 1st chunk
        firstChunk = true;
        bits = bits0;
        offset += bits.length;
        _str.set(bits, offset, 0);
      } else if (offset >= bits.length) { // Off end of 1st chunk?  Parse into 2nd chunk
        // Attempt to get more data.
        if (firstChunk && bits1 == null) bits1 = din.getChunkData(cidx + 1);
        // if we can't get further we might have been the last one and we must
        // commit the latest guy if we had one.
        if (!firstChunk || bits1 == null) { // No more data available or allowed
          // If we are mid-parse of something, act like we saw a LF to end the
          // current token.
          if ((state != EXPECT_COND_LF) && (state != POSSIBLE_EMPTY_LINE)) {
            c = CHAR_LF;
            continue MAIN_LOOP;
          }
          break MAIN_LOOP; // Else we are just done
        }

        // Now parsing in the 2nd chunk.  All offsets relative to the 2nd chunk start.
        firstChunk = false;
        numStart -= bits.length;
        if (state == NUMBER_FRACTION) fractionDigits -= bits.length;
        offset -= bits.length;
        tokenStart -= bits.length;
        bits = bits1; // Set main parsing loop bits
        if (bits[0] == CHAR_LF && state == EXPECT_COND_LF)
          break MAIN_LOOP; // when the first character we see is a line end
      }
      c = bits[offset];
      if (isEOL(c) && state != COND_QUOTE && quotes != 0)
        state = EOL; // quoted string containing a newline character => fail the line!
    } // end MAIN_LOOP
    if (colIdx == 0) dout.rollbackLine();
    // If offset is still validly within the buffer, save it so the next pass
    // can start from there.
    if (offset + 1 < bits.length) {
      if (state == EXPECT_COND_LF && bits[offset + 1] == CHAR_LF) offset++;
      if (offset + 1 < bits.length) din.setChunkDataStart(cidx + 1, offset + 1);
    }
    return dout;
  }
Example #24
 public static <T> T[] append(T[] a, T... b) {
   if( a==null ) return b;
   T[] tmp = Arrays.copyOf(a,a.length+b.length);
   System.arraycopy(b,0,tmp,a.length,b.length);
   return tmp;
 }
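A sketch of the varargs overload; a null first argument simply yields the second:

 String[] xs = {"a", "b"};
 String[] ys = append(xs, "c", "d"); // {"a", "b", "c", "d"}
 String[] zs = append(null, "e");    // {"e"}: b is returned directly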