예제 #1
0
파일: DataInfo.java 프로젝트: liaochy/h2o-3
  /**
   * Extract (dense) rows from given chunks, one Vec at a time - should be slightly faster than
   * per-row extraction.
   *
   * <p>A row is flagged {@code bad} when its offset, weight or response is NaN, or when a
   * predictor is missing and {@code _skipMissing} is set. Each column pass skips rows that are
   * already flagged bad.
   *
   * @param chunks - chunk of dataset
   * @return array of dense rows, one per row of {@code chunks[0]}
   */
  public final Row[] extractDenseRowsVertical(Chunk[] chunks) {
    Row[] rows = new Row[chunks[0]._len];

    // Allocate every row up front and fill in the per-row metadata (id, offset, weight).
    for (int i = 0; i < rows.length; ++i) {
      rows[i] = new Row(false, _nums, _cats, _responses, 0);
      rows[i].rid = chunks[0].start() + i;
      if (_offset) {
        rows[i].offset = chunks[offsetChunkId()].atd(i);
        if (Double.isNaN(rows[i].offset)) rows[i].bad = true;
      }
      if (_weights) {
        rows[i].weight = chunks[weightChunkId()].atd(i);
        if (Double.isNaN(rows[i].weight)) rows[i].bad = true;
      }
    }
    // categoricals
    for (int i = 0; i < _cats; ++i) {
      for (int r = 0; r < chunks[0]._len; ++r) {
        Row row = rows[r];
        if (row.bad) continue;
        if (chunks[i].isNA(r)) {
          if (_skipMissing) {
            row.bad = true;
          } else {
            // missing value turns into extra (last) factor
            row.binIds[row.nBins++] = _catOffsets[i + 1] - 1;
          }
        } else {
          int c = getCategoricalId(i, (int) chunks[i].at8(r));
          if (c >= 0) row.binIds[row.nBins++] = c;
        }
      }
    }
    int numStart = numStart();
    // generic numbers
    for (int cid = 0; cid < _nums; ++cid) {
      Chunk c = chunks[_cats + cid];
      for (int r = 0; r < c._len; ++r) {
        Row row = rows[r];
        if (row.bad) continue;
        if (c.isNA(r)) row.bad = _skipMissing;
        double d = c.atd(r);
        if (_normMul != null && _normSub != null) { // either none or both
          d = (d - _normSub[cid]) * _normMul[cid];
        }
        // NOTE(review): the index is shifted by numStart() here; confirm the dense numVals array
        // is sized for that offset.
        row.numVals[numStart + cid] = d;
      }
    }
    // response(s)
    for (int i = 1; i <= _responses; ++i) {
      Chunk rChunk = chunks[responseChunkId()];
      // Use one slot for the write, the normalization and the NaN check. Previously the raw value
      // went to response[length - i] while response[i - 1] was normalized, which only agreed
      // for a single response.
      final int slot = i - 1;
      for (int r = 0; r < chunks[0]._len; ++r) {
        Row row = rows[r];
        if (row.bad) continue;
        double resp = rChunk.atd(r);
        if (_normRespMul != null) {
          resp = (resp - _normRespSub[slot]) * _normRespMul[slot];
        }
        row.response[slot] = resp;
        if (Double.isNaN(resp)) row.bad = true;
      }
    }
    return rows;
  }
예제 #2
0
 /** Sets {@code newW} to 0 for every row whose stratum differs from {@code stratumToKeep}. */
 @Override
 public void map(Chunk strata, Chunk newW) {
   for (int row = 0; row < strata._len; row++) {
     final int stratum = (int) strata.at8(row);
     if (stratum == stratumToKeep) {
       continue;
     }
     newW.set(row, 0);
   }
 }
예제 #3
0
    /**
     * Accumulates per-column sums into {@code _ysum} over the rows of this chunk set. A row with a
     * NaN in any column is dropped entirely and counted once in {@code _NACount}.
     */
    @Override
    public void map(Chunk cs[]) {
      final int width = cs.length;
      _ysum = new double[width];

      // Scratch buffer reused for every row.
      final double[] rowVals = new double[width];
      final int rowCount = cs[0]._len;
      for (int r = 0; r < rowCount; r++) {
        Arrays.fill(rowVals, 0);
        boolean complete = true;
        // Stop scanning the row at the first NaN.
        for (int col = 0; complete && col < width; col++) {
          final double v = cs[col].atd(r);
          if (Double.isNaN(v)) {
            _NACount++;
            complete = false;
          } else {
            rowVals[col] = v;
          }
        }
        if (complete) {
          ArrayUtils.add(_ysum, rowVals);
        }
      }
    }
예제 #4
0
파일: DRF.java 프로젝트: liangexiang/h2o-3
 /**
  * Flags rows as {@code OUT_OF_BAG} when the per-chunk RNG draw is at or above {@code _rate}, or
  * when the response value is NaN.
  */
 @Override
 public void map(Chunk nids, Chunk ys) {
   final Random rng = _tree.rngForChunk(nids.cidx());
   for (int r = 0; r < nids._len; r++) {
     // Draw once per row so the random sequence does not depend on the response values.
     final boolean sampledOut = rng.nextFloat() >= _rate;
     if (sampledOut || Double.isNaN(ys.atd(r))) {
       nids.set(r, ScoreBuildHistogram.OUT_OF_BAG); // row is ignored by sampling
     }
   }
 }
예제 #5
0
 /** Adds to {@code sum} the weight of every row whose value in {@code c} is not NA. */
 @Override
 public void map(Chunk c, Chunk w) {
   for (int row = 0; row < c.len(); row++) {
     if (c.isNA(row)) {
       continue;
     }
     // Weights are not validated: small fractional weights are accepted for now (same as
     // wtd.quantile in R), although results are probably not very good.
     sum += w.atd(row);
   }
 }
예제 #6
0
 /**
  * Accumulates the distribution's initial numerator and denominator into {@code _num} and
  * {@code _denom}, skipping rows with an NA response or a zero weight.
  */
 @Override
 public void map(Chunk response, Chunk weight, Chunk offset) {
   for (int row = 0; row < response._len; row++) {
     if (!response.isNA(row)) {
       final double wt = weight.atd(row);
       if (wt != 0) {
         final double actual = response.atd(row);
         final double off = offset.atd(row);
         _num += _dist.initFNum(wt, off, actual);
         _denom += _dist.initFDenom(wt, off);
       }
     }
   }
 }
예제 #7
0
    /**
     * Accumulates the centered cross-products between y columns (the first {@code _ymeans.length}
     * chunks) and x columns (the following {@code _xmeans.length} chunks) into {@code _covs}. Rows
     * with a NaN in any of those columns are left out.
     */
    @Override
    public void map(Chunk cs[]) {
      final int nx = _xmeans.length;
      final int ny = _ymeans.length;
      // Scratch buffers are reused across rows to limit garbage.
      final double[] xRow = new double[nx];
      final double[] yRow = new double[ny];
      _covs = new double[ny][nx];
      final int rowCount = cs[0]._len;

      for (int r = 0; r < rowCount; r++) {
        Arrays.fill(xRow, 0);
        Arrays.fill(yRow, 0);

        // Read the y values first, then the x values; stop at the first NaN.
        boolean complete = true;
        for (int j = 0; complete && j < ny; j++) {
          final double v = cs[j].atd(r);
          if (Double.isNaN(v)) {
            complete = false;
          } else {
            yRow[j] = v;
          }
        }
        for (int k = 0; complete && k < nx; k++) {
          final double v = cs[k + ny].atd(r);
          if (Double.isNaN(v)) {
            complete = false;
          } else {
            xRow[k] = v;
          }
        }
        if (!complete) {
          continue;
        }

        // The row has no NaN: add its contribution to every (y, x) pair.
        for (int j = 0; j < ny; j++) {
          final double[] covRow = _covs[j];
          final double yDev = yRow[j] - _ymeans[j];
          for (int k = 0; k < nx; k++) {
            covRow[k] += (xRow[k] - _xmeans[k]) * yDev;
          }
        }
      }
    }
예제 #8
0
파일: DRF.java 프로젝트: liangexiang/h2o-3
 /**
  * Seeds the work columns from the response: for classifiers the work column of the row's class
  * is set to 1, otherwise work column 0 receives the response cast to float. NA responses are
  * skipped.
  */
 @Override
 public void map(Chunk chks[]) {
   final Chunk resp = chk_resp(chks);
   for (int row = 0; row < resp._len; row++) {
     if (!resp.isNA(row)) {
       if (!isClassifier()) {
         final float target = (float) resp.atd(row);
         chk_work(chks, 0).set(row, target);
       } else {
         final int klass = (int) resp.at8(row);
         chk_work(chks, klass).set(row, 1L);
       }
     }
   }
 }
예제 #9
0
    /**
     * Accumulates column sums of the y columns (first {@code _ncoly} chunks) into {@code _ysum}
     * and of the x columns (next {@code _ncolx} chunks) into {@code _xsum}. A row with a NaN in
     * any of those columns is dropped and counted once in {@code _NACount}.
     */
    @Override
    public void map(Chunk cs[]) {
      _xsum = new double[_ncolx];
      _ysum = new double[_ncoly];

      // Scratch buffers are reused across rows to limit garbage.
      final double[] xRow = new double[_ncolx];
      final double[] yRow = new double[_ncoly];

      final int rowCount = cs[0]._len;
      for (int r = 0; r < rowCount; r++) {
        Arrays.fill(xRow, 0);
        Arrays.fill(yRow, 0);

        // Read the y values first, then the x values; stop at the first NaN.
        boolean complete = true;
        for (int j = 0; complete && j < _ncoly; j++) {
          final double v = cs[j].atd(r);
          if (Double.isNaN(v)) {
            _NACount++;
            complete = false;
          } else {
            yRow[j] = v;
          }
        }
        for (int k = 0; complete && k < _ncolx; k++) {
          final double v = cs[k + _ncoly].atd(r);
          if (Double.isNaN(v)) {
            _NACount++;
            complete = false;
          } else {
            xRow[k] = v;
          }
        }

        if (complete) {
          ArrayUtils.add(_xsum, xRow);
          ArrayUtils.add(_ysum, yRow);
        }
      }
    }
예제 #10
0
 /**
  * For each x column (chunks 1..n) stores in {@code _covs} the sum over rows of
  * {@code (x - xmean) * (y - _ymean)}, where y is chunk 0. NaN values are not filtered.
  */
 @Override
 public void map(Chunk cs[]) {
   final int nx = cs.length - 1;
   final Chunk yChunk = cs[0];
   final int rowCount = yChunk._len;
   _covs = new double[nx];
   for (int j = 0; j < nx; j++) {
     final Chunk xChunk = cs[j + 1];
     final double xMean = _xmeans[j];
     double acc = 0;
     for (int r = 0; r < rowCount; r++) {
       final double dx = xChunk.atd(r) - xMean;
       final double dy = yChunk.atd(r) - _ymean;
       acc += dx * dy;
     }
     _covs[j] = acc;
   }
 }
예제 #11
0
파일: DataInfo.java 프로젝트: liaochy/h2o-3
  /**
   * Fills {@code row} with the dense representation of one row of the given chunks.
   *
   * <p>The passed-in row is overwritten in place and the same instance is returned. The method
   * returns early, leaving the remaining fields as they were, when the row weight is 0, when
   * {@code _skipMissing} is set and any chunk has an NA at this row, or when a response value is
   * NaN. The last two cases also set {@code row.bad}.
   *
   * @param chunks chunks of the dataset
   * @param rid row index passed to the chunk accessors; {@code chunks[0].start()} is added to
   *     obtain {@code row.rid}
   * @param row row object to populate
   * @return the same {@code row} instance
   */
  public final Row extractDenseRow(Chunk[] chunks, int rid, Row row) {
    row.bad = false;
    row.rid = rid + chunks[0].start();
    // NOTE(review): when _weights is false, row.weight keeps whatever value the row already had.
    if (_weights) row.weight = chunks[weightChunkId()].atd(rid);
    if (row.weight == 0) return row; // zero-weight rows are not extracted any further
    if (_skipMissing)
      for (Chunk c : chunks)
        if (c.isNA(rid)) {
          row.bad = true;
          return row;
        }
    // categoricals: collect the active bin ids
    int nbins = 0;
    for (int i = 0; i < _cats; ++i) {
      if (chunks[i].isNA(rid)) {
        if (_imputeMissing) {
          // impute the missing level with the column's stored mode
          int c = getCategoricalId(i, _catModes[i]);
          if (c >= 0) row.binIds[nbins++] = c;
        } else // TODO: What if missingBucket = false?
        row.binIds[nbins++] =
              _catOffsets[i + 1] - 1; // missing value turns into extra (last) factor
      } else {
        int c = getCategoricalId(i, (int) chunks[i].at8(rid));
        if (c >= 0) row.binIds[nbins++] = c;
      }
    }
    row.nBins = nbins;
    // numerics: optional mean imputation followed by optional normalization
    final int n = _nums;
    for (int i = 0; i < n; ++i) {
      double d = chunks[_cats + i].atd(rid); // can be NA if skipMissing() == false
      if (_imputeMissing && Double.isNaN(d)) d = _numMeans[i];
      if (_normMul != null && _normSub != null) d = (d - _normSub[i]) * _normMul[i];
      row.numVals[i] = d;
    }
    // response(s): every slot is read from the same response chunk
    for (int i = 0; i < _responses; ++i) {
      row.response[i] = chunks[responseChunkId()].atd(rid);
      if (_normRespMul != null)
        row.response[i] = (row.response[i] - _normRespSub[i]) * _normRespMul[i];
      if (Double.isNaN(row.response[i])) {
        row.bad = true;
        return row;
      }
    }
    // the offset is read last, so it is not set on any of the early-return paths above
    if (_offset) row.offset = chunks[offsetChunkId()].atd(rid);

    return row;
  }
예제 #12
0
파일: MRUtils.java 프로젝트: Jrobinso09/h2o
 /**
  * Global redistribution of a Frame (balancing of chunks), done by calling process (all-to-one +
  * one-to-all).
  *
  * <p>Redistribution only happens when the first Vec has fewer than {@code splits} chunks or
  * {@code shuffle} is set, and the frame has more than {@code splits} rows; otherwise the input
  * frame is returned as is. Shuffling permutes the destination chunk index of each block of rows;
  * rows inside a destination chunk keep their relative order.
  *
  * @param fr Input frame
  * @param splits number of output chunks created for every column
  * @param seed RNG seed used to shuffle the destination chunk indices
  * @param local not read by this method
  * @param shuffle whether to shuffle the data globally
  * @return a new frame with the redistributed Vecs, or {@code fr} itself when nothing was done
  */
 public static Frame shuffleAndBalance(
     final Frame fr, int splits, long seed, final boolean local, final boolean shuffle) {
   if ((fr.vecs()[0].nChunks() < splits || shuffle) && fr.numRows() > splits) {
     Vec[] vecs = fr.vecs().clone();
     Log.info("Load balancing dataset, splitting it into up to " + splits + " chunks.");
     // idx maps the natural destination chunk index to a shuffled one (shuffle mode only)
     long[] idx = null;
     if (shuffle) {
       idx = new long[splits];
       for (int r = 0; r < idx.length; ++r) idx[r] = r;
       Utils.shuffleArray(idx, seed);
     }
     Key keys[] = new Vec.VectorGroup().addVecs(vecs.length);
     // rows per destination chunk, rounded up
     final long rows_per_new_chunk = (long) (Math.ceil((double) fr.numRows() / splits));
     // loop over cols (same indexing for each column)
     Futures fs = new Futures();
     for (int col = 0; col < vecs.length; col++) {
       AppendableVec vec = new AppendableVec(keys[col]);
       // create outgoing chunks for this col
       NewChunk[] outCkg = new NewChunk[splits];
       for (int i = 0; i < splits; ++i) outCkg[i] = new NewChunk(vec, i);
       // loop over all incoming chunks
       for (int ckg = 0; ckg < vecs[col].nChunks(); ckg++) {
         final Chunk inCkg = vecs[col].chunkForChunkIdx(ckg);
         // loop over local rows of incoming chunks (fast path)
         for (int row = 0; row < inCkg._len; ++row) {
           int outCkgIdx =
               (int) ((inCkg._start + row) / rows_per_new_chunk); // destination chunk idx
           if (shuffle)
             outCkgIdx = (int) (idx[outCkgIdx]); // shuffle: choose a different output chunk
           assert (outCkgIdx >= 0 && outCkgIdx < splits);
           outCkg[outCkgIdx].addNum(inCkg.at0(row));
         }
       }
       for (int i = 0; i < outCkg.length; ++i) outCkg[i].close(i, fs);
       Vec t = vec.close(fs);
       t._domain = vecs[col]._domain; // carry the source column's domain over to the new Vec
       vecs[col] = t;
     }
     fs.blockForPending();
     Log.info("Load balancing done.");
     return new Frame(fr.names(), vecs);
   }
   return fr;
 }
예제 #13
0
파일: GBM.java 프로젝트: shjgiser/h2o
 /**
  * Fills the work columns with predictions. For classification each class column gets the
  * row's score divided by the score sum; when the sum overflows to infinity, infinite scores map
  * to 1 and the rest to 0. For regression work column 0 gets the tree column value cast to float.
  */
 @Override
 public void map(Chunk chks[]) {
   final Chunk resp = chk_resp(chks);
   if (_nclass <= 1) { // Regression
     final Chunk treeSums = chk_tree(chks, 0); // Prior tree sums
     final Chunk preds = chk_work(chks, 0); // Predictions
     for (int r = 0; r < resp._len; r++) {
       preds.set0(r, (float) treeSums.at0(r));
     }
     return;
   }

   // Classification
   final float scores[] = new float[_nclass + 1];
   for (int r = 0; r < resp._len; r++) {
     final float total = score1(chks, scores, r);
     // Overflow happens for constant responses.
     final boolean overflow = Float.isInfinite(total);
     for (int k = 0; k < _nclass; k++) {
       final Chunk work = chk_work(chks, k);
       final float prob;
       if (overflow) {
         prob = Float.isInfinite(scores[k + 1]) ? 1.0f : 0.0f;
       } else {
         prob = scores[k + 1] / total; // Save as a probability distribution
       }
       work.set0(r, prob);
     }
   }
 }
예제 #14
0
파일: AUC.java 프로젝트: Jrobinso09/h2o
 /**
  * Builds one 2x2 confusion matrix per threshold in {@code _thresh} from the actual labels
  * ({@code ca}) and the predicted values ({@code cp}). Rows with a missing actual or a missing
  * prediction are skipped.
  */
 @Override
 public void map(Chunk ca, Chunk cp) {
   _cms = new hex.ConfusionMatrix[_thresh.length];
   for (int t = 0; t < _cms.length; t++) {
     _cms[t] = new hex.ConfusionMatrix(2);
   }
   final int rowCount = Math.min(ca._len, cp._len);
   for (int r = 0; r < rowCount; r++) {
     if (!ca.isNA0(r)) {
       final int actual = (int) ca.at80(r);
       assert (actual == 0 || actual == 1) : "Invalid values in vactual: must be binary (0 or 1).";
       // Some models predict NaN; such rows are ignored.
       if (!cp.isNA0(r)) {
         final double prob = cp.at0(r);
         for (int t = 0; t < _cms.length; t++) {
           if (prob >= _thresh[t]) {
             _cms[t].add(actual, 1);
           } else {
             _cms[t].add(actual, 0);
           }
         }
       }
     }
   }
 }
예제 #15
0
파일: GBM.java 프로젝트: shjgiser/h2o
    /**
     * Replaces predictions in the work columns with residuals. For classification each class
     * column with a non-zero {@code _distribution} entry becomes (1 if the row's class, else 0)
     * minus the prediction, and rows with a missing response are skipped. For regression work
     * column 0 becomes response minus prediction.
     */
    @Override
    public void map(Chunk chks[]) {
      final Chunk resp = chk_resp(chks);
      if (_nclass <= 1) { // Regression
        final Chunk work = chk_work(chks, 0); // Prediction==>Residuals
        for (int r = 0; r < resp._len; r++) {
          final double residual = resp.at0(r) - work.at0(r);
          work.set0(r, (float) residual);
        }
        return;
      }

      // Classification
      for (int r = 0; r < resp._len; r++) {
        if (resp.isNA0(r)) {
          continue;
        }
        final int actual = (int) resp.at80(r); // zero-based response variable
        for (int k = 0; k < _nclass; k++) {
          if (_distribution[k] == 0) {
            continue;
          }
          final Chunk work = chk_work(chks, k);
          // Actual is 1 for the row's own class and 0 for all other classes.
          final float indicator = actual == k ? 1f : 0f;
          work.set0(r, indicator - (float) work.at0(r));
        }
      }
    }
예제 #16
0
 /**
  * Builds a weighted histogram of {@code chk} over {@code _nbins} bins of width {@code _step}
  * starting at {@code _lb}, tracking the min and max value seen in each bin. Rows with zero
  * weight, NaN values, or values outside the binned range are ignored.
  */
 @Override
 public void map(Chunk chk, Chunk weight) {
   _bins = new double[_nbins];
   _mins = new double[_nbins];
   _maxs = new double[_nbins];
   Arrays.fill(_mins, Double.MAX_VALUE);
   Arrays.fill(_maxs, -Double.MAX_VALUE);
   for (int r = 0; r < chk._len; r++) {
     final double wt = weight.atd(r);
     if (wt == 0) {
       continue;
     }
     final double val = chk.atd(r);
     if (Double.isNaN(val)) { // na.rm=true
       continue;
     }
     final double pos = (val - _lb) / _step;
     final boolean inRange = 0.0 <= pos && pos < _bins.length;
     if (!inRange) {
       continue;
     }
     final int bin = (int) pos;
     if (_bins[bin] == 0) {
       // First value in this bin: it is both the min and the max.
       _mins[bin] = val;
       _maxs[bin] = val;
     } else {
       if (val < _mins[bin]) {
         _mins[bin] = val;
       }
       if (val > _maxs[bin]) {
         _maxs[bin] = val;
       }
     }
     _bins[bin] += wt; // Bump row counts by row weight
   }
 }
예제 #17
0
 /**
  * Writes this chunk into {@code _v} at index {@code _chunkOffset + cs.cidx()}. Without a
  * {@code _cmap} the chunk is deep-copied; with one, every non-NA value is translated through
  * {@code _cmap} into a new chunk.
  */
 @Override
 public void map(Chunk cs) {
   final int targetIdx = _chunkOffset + cs.cidx();
   final Key targetKey = Vec.chunkKey(_v._key, targetIdx);
   if (_cmap == null) {
     DKV.put(targetKey, cs.deepCopy(), _fs, true);
     return;
   }

   assert !cs.hasFloat()
       : "Input chunk (" + cs.getClass() + ") has float, but is expected to be categorical";
   final NewChunk remapped = new NewChunk(_v, targetIdx);
   // Translate each value into the new domain; NAs stay NAs.
   for (int row = 0; row < cs._len; row++) {
     if (!cs.isNA(row)) {
       remapped.addNum(_cmap[(int) cs.at8(row)], 0);
     } else {
       remapped.addNA();
     }
   }
   remapped.close(_fs);
 }
예제 #18
0
파일: GBM.java 프로젝트: shjgiser/h2o
 /**
  * For every tree/class, routes each row to its leaf node, records that leaf in the node-id
  * column, and accumulates per-leaf sums into {@code _gss} and {@code _rss}.
  *
  * <p>{@code _rss[k]} sums the residuals of the rows in each leaf. {@code _gss[k]} sums
  * {@code |res| * (1 - |res|)} when {@code _nclass > 1} and counts the rows otherwise. Both
  * arrays are indexed by {@code leafnid - _leafs[k]}.
  */
 @Override
 public void map(Chunk[] chks) {
   _gss = new double[_nclass][];
   _rss = new double[_nclass][];
   // For all tree/klasses
   for (int k = 0; k < _nclass; k++) {
     final DTree tree = _trees[k];
     final int leaf = _leafs[k];
     if (tree == null) continue; // Empty class is ignored
     // A leaf-biased array of all active Tree leaves.
     final double gs[] = _gss[k] = new double[tree._len - leaf];
     final double rs[] = _rss[k] = new double[tree._len - leaf];
     final Chunk nids = chk_nids(chks, k); // Node-ids  for this tree/class
     final Chunk ress = chk_work(chks, k); // Residuals for this tree/class
     // If we have all constant responses, then we do not split even the
     // root and the residuals should be zero.
     if (tree.root() instanceof LeafNode) continue;
     for (int row = 0; row < nids._len; row++) { // For all rows
       int nid = (int) nids.at80(row); // Get Node to decide from
       if (nid < 0) continue; // Missing response
       if (tree.node(nid) instanceof UndecidedNode) // If we bottomed out the tree
       nid = tree.node(nid)._pid; // Then take parent's decision
       DecidedNode dn = tree.decided(nid); // Must have a decision point
       if (dn._split._col == -1) // Unable to decide?
       dn = tree.decided(nid = dn._pid); // Then take parent's decision
       int leafnid = dn.ns(chks, row); // Decide down to a leafnode
       assert leaf <= leafnid && leafnid < tree._len;
       assert tree.node(leafnid) instanceof LeafNode;
       // Note: I can tell which leaf/region I end up in, but I do not care for
       // the prediction presented by the tree.  For GBM, we compute the
       // sum-of-residuals (and sum/abs/mult residuals) for all rows in the
       // leaf, and get our prediction from that.
       nids.set0(row, leafnid);
       assert !ress.isNA0(row);
       double res = ress.at0(row);
       double ares = Math.abs(res);
       // Classification accumulates |res|*(1-|res|); regression just counts the rows in the leaf.
       gs[leafnid - leaf] += _nclass > 1 ? ares * (1 - ares) : 1;
       rs[leafnid - leaf] += res;
     }
   }
 }
예제 #19
0
 /** Delegates to the two-chunk {@code map} using a constant chunk of value 1 as the second argument. */
 @Override
 public void map(Chunk chk) {
   final C0DChunk unitWeights = new C0DChunk(1, chk.len());
   map(chk, unitWeights);
 }
예제 #20
0
  /**
   * Extracts the values of each row, applies normalization ({@code _normSub}/{@code _normMul}) to
   * numerics and responses when configured, adds the appropriate offsets to categoricals, and
   * hands each row to {@code processRow}.
   *
   * <p>Rows are skipped when any chunk has an NA at that row, when the row index falls into the
   * held-out fold ({@code r % _nfolds == _foldId}), or at random when {@code _useFraction < 1}.
   * Rows are visited in shuffled order when {@code _shuffle} is set.
   *
   * @param chunks input chunks: categoricals first, then numerics, then responses
   * @param outputs output chunks passed through to {@code processRow}; may be null or empty
   * @throws JobCancelledException if the owning job is no longer running
   */
  @Override
  public final void map(Chunk[] chunks, NewChunk[] outputs) {
    if (_job != null && _job.self() != null && !Job.isRunning(_job.self()))
      throw new JobCancelledException();
    final int nrows = chunks[0]._len;
    final long offset = chunks[0]._start;
    chunkInit();
    // Scratch buffers reused for every row of this chunk.
    double[] nums = MemoryManager.malloc8d(_dinfo._nums);
    int[] cats = MemoryManager.malloc4(_dinfo._cats);
    double[] response = MemoryManager.malloc8d(_dinfo._responses);
    int start = 0;
    int end = nrows;

    // NOTE(review): contiguous is hard-coded to false, so the contiguous-window branch below never
    // runs. If it were enabled, nextInt(nrows - howmany) would be called with 0 when
    // howmany == nrows - confirm before turning it on.
    boolean contiguous = false;
    Random skip_rng = null; // random generator for skipping rows
    if (_useFraction < 1.0) {
      // seeded from a fresh Random, so the set of skipped rows differs between runs
      skip_rng = water.util.Utils.getDeterRNG(new Random().nextLong());
      if (contiguous) {
        final int howmany = (int) Math.ceil(_useFraction * nrows);
        if (howmany > 0) {
          start = skip_rng.nextInt(nrows - howmany);
          end = start + howmany;
        }
        assert (start < nrows);
        assert (end <= nrows);
      }
    }

    // Optional random visiting order over [start, end).
    long[] shuf_map = null;
    if (_shuffle) {
      shuf_map = new long[end - start];
      for (int i = 0; i < shuf_map.length; ++i) shuf_map[i] = start + i;
      Utils.shuffleArray(shuf_map, new Random().nextLong());
    }

    OUTER:
    for (int rr = start; rr < end; ++rr) {
      final int r = shuf_map != null ? (int) shuf_map[rr - start] : rr;
      // skip rows of the held-out fold, and a random share of rows when sub-sampling
      if ((_dinfo._nfolds > 0 && (r % _dinfo._nfolds) == _dinfo._foldId)
          || (skip_rng != null && skip_rng.nextFloat() > _useFraction)) continue;
      for (Chunk c : chunks) if (c.isNA0(r)) continue OUTER; // skip rows with NAs!
      int i = 0, ncats = 0;
      // categoricals: level 0 produces no entry; other levels are shifted by the column offset
      for (; i < _dinfo._cats; ++i) {
        int c = (int) chunks[i].at80(r);
        if (c != 0) cats[ncats++] = c + _dinfo._catOffsets[i] - 1;
      }
      // numerics: every chunk between the categoricals and the trailing response chunks
      final int n = chunks.length - _dinfo._responses;
      for (; i < n; ++i) {
        double d = chunks[i].at0(r);
        if (_dinfo._normMul != null)
          d = (d - _dinfo._normSub[i - _dinfo._cats]) * _dinfo._normMul[i - _dinfo._cats];
        nums[i - _dinfo._cats] = d;
      }
      // responses: the last _responses chunks
      for (i = 0; i < _dinfo._responses; ++i) {
        response[i] = chunks[chunks.length - _dinfo._responses + i].at0(r);
        if (_dinfo._normRespMul != null)
          response[i] = (response[i] - _dinfo._normRespSub[i]) * _dinfo._normRespMul[i];
      }
      if (outputs != null && outputs.length > 0)
        processRow(offset + r, nums, ncats, cats, response, outputs);
      else processRow(offset + r, nums, ncats, cats, response);
    }
    chunkDone();
  }
예제 #21
0
파일: GBM.java 프로젝트: shjgiser/h2o
 /** Sets the node id to -1 in every class's node-id column for rows with a missing response. */
 @Override
 public void map(Chunk chks[]) {
   final Chunk resp = chk_resp(chks);
   for (int r = 0; r < resp._len; r++) {
     if (!resp.isNA0(r)) {
       continue;
     }
     for (int k = 0; k < _nclass; k++) {
       chk_nids(chks, k).set0(r, -1);
     }
   }
 }
예제 #22
0
 /**
  * Extract (sparse) rows from given chunks. Note: 0 remains 0 - _normSub of DataInfo isn't used
  * (mean shift during standardization is not reverted) - UNLESS offset is specified (for GLM only)
  * Essentially turns the dataset 90 degrees.
  *
  * <p>A row is flagged {@code bad} when its offset, weight or response is NaN, or when
  * {@code _skipMissing} is set and any predictor is missing. Each column pass skips rows that are
  * already flagged bad.
  *
  * @param chunks - chunk of dataset
  * @return array of sparse rows, one per row of {@code chunks[0]}
  */
 public final Row[] extractSparseRows(Chunk[] chunks) {
   Row[] rows = new Row[chunks[0]._len];
   long startOff = chunks[0].start();
   for (int i = 0; i < rows.length; ++i) {
     // if sparse, _nums is the correct number of nonzero values! i.e., do not use numNums()
     rows[i] = new Row(true, Math.min(_nums, 16), _cats, _responses, i, startOff);
     rows[i].rid = chunks[0].start() + i;
     if (_offset) {
       rows[i].offset = chunks[offsetChunkId()].atd(i);
       if (Double.isNaN(rows[i].offset)) rows[i].bad = true;
     }
     if (_weights) {
       rows[i].weight = chunks[weightChunkId()].atd(i);
       if (Double.isNaN(rows[i].weight)) rows[i].bad = true;
     }
     if (_skipMissing) {
       int N = _cats + _nums;
       for (int c = 0; c < N; ++c) if (chunks[c].isNA(i)) rows[i].bad = true;
     }
   }
   // categoricals: a missing level is replaced by the column's stored mode
   for (int i = 0; i < _cats; ++i) {
     for (int r = 0; r < chunks[0]._len; ++r) {
       Row row = rows[r];
       if (row.bad) continue;
       int cid = getCategoricalId(i, chunks[i].isNA(r) ? _catModes[i] : (int) chunks[i].at8(r));
       if (cid >= 0) row.binIds[row.nBins++] = cid;
     }
   }
   // generic numbers + interactions
   int interactionOffset = 0;
   for (int cid = 0; cid < _nums; ++cid) {
     Chunk c = chunks[_cats + cid];
     int oldRow = -1;
     if (c instanceof InteractionWrappedVec.InteractionWrappedChunk) {
       // for each row, only 1 value in an interaction is 'hot'; all other values are off (i.e.,
       // are 0). The vec is "vertically" dense and "horizontally" sparse (i.e., every row has
       // one, and only one, value)
       for (int r = 0; r < c._len; ++r) {
         Row row = rows[r];
         if (row.bad) continue;
         if (c.isNA(r)) row.bad = _skipMissing;
         // the "virtual" offset into the hot-expanded interaction
         int cidVirtualOffset = getInteractionOffset(chunks, _cats + cid, r);
         // FIXME: if this produces a "true" NA then should sub with mean? with?
         row.addNum(_numOffsets[cid] + cidVirtualOffset, c.atd(r));
       }
       interactionOffset += nextNumericIdx(cid);
     } else {
       // visit only the rows the chunk reports as non-zero
       for (int r = c.nextNZ(-1); r < c._len; r = c.nextNZ(r)) {
         if (c.atd(r) == 0) continue;
         assert r > oldRow;
         oldRow = r;
         Row row = rows[r];
         if (row.bad) continue;
         if (c.isNA(r)) row.bad = _skipMissing;
         double d = c.atd(r);
         if (Double.isNaN(d)) d = _numMeans[cid];
         if (_normMul != null) d *= _normMul[interactionOffset];
         row.addNum(_numOffsets[cid], d);
       }
       interactionOffset++;
     }
   }
   // response(s)
   for (int i = 1; i <= _responses; ++i) {
     int rid = responseChunkId(i - 1);
     Chunk rChunk = chunks[rid];
     // Check for NaN in the slot that was just written. Previously the check read
     // response[length - i], which is a different slot whenever there are several responses.
     final int slot = i - 1;
     for (int r = 0; r < chunks[0]._len; ++r) {
       Row row = rows[r];
       if (row.bad) continue;
       double resp = rChunk.atd(r);
       if (_normRespMul != null) {
         resp = (resp - _normRespSub[slot]) * _normRespMul[slot];
       }
       row.response[slot] = resp;
       if (Double.isNaN(resp)) row.bad = true;
     }
   }
   return rows;
 }
예제 #23
0
파일: DRF.java 프로젝트: liangexiang/h2o-3
      /**
       * Collects on-the-fly predictions for out-of-bag rows.
       *
       * <p>For each row flagged out-of-bag in the first node-id column, the prediction of every
       * class tree is added to the matching tree column and the out-of-bag counter is bumped.
       * All node-id columns are reset to 0. When {@code importance} is set, the method also
       * updates {@code rightVotes} (classification) or {@code sse} (regression) and
       * {@code allRows} for out-of-bag rows with a non-missing response.
       */
      @Override
      public void map(Chunk[] chks) {
        final Chunk y = importance ? chk_resp(chks) : null; // Response
        final double[] rpred = importance ? new double[1 + _nclass] : null; // Row prediction
        final double[] rowdata = importance ? new double[_ncols] : null; // Pre-allocated row data
        final Chunk oobt = chk_oobt(chks); // Out-of-bag rows counter over all trees
        // Iterate over all rows
        for (int row = 0; row < oobt._len; row++) {
          // The out-of-bag flag is read from the node-id column of class 0 only
          final boolean wasOOBRow = ScoreBuildHistogram.isOOBRow((int) chk_nids(chks, 0).at8(row));

          // For all tree (i.e., k-classes)
          for (int k = 0; k < _nclass; k++) {
            final DTree tree = _trees[k];
            if (tree == null) continue; // Empty class is ignored
            final Chunk nids = chk_nids(chks, k); // Node-ids  for this tree/class
            int nid = (int) nids.at8(row); // Get Node to decide from
            // Update only out-of-bag rows
            // This is out-of-bag row - but we would like to track on-the-fly prediction for the row
            if (wasOOBRow) {
              final Chunk ct =
                  chk_tree(chks, k); // k-tree working column holding votes for given row
              nid = ScoreBuildHistogram.oob2Nid(nid);
              if (tree.node(nid) instanceof UndecidedNode) // If we bottomed out the tree
              nid = tree.node(nid).pid(); // Then take parent's decision
              int leafnid;
              if (tree.root() instanceof LeafNode) {
                leafnid = 0; // the tree never split: use node 0 as the leaf
              } else {
                DecidedNode dn = tree.decided(nid); // Must have a decision point
                if (dn._split.col() == -1) // Unable to decide?
                dn = tree.decided(tree.node(nid).pid()); // Then take parent's decision
                leafnid = dn.ns(chks, row); // Decide down to a leafnode
              }
              // Setup Tree(i) - on the fly prediction of i-tree for row-th row
              //   - for classification: cumulative number of votes for this row
              //   - for regression: cumulative sum of prediction of each tree - has to be
              // normalized by number of trees
              double prediction =
                  ((LeafNode) tree.node(leafnid))
                      .pred(); // Prediction for this k-class and this row
              if (importance)
                rpred[1 + k] = (float) prediction; // for both regression and classification
              ct.set(row, (float) (ct.atd(row) + prediction));
            }
            // reset help column for this row and this k-class
            nids.set(row, 0);
          } /* end of k-trees iteration */
          // For this tree this row is out-of-bag - i.e., a tree voted for this row
          if (wasOOBRow) oobt.set(row, oobt.atd(row) + 1); // track number of trees
          if (importance) {
            if (wasOOBRow && !y.isNA(row)) {
              if (isClassifier()) {
                int treePred = getPrediction(rpred, data_row(chks, row, rowdata), _threshold);
                int actuPred = (int) y.at8(row);
                if (treePred == actuPred) rightVotes++; // No miss !
              } else { // regression
                double treePred = rpred[1];
                double actuPred = y.atd(row);
                sse += (actuPred - treePred) * (actuPred - treePred);
              }
              allRows++;
            }
          }
        }
      }
예제 #24
0
 /**
  * Compares actual ({@code ca}) against predicted ({@code cp}) values. With more than one class a
  * confusion matrix is filled, using index {@code _c_len} as the slot for missing values;
  * otherwise the squared error is accumulated into {@code _mse} and the row count into
  * {@code _count}, skipping rows where either value is missing.
  */
 @Override
 public void map(Chunk ca, Chunk cp) {
   if (_c_len <= 1) {
     // regression
     _cm = null;
     _mse = 0;
     assert (ca._len == cp._len);
     final int rowCount = ca._len;
     for (int r = 0; r < rowCount; r++) {
       if (!ca.isNA0(r) && !cp.isNA0(r)) { // TODO: Improve
         final double actual = ca.at0(r);
         final double predicted = cp.at0(r);
         _mse += (predicted - actual) * (predicted - actual);
         _count++;
       }
     }
     return;
   }

   // classification
   _cm = new long[_c_len + 1][_c_len + 1];
   // Handle different lengths, but the vectors should have been rejected already.
   final int shared = Math.min(ca._len, cp._len);
   for (int r = 0; r < shared; r++) {
     final int actual = ca.isNA0(r) ? _c_len : (int) ca.at80(r);
     final int predicted = cp.isNA0(r) ? _c_len : (int) cp.at80(r);
     _cm[actual][predicted]++;
   }
   // Rows present in only one of the chunks count against the missing slot of the other.
   for (int r = shared; r < ca._len; r++) {
     final int actual = ca.isNA0(r) ? _c_len : (int) ca.at80(r);
     _cm[actual][_c_len]++;
   }
   for (int r = shared; r < cp._len; r++) {
     final int predicted = cp.isNA0(r) ? _c_len : (int) cp.at80(r);
     _cm[_c_len][predicted]++;
   }
 }
예제 #25
0
파일: DataInfo.java 프로젝트: liaochy/h2o-3
 /**
  * Builds a {@code Rows} view over the chunks, requesting the sparse variant when more than half
  * of the chunks report themselves as sparse.
  *
  * @param chks chunks of the dataset
  * @return result of {@code rows(chks, sparse)}
  */
 public Rows rows(Chunk[] chks) {
   int sparseCount = 0;
   for (int i = 0; i < chks.length; i++) {
     if (chks[i].isSparse()) {
       sparseCount++;
     }
   }
   final boolean mostlySparse = sparseCount > (chks.length >> 1);
   return rows(chks, mostlySparse);
 }
예제 #26
0
파일: DataInfo.java 프로젝트: liaochy/h2o-3
  /**
   * Extract (sparse) rows from given chunks. Note: 0 remains 0 - _normSub of DataInfo isn't used
   * (mean shift during standardization is not reverted) - UNLESS offset is specified (for GLM only)
   * Essentially turns the dataset 90 degrees.
   *
   * <p>A row is flagged {@code bad} when its offset, weight or response is NaN, or when a
   * predictor is missing and {@code _skipMissing} is set. Each column pass skips rows that are
   * already flagged bad.
   *
   * @param chunks - chunk of dataset
   * @param offset - adjustment for 0s if running with on-the-fly standardization (i.e. zeros are
   *     not really zeros because of centering)
   * @return array of sparse rows, one per row of {@code chunks[0]}
   */
  public final Row[] extractSparseRows(Chunk[] chunks, double offset) {
    Row[] rows = new Row[chunks[0]._len];

    // Allocate every row up front and fill in the per-row metadata (id, offset, weight).
    for (int i = 0; i < rows.length; ++i) {
      rows[i] = new Row(true, Math.min(_nums, 16), _cats, _responses, offset);
      rows[i].rid = chunks[0].start() + i;
      if (_offset) {
        rows[i].offset = chunks[offsetChunkId()].atd(i);
        if (Double.isNaN(rows[i].offset)) rows[i].bad = true;
      }
      if (_weights) {
        rows[i].weight = chunks[weightChunkId()].atd(i);
        if (Double.isNaN(rows[i].weight)) rows[i].bad = true;
      }
    }
    // categoricals
    for (int i = 0; i < _cats; ++i) {
      for (int r = 0; r < chunks[0]._len; ++r) {
        Row row = rows[r];
        if (row.bad) continue;
        if (chunks[i].isNA(r)) {
          if (_skipMissing) {
            row.bad = true;
          } else {
            // missing value turns into extra (last) factor
            row.binIds[row.nBins++] = _catOffsets[i + 1] - 1;
          }
        } else {
          int c = getCategoricalId(i, (int) chunks[i].at8(r));
          if (c >= 0) row.binIds[row.nBins++] = c;
        }
      }
    }
    int numStart = numStart();
    // generic numbers: visit only the rows the chunk reports as non-zero
    for (int cid = 0; cid < _nums; ++cid) {
      Chunk c = chunks[_cats + cid];
      int oldRow = -1;
      for (int r = c.nextNZ(-1); r < c._len; r = c.nextNZ(r)) {
        if (c.atd(r) == 0) continue;
        assert r > oldRow;
        oldRow = r;
        Row row = rows[r];
        if (row.bad) continue;
        if (c.isNA(r)) row.bad = _skipMissing;
        double d = c.atd(r);
        // only the scale is applied here; the mean shift is not (see the method description)
        if (_normMul != null) d *= _normMul[cid];
        row.addNum(cid + numStart, d);
      }
    }
    // response(s)
    for (int i = 1; i <= _responses; ++i) {
      Chunk rChunk = chunks[responseChunkId()];
      // Use one slot for the write, the normalization and the NaN check. Previously the raw value
      // went to response[length - i] while response[i - 1] was normalized, which only agreed
      // for a single response.
      final int slot = i - 1;
      for (int r = 0; r < chunks[0]._len; ++r) {
        Row row = rows[r];
        if (row.bad) continue;
        double resp = rChunk.atd(r);
        if (_normRespMul != null) {
          resp = (resp - _normRespSub[slot]) * _normRespMul[slot];
        }
        row.response[slot] = resp;
        if (Double.isNaN(resp)) row.bad = true;
      }
    }
    return rows;
  }
예제 #27
0
파일: DRF.java 프로젝트: rohit2412/h2o
 /**
  * Walks every row of the chunk set and, for rows that are out-of-bag for the current k-class
  * trees, adds each tree's leaf prediction to the per-class working column and updates the
  * out-of-bag counter column. When {@code importance} is set, it additionally accumulates
  * {@code rightVotes} (classification) or {@code sse} (regression) together with {@code allRows}.
  * The node-id helper column is reset to 0 for every processed tree/row pair.
  *
  * @param chks chunks of the working frame
  */
 @Override
 public void map(Chunk[] chks) {
   // Scratch state that is only needed (and only allocated) when importance is requested.
   final Chunk y = importance ? chk_resp(chks) : null; // Response
   final float[] rpred = importance ? new float[1 + _nclass] : null; // Row prediction
   final double[] rowdata = importance ? new double[_ncols] : null; // Pre-allocated row data
   final Chunk oobt = chk_oobt(chks); // Out-of-bag rows counter over all trees

   for (int row = 0; row < oobt._len; row++) {
     boolean sawOob = false;

     // One tree per class.
     for (int cls = 0; cls < _nclass; cls++) {
       final DTree tree = _trees[cls];
       // Skip empty classes, and trees whose root never split (constant response).
       if (tree == null || tree.root() instanceof LeafNode) {
         continue;
       }
       final Chunk nids = chk_nids(chks, cls); // Node-ids for this tree/class
       final Chunk votes = chk_tree(chks, cls); // Working column holding votes for this class
       final int encodedNid = (int) nids.at80(row);

       // Only out-of-bag rows get an on-the-fly prediction.
       if (isOOBRow(encodedNid)) { // The row should be OOB for all k-trees !!!
         assert cls == 0 || sawOob
             : "Something is wrong: k-class trees oob row computing is broken! All k-trees should agree on oob row!";
         sawOob = true;

         int nid = oob2Nid(encodedNid);
         if (tree.node(nid) instanceof UndecidedNode) {
           nid = tree.node(nid).pid(); // Bottomed out: fall back to the parent
         }
         DecidedNode decision = tree.decided(nid); // Must have a decision point
         if (decision._split.col() == -1) {
           decision = tree.decided(tree.node(nid).pid()); // Unable to decide: use the parent
         }
         final int leafNid = decision.ns(chks, row); // Decide down to a leaf node
         final LeafNode leaf = (LeafNode) tree.node(leafNid);
         final double prediction = leaf.pred(); // Prediction for this class and this row

         if (importance) {
           rpred[1 + cls] = (float) prediction; // for both regression and classification
         }
         // Classification: cumulative votes. Regression: cumulative sum of tree predictions,
         // which has to be normalized by the number of trees later.
         votes.set0(row, (float) (votes.at0(row) + prediction));
         // For regression track number of trees, for classification a boolean flag is enough.
         oobt.set0(row, _nclass > 1 ? 1 : oobt.at0(row) + 1);
       }
       // Reset help column for this row and this class.
       nids.set0(row, 0);
     }

     // Accuracy bookkeeping applies only to OOB rows with a known response.
     if (!importance || !sawOob || y.isNA0(row)) {
       continue;
     }
     if (classification) {
       final int predictedClass = ModelUtils.getPrediction(rpred, data_row(chks, row, rowdata));
       final int actualClass = (int) y.at80(row);
       if (predictedClass == actualClass) {
         rightVotes++; // No miss !
       }
     } else { // regression
       final float predicted = rpred[1];
       final float actual = (float) y.at0(row);
       final float err = actual - predicted;
       sse += err * err;
     }
     allRows++;
   }
 }
예제 #28
0
파일: AstCumu.java 프로젝트: h2oai/h2o-3
 /**
  * Emits the running accumulation of {@code op} over the chunk, seeded with {@code _init}, and
  * records the chunk's final accumulated value in {@code _chkCumu} at the chunk's index.
  *
  * @param c input chunk
  * @param nc output chunk receiving one accumulated value per input row
  */
 @Override
 public void map(Chunk c, NewChunk nc) {
   double running = _init;
   for (int row = 0; row < c._len; row++) {
     running = op(running, c.atd(row));
     nc.addNum(running);
   }
   _chkCumu[c.cidx()] = running;
 }
예제 #29
0
파일: MRUtils.java 프로젝트: Jrobinso09/h2o
 /**
  * Counts, per class index, how many non-missing rows of the chunk carry that value. The counts
  * are written into a freshly allocated {@code _ys} of length {@code _nclass}.
  *
  * @param ys chunk whose non-NA values are used as indices into {@code _ys}
  */
 @Override
 public void map(Chunk ys) {
   _ys = new long[_nclass];
   for (int row = 0; row < ys._len; row++) {
     if (ys.isNA0(row)) {
       continue; // missing values are not counted
     }
     final int cls = (int) ys.at80(row);
     _ys[cls] += 1;
   }
 }