Example #1
0
  /**
   * Extract (dense) rows from given chunks, one Vec at a time - should be slightly faster than
   * per-row
   *
   * @param chunks - chunk of dataset
   * @return array of dense rows
   */
  public final Row[] extractDenseRowsVertical(Chunk[] chunks) {
    Row[] rows = new Row[chunks[0]._len];

    for (int i = 0; i < rows.length; ++i) {
      rows[i] = new Row(false, _nums, _cats, _responses, 0);
      rows[i].rid = chunks[0].start() + i;
      if (_offset) {
        rows[i].offset = chunks[offsetChunkId()].atd(i);
        if (Double.isNaN(rows[i].offset)) rows[i].bad = true;
      }
      if (_weights) {
        rows[i].weight = chunks[weightChunkId()].atd(i);
        if (Double.isNaN(rows[i].weight)) rows[i].bad = true;
      }
    }
    for (int i = 0; i < _cats; ++i) {
      for (int r = 0; r < chunks[0]._len; ++r) {
        Row row = rows[r];
        if (row.bad) continue;
        if (chunks[i].isNA(r)) {
          if (_skipMissing) {
            row.bad = true;
          } else
            row.binIds[row.nBins++] =
                _catOffsets[i + 1] - 1; // missing value turns into extra (last) factor
        } else {
          int c = getCategoricalId(i, (int) chunks[i].at8(r));
          if (c >= 0) row.binIds[row.nBins++] = c;
        }
      }
    }
    int numStart = numStart();
    // generic numbers
    for (int cid = 0; cid < _nums; ++cid) {
      Chunk c = chunks[_cats + cid];
      for (int r = 0; r < c._len; ++r) {
        Row row = rows[r];
        if (row.bad) continue;
        if (c.isNA(r)) row.bad = _skipMissing;
        double d = c.atd(r);
        if (_normMul != null && _normSub != null) // either none or both
        d = (d - _normSub[cid]) * _normMul[cid];
        row.numVals[numStart + cid] = d;
      }
    }
    // response(s)
    for (int i = 1; i <= _responses; ++i) {
      Chunk rChunk = chunks[responseChunkId()];
      for (int r = 0; r < chunks[0]._len; ++r) {
        Row row = rows[r];
        if (row.bad) continue;
        row.response[row.response.length - i] = rChunk.atd(r);
        if (_normRespMul != null) {
          row.response[i - 1] = (row.response[i - 1] - _normRespSub[i - 1]) * _normRespMul[i - 1];
        }
        if (Double.isNaN(row.response[row.response.length - i])) row.bad = true;
      }
    }
    return rows;
  }
Example #2
0
 @Override
 public void map(Chunk response, Chunk weight, Chunk offset) {
   for (int i = 0; i < response._len; ++i) {
     if (response.isNA(i)) continue;
     double w = weight.atd(i);
     if (w == 0) continue;
     double y = response.atd(i);
     double o = offset.atd(i);
     _num += _dist.initFNum(w, o, y);
     _denom += _dist.initFDenom(w, o);
   }
 }
Example #3
0
 @Override
 public void map(Chunk c, Chunk w) {
   for (int i = 0; i < c.len(); ++i)
     if (!c.isNA(i)) {
       double wt = w.atd(i);
       //          For now: let the user give small weights, results are probably not very good
       // (same as for wtd.quantile in R)
       //          if (wt > 0 && wt < 1) throw new H2OIllegalArgumentException("Quantiles only
       // accepts weights that are either 0 or >= 1.");
       sum += wt;
     }
 }
Example #4
0
 @Override
 public void map(Chunk chks[]) {
   Chunk cy = chk_resp(chks);
   for (int i = 0; i < cy._len; i++) {
     if (cy.isNA(i)) continue;
     if (isClassifier()) {
       int cls = (int) cy.at8(i);
       chk_work(chks, cls).set(i, 1L);
     } else {
       float pred = (float) cy.atd(i);
       chk_work(chks, 0).set(i, pred);
     }
   }
 }
Example #5
0
  public final Row extractDenseRow(Chunk[] chunks, int rid, Row row) {
    row.bad = false;
    row.rid = rid + chunks[0].start();
    if (_weights) row.weight = chunks[weightChunkId()].atd(rid);
    if (row.weight == 0) return row;
    if (_skipMissing)
      for (Chunk c : chunks)
        if (c.isNA(rid)) {
          row.bad = true;
          return row;
        }
    int nbins = 0;
    for (int i = 0; i < _cats; ++i) {
      if (chunks[i].isNA(rid)) {
        if (_imputeMissing) {
          int c = getCategoricalId(i, _catModes[i]);
          if (c >= 0) row.binIds[nbins++] = c;
        } else // TODO: What if missingBucket = false?
        row.binIds[nbins++] =
              _catOffsets[i + 1] - 1; // missing value turns into extra (last) factor
      } else {
        int c = getCategoricalId(i, (int) chunks[i].at8(rid));
        if (c >= 0) row.binIds[nbins++] = c;
      }
    }
    row.nBins = nbins;
    final int n = _nums;
    for (int i = 0; i < n; ++i) {
      double d = chunks[_cats + i].atd(rid); // can be NA if skipMissing() == false
      if (_imputeMissing && Double.isNaN(d)) d = _numMeans[i];
      if (_normMul != null && _normSub != null) d = (d - _normSub[i]) * _normMul[i];
      row.numVals[i] = d;
    }
    for (int i = 0; i < _responses; ++i) {
      row.response[i] = chunks[responseChunkId()].atd(rid);
      if (_normRespMul != null)
        row.response[i] = (row.response[i] - _normRespSub[i]) * _normRespMul[i];
      if (Double.isNaN(row.response[i])) {
        row.bad = true;
        return row;
      }
    }
    if (_offset) row.offset = chunks[offsetChunkId()].atd(rid);

    return row;
  }
Example #6
0
 @Override
 public void map(Chunk cs) {
   int idx = _chunkOffset + cs.cidx();
   Key ckey = Vec.chunkKey(_v._key, idx);
   if (_cmap != null) {
     assert !cs.hasFloat()
         : "Input chunk (" + cs.getClass() + ") has float, but is expected to be categorical";
     NewChunk nc = new NewChunk(_v, idx);
     // loop over rows and update ints for new domain mapping according to vecs[c].domain()
     for (int r = 0; r < cs._len; ++r) {
       if (cs.isNA(r)) nc.addNA();
       else nc.addNum(_cmap[(int) cs.at8(r)], 0);
     }
     nc.close(_fs);
   } else {
     DKV.put(ckey, cs.deepCopy(), _fs, true);
   }
 }
Example #7
0
      @Override
      public void map(Chunk[] chks) {
        final Chunk y = importance ? chk_resp(chks) : null; // Response
        final double[] rpred = importance ? new double[1 + _nclass] : null; // Row prediction
        final double[] rowdata = importance ? new double[_ncols] : null; // Pre-allocated row data
        final Chunk oobt = chk_oobt(chks); // Out-of-bag rows counter over all trees
        // Iterate over all rows
        for (int row = 0; row < oobt._len; row++) {
          final boolean wasOOBRow = ScoreBuildHistogram.isOOBRow((int) chk_nids(chks, 0).at8(row));

          // For all tree (i.e., k-classes)
          for (int k = 0; k < _nclass; k++) {
            final DTree tree = _trees[k];
            if (tree == null) continue; // Empty class is ignored
            final Chunk nids = chk_nids(chks, k); // Node-ids  for this tree/class
            int nid = (int) nids.at8(row); // Get Node to decide from
            // Update only out-of-bag rows
            // This is out-of-bag row - but we would like to track on-the-fly prediction for the row
            if (wasOOBRow) {
              final Chunk ct =
                  chk_tree(chks, k); // k-tree working column holding votes for given row
              nid = ScoreBuildHistogram.oob2Nid(nid);
              if (tree.node(nid) instanceof UndecidedNode) // If we bottomed out the tree
              nid = tree.node(nid).pid(); // Then take parent's decision
              int leafnid;
              if (tree.root() instanceof LeafNode) {
                leafnid = 0;
              } else {
                DecidedNode dn = tree.decided(nid); // Must have a decision point
                if (dn._split.col() == -1) // Unable to decide?
                dn = tree.decided(tree.node(nid).pid()); // Then take parent's decision
                leafnid = dn.ns(chks, row); // Decide down to a leafnode
              }
              // Setup Tree(i) - on the fly prediction of i-tree for row-th row
              //   - for classification: cumulative number of votes for this row
              //   - for regression: cumulative sum of prediction of each tree - has to be
              // normalized by number of trees
              double prediction =
                  ((LeafNode) tree.node(leafnid))
                      .pred(); // Prediction for this k-class and this row
              if (importance)
                rpred[1 + k] = (float) prediction; // for both regression and classification
              ct.set(row, (float) (ct.atd(row) + prediction));
            }
            // reset help column for this row and this k-class
            nids.set(row, 0);
          } /* end of k-trees iteration */
          // For this tree this row is out-of-bag - i.e., a tree voted for this row
          if (wasOOBRow) oobt.set(row, oobt.atd(row) + 1); // track number of trees
          if (importance) {
            if (wasOOBRow && !y.isNA(row)) {
              if (isClassifier()) {
                int treePred = getPrediction(rpred, data_row(chks, row, rowdata), _threshold);
                int actuPred = (int) y.at8(row);
                if (treePred == actuPred) rightVotes++; // No miss !
              } else { // regression
                double treePred = rpred[1];
                double actuPred = y.atd(row);
                sse += (actuPred - treePred) * (actuPred - treePred);
              }
              allRows++;
            }
          }
        }
      }
Example #8
0
 @Override
 public void map(Chunk chks[]) {
   Chunk ys = chk_resp(chks);
   for (int row = 0; row < ys._len; row++)
     if (ys.isNA(row)) for (int t = 0; t < _nclass; t++) chk_nids(chks, t).set(row, -1);
 }
Example #9
0
  /**
   * Extract (sparse) rows from given chunks. Note: 0 remains 0 - _normSub of DataInfo isn't used
   * (mean shift during standarization is not reverted) - UNLESS offset is specified (for GLM only)
   * Essentially turns the dataset 90 degrees.
   *
   * @param chunks - chunk of dataset
   * @param offset - adjustment for 0s if running with on-the-fly standardization (i.e. zeros are
   *     not really zeros because of centering)
   * @return array of sparse rows
   */
  public final Row[] extractSparseRows(Chunk[] chunks, double offset) {
    Row[] rows = new Row[chunks[0]._len];

    for (int i = 0; i < rows.length; ++i) {
      rows[i] = new Row(true, Math.min(_nums, 16), _cats, _responses, offset);
      rows[i].rid = chunks[0].start() + i;
      if (_offset) {
        rows[i].offset = chunks[offsetChunkId()].atd(i);
        if (Double.isNaN(rows[i].offset)) rows[i].bad = true;
      }
      if (_weights) {
        rows[i].weight = chunks[weightChunkId()].atd(i);
        if (Double.isNaN(rows[i].weight)) rows[i].bad = true;
      }
    }
    // categoricals
    for (int i = 0; i < _cats; ++i) {
      for (int r = 0; r < chunks[0]._len; ++r) {
        Row row = rows[r];
        if (row.bad) continue;
        if (chunks[i].isNA(r)) {
          if (_skipMissing) {
            row.bad = true;
          } else
            row.binIds[row.nBins++] =
                _catOffsets[i + 1] - 1; // missing value turns into extra (last) factor
        } else {
          int c = getCategoricalId(i, (int) chunks[i].at8(r));
          if (c >= 0) row.binIds[row.nBins++] = c;
        }
      }
    }
    int numStart = numStart();
    // generic numbers
    for (int cid = 0; cid < _nums; ++cid) {
      Chunk c = chunks[_cats + cid];
      int oldRow = -1;
      for (int r = c.nextNZ(-1); r < c._len; r = c.nextNZ(r)) {
        if (c.atd(r) == 0) continue;
        assert r > oldRow;
        oldRow = r;
        Row row = rows[r];
        if (row.bad) continue;
        if (c.isNA(r)) row.bad = _skipMissing;
        double d = c.atd(r);
        if (_normMul != null) d *= _normMul[cid];
        row.addNum(cid + numStart, d);
      }
    }
    // response(s)
    for (int i = 1; i <= _responses; ++i) {
      Chunk rChunk = chunks[responseChunkId()];
      for (int r = 0; r < chunks[0]._len; ++r) {
        Row row = rows[r];
        if (row.bad) continue;
        row.response[row.response.length - i] = rChunk.atd(r);
        if (_normRespMul != null) {
          row.response[i - 1] = (row.response[i - 1] - _normRespSub[i - 1]) * _normRespMul[i - 1];
        }
        if (Double.isNaN(row.response[row.response.length - i])) row.bad = true;
      }
    }
    return rows;
  }
Example #10
0
 /**
  * Extract (sparse) rows from given chunks. Note: 0 remains 0 - _normSub of DataInfo isn't used
  * (mean shift during standarization is not reverted) - UNLESS offset is specified (for GLM only)
  * Essentially turns the dataset 90 degrees.
  *
  * @param chunks - chunk of dataset
  * @return array of sparse rows
  */
 public final Row[] extractSparseRows(Chunk[] chunks) {
   Row[] rows = new Row[chunks[0]._len];
   long startOff = chunks[0].start();
   for (int i = 0; i < rows.length; ++i) {
     rows[i] =
         new Row(
             true,
             Math.min(_nums, 16),
             _cats,
             _responses,
             i,
             startOff); // if sparse, _nums is the correct number of nonzero values! i.e., do not
     // use numNums()
     rows[i].rid = chunks[0].start() + i;
     if (_offset) {
       rows[i].offset = chunks[offsetChunkId()].atd(i);
       if (Double.isNaN(rows[i].offset)) rows[i].bad = true;
     }
     if (_weights) {
       rows[i].weight = chunks[weightChunkId()].atd(i);
       if (Double.isNaN(rows[i].weight)) rows[i].bad = true;
     }
     if (_skipMissing) {
       int N = _cats + _nums;
       for (int c = 0; c < N; ++c) if (chunks[c].isNA(i)) rows[i].bad = true;
     }
   }
   // categoricals
   for (int i = 0; i < _cats; ++i) {
     for (int r = 0; r < chunks[0]._len; ++r) {
       Row row = rows[r];
       if (row.bad) continue;
       int cid = getCategoricalId(i, chunks[i].isNA(r) ? _catModes[i] : (int) chunks[i].at8(r));
       if (cid >= 0) row.binIds[row.nBins++] = cid;
     }
   }
   // generic numbers + interactions
   int interactionOffset = 0;
   for (int cid = 0; cid < _nums; ++cid) {
     Chunk c = chunks[_cats + cid];
     int oldRow = -1;
     if (c
         instanceof
         InteractionWrappedVec
             .InteractionWrappedChunk) { // for each row, only 1 value in an interaction is 'hot'
       // all other values are off (i.e., are 0)
       for (int r = 0;
           r < c._len;
           ++r) { // the vec is "vertically" dense and "horizontally" sparse (i.e., every row has
         // one, and only one, value)
         Row row = rows[r];
         if (row.bad) continue;
         if (c.isNA(r)) row.bad = _skipMissing;
         int cidVirtualOffset =
             getInteractionOffset(
                 chunks, _cats + cid, r); // the "virtual" offset into the hot-expanded interaction
         row.addNum(
             _numOffsets[cid] + cidVirtualOffset,
             c.atd(r)); // FIXME: if this produces a "true" NA then should sub with mean? with?
       }
       interactionOffset += nextNumericIdx(cid);
     } else {
       for (int r = c.nextNZ(-1); r < c._len; r = c.nextNZ(r)) {
         if (c.atd(r) == 0) continue;
         assert r > oldRow;
         oldRow = r;
         Row row = rows[r];
         if (row.bad) continue;
         if (c.isNA(r)) row.bad = _skipMissing;
         double d = c.atd(r);
         if (Double.isNaN(d)) d = _numMeans[cid];
         if (_normMul != null) d *= _normMul[interactionOffset];
         row.addNum(_numOffsets[cid], d);
       }
       interactionOffset++;
     }
   }
   // response(s)
   for (int i = 1; i <= _responses; ++i) {
     int rid = responseChunkId(i - 1);
     Chunk rChunk = chunks[rid];
     for (int r = 0; r < chunks[0]._len; ++r) {
       Row row = rows[r];
       if (row.bad) continue;
       row.response[i - 1] = rChunk.atd(r);
       if (_normRespMul != null) {
         row.response[i - 1] = (row.response[i - 1] - _normRespSub[i - 1]) * _normRespMul[i - 1];
       }
       if (Double.isNaN(row.response[row.response.length - i])) row.bad = true;
     }
   }
   return rows;
 }