Example #1
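 // Inflate the chunk's sparse representation back into dense arrays of length _len, scattering
 // the stored values to the row positions recorded in _id, then drop the row-index array.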
 protected void cancel_sparse() {
   if (sparseLen() != _len) {
     if (_is != null) {
       int[] is = MemoryManager.malloc4(_len);
       for (int i = 0; i < _len; i++) is[i] = -1;
       for (int i = 0; i < sparseLen(); i++) is[_id[i]] = _is[i];
       _is = is;
     } else if (_ds == null) {
       int[] xs = MemoryManager.malloc4(_len);
       long[] ls = MemoryManager.malloc8(_len);
       for (int i = 0; i < sparseLen(); ++i) {
         xs[_id[i]] = _xs[i];
         ls[_id[i]] = _ls[i];
       }
       _xs = xs;
       _ls = ls;
     } else {
       double[] ds = MemoryManager.malloc8d(_len);
       for (int i = 0; i < sparseLen(); ++i) ds[_id[i]] = _ds[i];
       _ds = ds;
     }
     set_sparseLen(_len);
   }
   _id = null;
 }
Example #2
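 // Pre-allocate a Row with nBins categorical slots, nNums numeric slots and nresponses response
 // slots; a sparse Row additionally gets an index array for its numeric values and starts with
 // zero numeric entries.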
 public Row(boolean sparse, int nNums, int nBins, int nresponses, double etaOffset) {
   binIds = MemoryManager.malloc4(nBins);
   numVals = MemoryManager.malloc8d(nNums);
   response = MemoryManager.malloc8d(nresponses);
   if (sparse) numIds = MemoryManager.malloc4(nNums);
   this.etaOffset = etaOffset;
   this.nNums = sparse ? 0 : nNums;
 }
Example #3
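 // Same pre-allocation as the previous example, additionally recording the within-chunk index
 // (cid) and the absolute row id (rid = chunk start + i).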
 public Row(boolean sparse, int nNums, int nBins, int nresponses, int i, long start) {
   binIds = MemoryManager.malloc4(nBins);
   numVals = MemoryManager.malloc8d(nNums);
   response = MemoryManager.malloc8d(nresponses);
   if (sparse) numIds = MemoryManager.malloc4(nNums);
   this.nNums = sparse ? 0 : nNums;
   cid = i;
   rid = start + i;
 }
Example #4
 // filter the current active columns using the strong rules
  // note: strong rules are updated so that they keep all previous coefficients in, to prevent issues
 // with line-search
 private int[] activeCols(final double l1, final double l2, final double[] grad) {
   final double rhs = alpha[0] * (2 * l1 - l2);
   int[] cols = MemoryManager.malloc4(_dinfo.fullN());
   int selected = 0;
   int j = 0;
   if (_activeCols == null) _activeCols = new int[] {-1};
   for (int i = 0; i < _dinfo.fullN(); ++i)
     if ((j < _activeCols.length && i == _activeCols[j]) || grad[i] > rhs || grad[i] < -rhs) {
       cols[selected++] = i;
       if (j < _activeCols.length && i == _activeCols[j]) ++j;
     }
   if (!strong_rules_enabled || selected == _dinfo.fullN()) {
     _activeCols = null;
     _activeData._adaptedFrame = _dinfo._adaptedFrame;
     _activeData = _dinfo;
   } else {
     _activeCols = Arrays.copyOf(cols, selected);
     _activeData = _dinfo.filterExpandedColumns(_activeCols);
   }
   Log.info(
       "GLM2 strong rule at lambda="
           + l1
           + ", got "
           + selected
           + " active cols out of "
           + _dinfo.fullN()
           + " total.");
   return _activeCols;
 }
Example #5
  // Slow-path append string
  private void append2slowstr() {
    // In case of all NAs and then a string, convert NAs to string NAs
    if (_xs != null) {
      _xs = null;
      _ls = null;
      alloc_str_indices(sparseLen());
      Arrays.fill(_is, -1);
    }

    if (_is != null && _is.length > 0) {
      // Check for sparseness
      if (_id == null) {
        int nzs = 0; // assume one non-null for the element currently being stored
        for (int i : _is) if (i != -1) ++nzs;
        if ((nzs + 1) * _sparseRatio < _len) set_sparse(nzs);
      } else {
        if ((_sparseRatio * (_sparseLen) >> 1) > _len) cancel_sparse();
        else _id = MemoryManager.arrayCopyOf(_id, _sparseLen << 1);
      }

      _is = MemoryManager.arrayCopyOf(_is, sparseLen() << 1);
      /* initialize the memory extension with -1s */
      for (int i = sparseLen(); i < _is.length; i++) _is[i] = -1;
    } else {
      _is = MemoryManager.malloc4(4);
      /* initialize everything with -1s */
      for (int i = 0; i < _is.length; i++) _is[i] = -1;
      if (sparse()) alloc_indices(4);
    }
     assert sparseLen() == 0 || _is.length > sparseLen()
         : "_is.length = " + _is.length + ", sparseLen() = " + sparseLen();
  }
Example #6
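 // Store one GLM submodel for the given lambda: count the non-zero coefficients (rank) and copy
 // beta (and norm_beta, if present) either densely or, when sparseCoef is set, only at the
 // non-zero indices kept in idxs.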
 public Submodel(
     double lambda,
     double[] beta,
     double[] norm_beta,
     long run_time,
     int iteration,
     boolean sparseCoef) {
   this.lambda_value = lambda;
   this.run_time = run_time;
   this.iteration = iteration;
   int r = 0;
   if (beta != null) {
     final double[] b = norm_beta != null ? norm_beta : beta;
      // grab the indices of non-zero coefficients
     for (double d : beta) if (d != 0) ++r;
     idxs = MemoryManager.malloc4(sparseCoef ? r : beta.length);
     int j = 0;
     for (int i = 0; i < beta.length; ++i) if (!sparseCoef || beta[i] != 0) idxs[j++] = i;
     j = 0;
     this.beta = MemoryManager.malloc8d(idxs.length);
     for (int i : idxs) this.beta[j++] = beta[i];
     if (norm_beta != null) {
       j = 0;
       this.norm_beta = MemoryManager.malloc8d(idxs.length);
       for (int i : idxs) this.norm_beta[j++] = norm_beta[i];
     }
   } else idxs = null;
   rank = r;
   this.sparseCoef = sparseCoef;
 }
Example #7
 // Slow-path append data
 private void append2slow() {
   if (_len > Vec.CHUNK_SZ) throw new ArrayIndexOutOfBoundsException(_len);
   assert _ds == null;
   if (_len2 == _len) { // Check for sparse-ness now & then
     int nzcnt = 0;
     for (int i = 0; i < _len; i++) {
       if (_ls[i] != 0) nzcnt++;
       if (_xs[i] != 0) {
         nzcnt = Vec.CHUNK_SZ;
         break;
       } // Only non-specials sparse
     }
     if (_len >= 32 && nzcnt * 8 <= _len) { // Heuristic for sparseness
       _len = 0;
       for (int i = 0; i < _len2; i++)
         if (_ls[i] != 0) {
           _xs[_len] = i; // Row number in xs
           _ls[_len++] = _ls[i]; // Sparse value in ls
         }
       return; // Compressed, so lots of room now
     }
   }
   _xs = _ls == null ? MemoryManager.malloc4(4) : MemoryManager.arrayCopyOf(_xs, _len << 1);
   _ls = _ls == null ? MemoryManager.malloc8(4) : MemoryManager.arrayCopyOf(_ls, _len << 1);
 }
Example #8
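  // Inflate the compressed chunk back to a dense one: _xs currently holds row numbers, so
  // scatter _ls into a full-length array and reset the length to the uncompressed _len2.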
 private void cancel_sparse() {
    long[] ls = MemoryManager.malloc8(_len2 + 1);
    for (int i = 0; i < _len; i++) // Inflate ls to hold values
      ls[_xs[i]] = _ls[i];
   _ls = ls;
   _xs = MemoryManager.malloc4(_len2 + 1);
   _len = _len2; // Not compressed now!
 }
Example #9
 // private constructor called by filterExpandedColumns
 private DataInfo(
     DataInfo dinfo,
     Frame fr,
     double[] normMul,
     double[] normSub,
     int[][] catLevels,
     int[] catModes) {
   _fullCatOffsets = dinfo._catOffsets;
   if (!dinfo._useAllFactorLevels) {
     _fullCatOffsets = dinfo._catOffsets.clone();
     for (int i = 0; i < _fullCatOffsets.length; ++i)
       _fullCatOffsets[i] += i; // add for the skipped zeros.
   }
   _offset = dinfo._offset;
   _weights = dinfo._weights;
   _fold = dinfo._fold;
   _valid = false;
   _interactions = dinfo._interactions;
   _interactionVecs = dinfo._interactionVecs;
   assert dinfo._predictor_transform != null;
   assert dinfo._response_transform != null;
   _predictor_transform = dinfo._predictor_transform;
   _response_transform = dinfo._response_transform;
   _skipMissing = dinfo._skipMissing;
   _imputeMissing = dinfo._imputeMissing;
   _adaptedFrame = fr;
   _catOffsets = MemoryManager.malloc4(catLevels.length + 1);
   _catMissing = new boolean[catLevels.length];
   Arrays.fill(_catMissing, !(dinfo._imputeMissing || dinfo._skipMissing));
   int s = 0;
   for (int i = 0; i < catLevels.length; ++i) {
     _catOffsets[i] = s;
     s += catLevels[i].length;
   }
   _catLvls = catLevels;
   _catOffsets[_catOffsets.length - 1] = s;
   _responses = dinfo._responses;
   _cats = catLevels.length;
   _nums =
       fr.numCols()
           - _cats
           - dinfo._responses
           - (_offset ? 1 : 0)
           - (_weights ? 1 : 0)
           - (_fold ? 1 : 0);
   _numOffsets = _nums == 0 ? new int[0] : dinfo._numOffsets.clone();
   int diff = _numOffsets.length > 0 ? _numOffsets[0] - s : 0;
    for (int i = 0; i < _numOffsets.length; i++) // need to shift everyone down by the offset!
      _numOffsets[i] -= diff;
   _useAllFactorLevels = true; // dinfo._useAllFactorLevels;
   _numMeans = new double[_nums];
   _normMul = normMul;
   _normSub = normSub;
   _catModes = catModes;
   for (int i = 0; i < _nums; i++) _numMeans[i] = _adaptedFrame.vec(_cats + i).mean();
 }
Example #10
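 // Wrap existing value arrays into a Row; in the sparse case an index array of matching length
 // is allocated and the numeric count starts at zero so values can be appended one by one.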
 public Row(
     boolean sparse, double[] numVals, int[] binIds, double[] response, double etaOffset) {
   int nNums = numVals == null ? 0 : numVals.length;
   this.numVals = numVals;
   if (sparse) numIds = MemoryManager.malloc4(nNums);
   this.etaOffset = etaOffset;
   this.nNums = sparse ? 0 : nNums;
   this.nBins = binIds == null ? 0 : binIds.length;
   this.binIds = binIds;
   this.response = response;
 }
Example #11
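 // As in the previous example, but also recording the within-chunk index (cid) and the absolute
 // row id (rid = start + i).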
 public Row(
     boolean sparse, double[] numVals, int[] binIds, double[] response, int i, long start) {
   int nNums = numVals == null ? 0 : numVals.length;
   this.numVals = numVals;
   if (sparse) numIds = MemoryManager.malloc4(nNums);
   this.nNums = sparse ? 0 : nNums;
   this.nBins = binIds == null ? 0 : binIds.length;
   this.binIds = binIds;
   this.response = response;
   cid = i;
   rid = start + i;
 }
Example #12
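 // Decompress this two-byte-per-value chunk into a NewChunk: all exponents are set to the decimal
 // exponent of the chunk's scale, NA sentinels are propagated, and other values get the bias added.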
 @Override
 NewChunk inflate_impl(NewChunk nc) {
   double dx = Math.log10(_scale);
   assert DParseTask.fitsIntoInt(dx);
   Arrays.fill(nc._xs = MemoryManager.malloc4(_len), (int) dx);
   nc._ls = MemoryManager.malloc8(_len);
   for (int i = 0; i < _len; i++) {
     int res = UDP.get2(_mem, (i << 1) + OFF);
     if (res == C2Chunk._NA) nc.setNA_impl2(i);
     else nc._ls[i] = res + _bias;
   }
   return nc;
 }
Example #13
 // private constructor called by filterExpandedColumns
 private DataInfo(
     Key<DataInfo> selfKey,
     Frame fr,
     double[] normMul,
     double[] normSub,
     int[][] catLevels,
     int responses,
     TransformType predictor_transform,
     TransformType response_transform,
     boolean skipMissing,
     boolean imputeMissing,
     boolean weight,
     boolean offset,
     boolean fold) {
   super(selfKey);
   _offset = offset;
   _weights = weight;
   _fold = fold;
   _valid = false;
   assert predictor_transform != null;
   assert response_transform != null;
   _predictor_transform = predictor_transform;
   _response_transform = response_transform;
   _skipMissing = skipMissing;
   _imputeMissing = imputeMissing;
   _adaptedFrame = fr;
   _catOffsets = MemoryManager.malloc4(catLevels.length + 1);
   _catMissing = new int[catLevels.length];
   int s = 0;
   for (int i = 0; i < catLevels.length; ++i) {
     _catOffsets[i] = s;
     s += catLevels[i].length;
   }
   _catLvls = catLevels;
   _catOffsets[_catOffsets.length - 1] = s;
   _responses = responses;
   _cats = catLevels.length;
   _nums =
       fr.numCols() - _cats - responses - (_offset ? 1 : 0) - (_weights ? 1 : 0) - (_fold ? 1 : 0);
   _useAllFactorLevels = true;
   _catModes = new int[_cats];
   _numMeans = new double[_nums];
   _normMul = normMul;
   _normSub = normSub;
   for (int i = 0; i < _cats; i++) _catModes[i] = imputeCat(_adaptedFrame.vec(i));
   for (int i = 0; i < _nums; i++) _numMeans[i] = _adaptedFrame.vec(_cats + i).mean();
 }
Example #14
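 // Store one GLM submodel for the given lambda together with its train/test deviance; if some
 // coefficients are zero, keep only the non-zero ones plus their indices, otherwise clone beta.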
 public Submodel(double lambda, double[] beta, int iteration, double devTrain, double devTest) {
   this.lambda_value = lambda;
   this.iteration = iteration;
   this.devianceTrain = devTrain;
   this.devianceTest = devTest;
   this.betaMultinomial = null;
   int r = 0;
   if (beta != null) {
      // grab the indices of non-zero coefficients
     for (int i = 0; i < beta.length; ++i) if (beta[i] != 0) ++r;
     if (r < beta.length) {
       idxs = MemoryManager.malloc4(r);
       int j = 0;
       for (int i = 0; i < beta.length; ++i) if (beta[i] != 0) idxs[j++] = i;
       this.beta = ArrayUtils.select(beta, idxs);
     } else {
       this.beta = beta.clone();
       idxs = null;
     }
   } else {
     this.beta = null;
     idxs = null;
   }
 }
Example #15
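     // Build a DataInfo over fr: split the columns into categoricals and numerics, order the
     // categoricals by decreasing cardinality, and optionally compute standardization constants
     // for the numeric predictors and the responses.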
    public DataInfo(Frame fr, int nResponses, boolean standardize, boolean standardize_response) {
      _nfolds = _foldId = 0;
      _standardize = standardize;
      _standardize_response = standardize_response;
      _responses = nResponses;
      final Vec[] vecs = fr.vecs();
      final int n = vecs.length - _responses;
      if (n < 1) throw new IllegalArgumentException("Training data must have at least one column.");
      int[] nums = MemoryManager.malloc4(n);
      int[] cats = MemoryManager.malloc4(n);
      int nnums = 0, ncats = 0;
      for (int i = 0; i < n; ++i) {
        if (vecs[i].isEnum()) cats[ncats++] = i;
        else nums[nnums++] = i;
      }
      _nums = nnums;
      _cats = ncats;
       // sort the cats in decreasing order of their cardinality
      for (int i = 0; i < ncats; ++i)
        for (int j = i + 1; j < ncats; ++j)
          if (vecs[cats[i]].domain().length < vecs[cats[j]].domain().length) {
            int x = cats[i];
            cats[i] = cats[j];
            cats[j] = x;
          }
      Vec[] vecs2 = vecs.clone();
      String[] names = fr._names.clone();
      _catOffsets = MemoryManager.malloc4(ncats + 1);
      int len = _catOffsets[0] = 0;

      for (int i = 0; i < ncats; ++i) {
        Vec v = (vecs2[i] = vecs[cats[i]]);
        names[i] = fr._names[cats[i]];
        _catOffsets[i + 1] = (len += v.domain().length - 1);
      }
      if (standardize) {
        _normSub = MemoryManager.malloc8d(nnums);
        _normMul = MemoryManager.malloc8d(nnums);
        Arrays.fill(_normMul, 1);
      } else _normSub = _normMul = null;
      for (int i = 0; i < nnums; ++i) {
        Vec v = (vecs2[i + ncats] = vecs[nums[i]]);
        names[i + ncats] = fr._names[nums[i]];
        if (standardize) {
          _normSub[i] = v.mean();
          _normMul[i] = v.sigma() != 0 ? 1.0 / v.sigma() : 1.0;
        }
      }

      if (standardize_response) {
        _normRespSub = MemoryManager.malloc8d(_responses);
        _normRespMul = MemoryManager.malloc8d(_responses);
        Arrays.fill(_normRespMul, 1);
      } else _normRespSub = _normRespMul = null;
      for (int i = 0; i < _responses; ++i) {
        Vec v = (vecs2[nnums + ncats + i] = vecs[nnums + ncats + i]);
        if (standardize_response) {
          _normRespSub[i] = v.mean();
          _normRespMul[i] = v.sigma() != 0 ? 1.0 / v.sigma() : 1.0;
          // Log.info("normalization for response[" + i + "]: mul " + _normRespMul[i] + ", sub " + _normRespSub[i]);
        }
      }
      _adaptedFrame = new Frame(names, vecs2);
      _adaptedFrame.reloadVecs();
    }
Example #16
  /**
    * The train/valid Frame instances are ordered so that the categorical columns come first
    * (themselves sorted by cardinality, greatest to least), with all numerical columns following.
    * The response column(s) are placed at the end.
   *
   * <p>Interactions: 1. Num-Num (Note: N(0,1) * N(0,1) ~ N(0,1) ) 2. Num-Enum 3. Enum-Enum
   *
   * <p>Interactions are produced on the fly and are dense (in all 3 cases). Consumers of DataInfo
   * should not have to care how these interactions are generated. Any heuristic using the fullN
   * value should continue functioning the same.
   *
   * <p>Interactions are specified in two ways: A. As a list of pairs of column indices. B. As a
   * list of pairs of column indices with limited enums.
   */
  public DataInfo(
      Frame train,
      Frame valid,
      int nResponses,
      boolean useAllFactorLevels,
      TransformType predictor_transform,
      TransformType response_transform,
      boolean skipMissing,
      boolean imputeMissing,
      boolean missingBucket,
      boolean weight,
      boolean offset,
      boolean fold,
      Model.InteractionPair[] interactions) {
    super(Key.<DataInfo>make());
    _valid = valid != null;
    assert predictor_transform != null;
    assert response_transform != null;
    _offset = offset;
    _weights = weight;
    _fold = fold;
    assert !(skipMissing && imputeMissing) : "skipMissing and imputeMissing cannot both be true";
    _skipMissing = skipMissing;
    _imputeMissing = imputeMissing;
    _predictor_transform = predictor_transform;
    _response_transform = response_transform;
    _responses = nResponses;
    _useAllFactorLevels = useAllFactorLevels;
    _interactions = interactions;

    // create dummy InteractionWrappedVecs and shove them onto the front
    if (_interactions != null) {
      _interactionVecs = new int[_interactions.length];
      train =
          Model.makeInteractions(
                  train,
                  false,
                  _interactions,
                  _useAllFactorLevels,
                  _skipMissing,
                  predictor_transform == TransformType.STANDARDIZE)
              .add(train);
      if (valid != null)
        valid =
            Model.makeInteractions(
                    valid,
                    true,
                    _interactions,
                    _useAllFactorLevels,
                    _skipMissing,
                    predictor_transform == TransformType.STANDARDIZE)
                .add(valid); // FIXME: should be using the training subs/muls!
    }

    _permutation = new int[train.numCols()];
    final Vec[] tvecs = train.vecs();

    // Count categorical-vs-numerical
    final int n = tvecs.length - _responses - (offset ? 1 : 0) - (weight ? 1 : 0) - (fold ? 1 : 0);
    int[] nums = MemoryManager.malloc4(n);
    int[] cats = MemoryManager.malloc4(n);
    int nnums = 0, ncats = 0;
    for (int i = 0; i < n; ++i)
      if (tvecs[i].isCategorical()) cats[ncats++] = i;
      else nums[nnums++] = i;

    _nums = nnums;
    _cats = ncats;
    _catLvls = new int[ncats][];

    // sort the cats in decreasing order of their cardinality
    for (int i = 0; i < ncats; ++i)
      for (int j = i + 1; j < ncats; ++j)
        if (tvecs[cats[i]].domain().length < tvecs[cats[j]].domain().length) {
          int x = cats[i];
          cats[i] = cats[j];
          cats[j] = x;
        }
    String[] names = new String[train.numCols()];
    Vec[] tvecs2 = new Vec[train.numCols()];

    // Compute the cardinality of each cat
    _catModes = new int[ncats];
    _catOffsets = MemoryManager.malloc4(ncats + 1);
    _catMissing = new boolean[ncats];
    int len = _catOffsets[0] = 0;
    int interactionIdx = 0; // simple index into the _interactionVecs array

    ArrayList<Integer> interactionIds;
    if (_interactions == null) {
      interactionIds = new ArrayList<>();
      for (int i = 0; i < tvecs.length; ++i)
        if (tvecs[i] instanceof InteractionWrappedVec) {
          interactionIds.add(i);
        }
      _interactionVecs = new int[interactionIds.size()];
      for (int i = 0; i < _interactionVecs.length; ++i) _interactionVecs[i] = interactionIds.get(i);
    }
    for (int i = 0; i < ncats; ++i) {
      names[i] = train._names[cats[i]];
      Vec v = (tvecs2[i] = tvecs[cats[i]]);
      _catMissing[i] = missingBucket; // needed for test time
      if (v instanceof InteractionWrappedVec) {
        if (_interactions != null) _interactions[interactionIdx].vecIdx = i;
        _interactionVecs[interactionIdx++] =
            i; // i (and not cats[i]) because this is the index in _adaptedFrame
        _catOffsets[i + 1] = (len += v.domain().length + (missingBucket ? 1 : 0));
      } else
        _catOffsets[i + 1] =
            (len +=
                v.domain().length
                    - (useAllFactorLevels ? 0 : 1)
                    + (missingBucket ? 1 : 0)); // missing values turn into a new factor level
      _catModes[i] =
          imputeMissing ? imputeCat(train.vec(cats[i])) : _catMissing[i] ? v.domain().length : -100;
      _permutation[i] = cats[i];
    }
    _numMeans = new double[nnums];
    _numOffsets = MemoryManager.malloc4(nnums + 1);
    _numOffsets[0] = len;
    boolean isIWV; // is InteractionWrappedVec?
    for (int i = 0; i < nnums; ++i) {
      names[i + ncats] = train._names[nums[i]];
      Vec v = train.vec(nums[i]);
      tvecs2[i + ncats] = v;
      isIWV = v instanceof InteractionWrappedVec;
      if (isIWV) {
        if (null != _interactions) _interactions[interactionIdx].vecIdx = i + ncats;
        _interactionVecs[interactionIdx++] = i + ncats;
      }
      _numOffsets[i + 1] = (len += (isIWV ? ((InteractionWrappedVec) v).expandedLength() : 1));
      _numMeans[i] = train.vec(nums[i]).mean();
      _permutation[i + ncats] = nums[i];
    }
    for (int i = names.length - nResponses - (weight ? 1 : 0) - (offset ? 1 : 0) - (fold ? 1 : 0);
        i < names.length;
        ++i) {
      names[i] = train._names[i];
      tvecs2[i] = train.vec(i);
    }
    _adaptedFrame = new Frame(names, tvecs2);
    train.restructure(names, tvecs2);
    if (valid != null) valid.restructure(names, valid.vecs(names));
    //    _adaptedFrame = train;

    setPredictorTransform(predictor_transform);
    if (_responses > 0) setResponseTransform(response_transform);
  }
Example #17
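 // Restrict this DataInfo to the given indices into the expanded (one-hot) column space: keep
 // only the selected factor levels and numeric columns, drop everything else from the adapted
 // frame, and subset the normalization constants to match.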
 public DataInfo filterExpandedColumns(int[] cols) {
   assert _predictor_transform != null;
   assert _response_transform != null;
   if (cols == null) return deep_clone();
   int hasIcpt = (cols.length > 0 && cols[cols.length - 1] == fullN()) ? 1 : 0;
   int i = 0, j = 0, ignoredCnt = 0;
   // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub,
   // double [] normMul, double [] normRespSub, double [] normRespMul){
   int[][] catLvls = new int[_cats][];
   int[] ignoredCols = MemoryManager.malloc4(_nums + _cats);
   // first do categoricals...
   if (_catOffsets != null) {
     int coff = _useAllFactorLevels ? 0 : 1;
     while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) {
       int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]);
       int k = 0;
       while (i < cols.length && cols[i] < _catOffsets[j + 1])
         levels[k++] = (cols[i++] - _catOffsets[j]) + coff;
       if (k > 0) catLvls[j] = Arrays.copyOf(levels, k);
       ++j;
     }
   }
   int[] catModes = _catModes;
   for (int k = 0; k < catLvls.length; ++k) if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
   if (ignoredCnt > 0) {
     int[][] cs = new int[_cats - ignoredCnt][];
     catModes = new int[_cats - ignoredCnt];
     int y = 0;
     for (int c = 0; c < catLvls.length; ++c)
       if (catLvls[c] != null) {
         catModes[y] = _catModes[c];
         cs[y++] = catLvls[c];
       }
     assert y == cs.length;
     catLvls = cs;
   }
   // now numerics
   int prev = j = 0;
   for (; i < cols.length; ++i) {
     for (int k = prev; k < (cols[i] - numStart()); ++k) {
       ignoredCols[ignoredCnt++] = k + _cats;
       ++j;
     }
     prev = ++j;
   }
   for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats;
   Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone());
   if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt));
   assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();
   double[] normSub = null;
   double[] normMul = null;
   int id = Arrays.binarySearch(cols, numStart());
   if (id < 0) id = -id - 1;
   int nnums = cols.length - id - hasIcpt;
   int off = numStart();
   if (_normSub != null) {
     normSub = new double[nnums];
     for (int k = id; k < (id + nnums); ++k) normSub[k - id] = _normSub[cols[k] - off];
   }
   if (_normMul != null) {
     normMul = new double[nnums];
     for (int k = id; k < (id + nnums); ++k) normMul[k - id] = _normMul[cols[k] - off];
   }
   // public DataInfo(Frame train, Frame valid, int nResponses, boolean useAllFactorLevels,
   // TransformType predictor_transform, TransformType response_transform, boolean skipMissing,
   // boolean imputeMissing, boolean missingBucket, boolean weight, boolean offset, boolean fold) {
   DataInfo dinfo = new DataInfo(this, f, normMul, normSub, catLvls, catModes);
   dinfo._activeCols = cols;
   return dinfo;
 }
Example #18
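 // Switch the chunk to a sparse representation holding nzeros entries: either re-compact an
 // already-sparse chunk (dropping zeros, or -1 markers for strings) or compress the dense arrays
 // in place while recording the surviving row numbers in _id.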
 protected void set_sparse(int nzeros) {
   if (sparseLen() == nzeros && _len != 0) return;
   if (_id != null) { // we have sparse representation but some 0s in it!
     int[] id = MemoryManager.malloc4(nzeros);
     int j = 0;
     if (_ds != null) {
       double[] ds = MemoryManager.malloc8d(nzeros);
       for (int i = 0; i < sparseLen(); ++i) {
         if (_ds[i] != 0) {
           ds[j] = _ds[i];
           id[j] = _id[i];
           ++j;
         }
       }
       _ds = ds;
     } else if (_is != null) {
       int[] is = MemoryManager.malloc4(nzeros);
       for (int i = 0; i < sparseLen(); i++) {
         if (_is[i] != -1) {
           is[j] = _is[i];
            id[j] = _id[i];
            ++j;
          }
        }
        _is = is;
     } else {
       long[] ls = MemoryManager.malloc8(nzeros);
       int[] xs = MemoryManager.malloc4(nzeros);
       for (int i = 0; i < sparseLen(); ++i) {
         if (_ls[i] != 0) {
           ls[j] = _ls[i];
           xs[j] = _xs[i];
           id[j] = _id[i];
           ++j;
         }
       }
       _ls = ls;
       _xs = xs;
     }
     _id = id;
     assert j == nzeros;
     set_sparseLen(nzeros);
     return;
   }
    assert sparseLen() == _len
        : "sparseLen() = " + sparseLen() + ", _len = " + _len + ", nzeros = " + nzeros;
   int zs = 0;
   if (_is != null) {
     assert nzeros < _is.length;
     _id = MemoryManager.malloc4(_is.length);
     for (int i = 0; i < sparseLen(); i++) {
       if (_is[i] == -1) zs++;
       else {
         _is[i - zs] = _is[i];
         _id[i - zs] = i;
       }
     }
   } else if (_ds == null) {
     if (_len == 0) {
       _ls = new long[0];
       _xs = new int[0];
       _id = new int[0];
       set_sparseLen(0);
       return;
     } else {
       assert nzeros < sparseLen();
       _id = alloc_indices(_ls.length);
       for (int i = 0; i < sparseLen(); ++i) {
         if (_ls[i] == 0 && _xs[i] == 0) ++zs;
         else {
           _ls[i - zs] = _ls[i];
           _xs[i - zs] = _xs[i];
           _id[i - zs] = i;
         }
       }
     }
   } else {
     assert nzeros < _ds.length;
     _id = alloc_indices(_ds.length);
     for (int i = 0; i < sparseLen(); ++i) {
       if (_ds[i] == 0) ++zs;
       else {
         _ds[i - zs] = _ds[i];
         _id[i - zs] = i;
       }
     }
   }
   assert zs == (sparseLen() - nzeros);
   set_sparseLen(nzeros);
 }
Example #19
  public DataInfo(
      Frame train,
      Frame valid,
      int nResponses,
      boolean useAllFactorLevels,
      TransformType predictor_transform,
      TransformType response_transform,
      boolean skipMissing,
      boolean imputeMissing,
      boolean missingBucket,
      boolean weight,
      boolean offset,
      boolean fold) {
    super(Key.<DataInfo>make());
    _valid = false;
    assert predictor_transform != null;
    assert response_transform != null;
    _offset = offset;
    _weights = weight;
    _fold = fold;
    assert !(skipMissing && imputeMissing) : "skipMissing and imputeMissing cannot both be true";
    _skipMissing = skipMissing;
    _imputeMissing = imputeMissing;
    _predictor_transform = predictor_transform;
    _response_transform = response_transform;
    _responses = nResponses;
    _useAllFactorLevels = useAllFactorLevels;
    _permutation = new int[train.numCols()];
    final Vec[] tvecs = train.vecs();

    // Count categorical-vs-numerical
    final int n = tvecs.length - _responses - (offset ? 1 : 0) - (weight ? 1 : 0) - (fold ? 1 : 0);
    int[] nums = MemoryManager.malloc4(n);
    int[] cats = MemoryManager.malloc4(n);
    int nnums = 0, ncats = 0;
    for (int i = 0; i < n; ++i)
      if (tvecs[i].isCategorical()) cats[ncats++] = i;
      else nums[nnums++] = i;
    _nums = nnums;
    _cats = ncats;
    _catLvls = new int[_cats][];

    // sort the cats in decreasing order of their cardinality
    for (int i = 0; i < ncats; ++i)
      for (int j = i + 1; j < ncats; ++j)
        if (tvecs[cats[i]].domain().length < tvecs[cats[j]].domain().length) {
          int x = cats[i];
          cats[i] = cats[j];
          cats[j] = x;
        }
    String[] names = new String[train.numCols()];
    Vec[] tvecs2 = new Vec[train.numCols()];

    // Compute the cardinality of each cat
    _catModes = new int[_cats];
    _catOffsets = MemoryManager.malloc4(ncats + 1);
    _catMissing = new int[ncats];
    int len = _catOffsets[0] = 0;
    for (int i = 0; i < ncats; ++i) {
      _catModes[i] = imputeCat(train.vec(cats[i]));
      _permutation[i] = cats[i];
      names[i] = train._names[cats[i]];
      Vec v = (tvecs2[i] = tvecs[cats[i]]);
      _catMissing[i] = missingBucket ? 1 : 0; // needed for test time
      _catOffsets[i + 1] =
          (len +=
              v.domain().length
                  - (useAllFactorLevels ? 0 : 1)
                  + (missingBucket ? 1 : 0)); // missing values turn into a new factor level
    }
    _numMeans = new double[_nums];
    for (int i = 0; i < _nums; ++i) {
      names[i + _cats] = train._names[nums[i]];
      tvecs2[i + _cats] = train.vec(nums[i]);
      _numMeans[i] = train.vec(nums[i]).mean();
      _permutation[i + _cats] = nums[i];
    }
    for (int i = names.length - nResponses - (weight ? 1 : 0) - (offset ? 1 : 0) - (fold ? 1 : 0);
        i < names.length;
        ++i) {
      names[i] = train._names[i];
      tvecs2[i] = train.vec(i);
    }
    _adaptedFrame = new Frame(names, tvecs2);
    train.restructure(names, tvecs2);
    if (valid != null) valid.restructure(names, valid.vecs(names));
    //    _adaptedFrame = train;

    setPredictorTransform(predictor_transform);
    if (_responses > 0) setResponseTransform(response_transform);
  }
Example #20
 int[] alloc_str_indices(int l) {
   return _is = MemoryManager.malloc4(l);
 }
Example #21
 int[] alloc_exponent(int l) {
   return _xs = MemoryManager.malloc4(l);
 }
Example #22
  /**
   * Extracts the values, applies normalization to the numerics, adds appropriate offsets to the
   * categoricals, and adapts the response according to the CaseMode/CaseValue if set.
   */
  @Override
  public final void map(Chunk[] chunks, NewChunk[] outputs) {
    if (_job != null && _job.self() != null && !Job.isRunning(_job.self()))
      throw new JobCancelledException();
    final int nrows = chunks[0]._len;
    final long offset = chunks[0]._start;
    chunkInit();
    double[] nums = MemoryManager.malloc8d(_dinfo._nums);
    int[] cats = MemoryManager.malloc4(_dinfo._cats);
    double[] response = MemoryManager.malloc8d(_dinfo._responses);
    int start = 0;
    int end = nrows;

    boolean contiguous = false;
    Random skip_rng = null; // random generator for skipping rows
    if (_useFraction < 1.0) {
      skip_rng = water.util.Utils.getDeterRNG(new Random().nextLong());
      if (contiguous) {
        final int howmany = (int) Math.ceil(_useFraction * nrows);
        if (howmany > 0) {
          start = skip_rng.nextInt(nrows - howmany);
          end = start + howmany;
        }
        assert (start < nrows);
        assert (end <= nrows);
      }
    }

    long[] shuf_map = null;
    if (_shuffle) {
      shuf_map = new long[end - start];
      for (int i = 0; i < shuf_map.length; ++i) shuf_map[i] = start + i;
      Utils.shuffleArray(shuf_map, new Random().nextLong());
    }

    OUTER:
    for (int rr = start; rr < end; ++rr) {
      final int r = shuf_map != null ? (int) shuf_map[rr - start] : rr;
      if ((_dinfo._nfolds > 0 && (r % _dinfo._nfolds) == _dinfo._foldId)
          || (skip_rng != null && skip_rng.nextFloat() > _useFraction)) continue;
      for (Chunk c : chunks) if (c.isNA0(r)) continue OUTER; // skip rows with NAs!
      int i = 0, ncats = 0;
      for (; i < _dinfo._cats; ++i) {
        int c = (int) chunks[i].at80(r);
        if (c != 0) cats[ncats++] = c + _dinfo._catOffsets[i] - 1;
      }
      final int n = chunks.length - _dinfo._responses;
      for (; i < n; ++i) {
        double d = chunks[i].at0(r);
        if (_dinfo._normMul != null)
          d = (d - _dinfo._normSub[i - _dinfo._cats]) * _dinfo._normMul[i - _dinfo._cats];
        nums[i - _dinfo._cats] = d;
      }
      for (i = 0; i < _dinfo._responses; ++i) {
        response[i] = chunks[chunks.length - _dinfo._responses + i].at0(r);
        if (_dinfo._normRespMul != null)
          response[i] = (response[i] - _dinfo._normRespSub[i]) * _dinfo._normRespMul[i];
      }
      if (outputs != null && outputs.length > 0)
        processRow(offset + r, nums, ncats, cats, response, outputs);
      else processRow(offset + r, nums, ncats, cats, response);
    }
    chunkDone();
  }
Example #23
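 // Same idea as the filterExpandedColumns variant above: restrict this DataInfo to the given
 // expanded column indices, dropping unused factor levels and numeric columns and subsetting the
 // normalization constants.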
 public DataInfo filterExpandedColumns(int[] cols) {
   assert _predictor_transform != null;
   assert _response_transform != null;
   if (cols == null) return this;
   int i = 0, j = 0, ignoredCnt = 0;
   // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub,
   // double [] normMul, double [] normRespSub, double [] normRespMul){
   int[][] catLvls = new int[_cats][];
   int[] ignoredCols = MemoryManager.malloc4(_nums + _cats);
   // first do categoricals...
   if (_catOffsets != null) {
     int coff = _useAllFactorLevels ? 0 : 1;
     while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) {
       int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]);
       int k = 0;
       while (i < cols.length && cols[i] < _catOffsets[j + 1])
         levels[k++] = cols[i++] - _catOffsets[j] + coff;
       if (k > 0) catLvls[j] = Arrays.copyOf(levels, k);
       ++j;
     }
   }
   for (int k = 0; k < catLvls.length; ++k) if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
   if (ignoredCnt > 0) {
     int[][] c = new int[_cats - ignoredCnt][];
     int y = 0;
     for (int[] catLvl : catLvls) if (catLvl != null) c[y++] = catLvl;
     assert y == c.length;
     catLvls = c;
   }
   // now numerics
   int prev = j = 0;
   for (; i < cols.length; ++i) {
     for (int k = prev; k < (cols[i] - numStart()); ++k) {
       ignoredCols[ignoredCnt++] = k + _cats;
       ++j;
     }
     prev = ++j;
   }
   for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats;
   Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone());
   if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt));
   assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();
   double[] normSub = null;
   double[] normMul = null;
   int id = Arrays.binarySearch(cols, numStart());
   if (id < 0) id = -id - 1;
   int nnums = cols.length - id;
   int off = numStart();
   if (_normSub != null) {
     normSub = new double[nnums];
     for (int k = id; k < cols.length; ++k) normSub[k - id] = _normSub[cols[k] - off];
   }
   if (_normMul != null) {
     normMul = new double[nnums];
     for (int k = id; k < cols.length; ++k) normMul[k - id] = _normMul[cols[k] - off];
   }
   DataInfo dinfo =
       new DataInfo(
           _key,
           f,
           normMul,
           normSub,
           catLvls,
           _responses,
           _predictor_transform,
           _response_transform,
           _skipMissing,
           _imputeMissing,
           _weights,
           _offset,
           _fold);
   // do not put activeData into K/V - active data is recreated on each node based on active
   // columns
   dinfo._activeCols = cols;
   return dinfo;
 }