protected void cancel_sparse() {
  if (sparseLen() != _len) {
    if (_is != null) {
      int[] is = MemoryManager.malloc4(_len);
      for (int i = 0; i < _len; i++) is[i] = -1;
      for (int i = 0; i < sparseLen(); i++) is[_id[i]] = _is[i];
      _is = is;
    } else if (_ds == null) {
      int[] xs = MemoryManager.malloc4(_len);
      long[] ls = MemoryManager.malloc8(_len);
      for (int i = 0; i < sparseLen(); ++i) {
        xs[_id[i]] = _xs[i];
        ls[_id[i]] = _ls[i];
      }
      _xs = xs;
      _ls = ls;
    } else {
      double[] ds = MemoryManager.malloc8d(_len);
      for (int i = 0; i < sparseLen(); ++i) ds[_id[i]] = _ds[i];
      _ds = ds;
    }
    set_sparseLen(_len);
  }
  _id = null;
}
public Row(boolean sparse, int nNums, int nBins, int nresponses, double etaOffset) {
  binIds = MemoryManager.malloc4(nBins);
  numVals = MemoryManager.malloc8d(nNums);
  response = MemoryManager.malloc8d(nresponses);
  if (sparse) numIds = MemoryManager.malloc4(nNums);
  this.etaOffset = etaOffset;
  this.nNums = sparse ? 0 : nNums;
}
public Row(boolean sparse, int nNums, int nBins, int nresponses, int i, long start) {
  binIds = MemoryManager.malloc4(nBins);
  numVals = MemoryManager.malloc8d(nNums);
  response = MemoryManager.malloc8d(nresponses);
  if (sparse) numIds = MemoryManager.malloc4(nNums);
  this.nNums = sparse ? 0 : nNums;
  cid = i;
  rid = start + i;
}
// Filter the current active columns using the strong rules.
// Note: the strong rules are updated so that they keep all previous coefficients in,
// to prevent issues with line-search.
private int[] activeCols(final double l1, final double l2, final double[] grad) {
  final double rhs = alpha[0] * (2 * l1 - l2);
  int[] cols = MemoryManager.malloc4(_dinfo.fullN());
  int selected = 0;
  int j = 0;
  if (_activeCols == null) _activeCols = new int[] {-1};
  for (int i = 0; i < _dinfo.fullN(); ++i)
    if ((j < _activeCols.length && i == _activeCols[j]) || grad[i] > rhs || grad[i] < -rhs) {
      cols[selected++] = i;
      if (j < _activeCols.length && i == _activeCols[j]) ++j;
    }
  if (!strong_rules_enabled || selected == _dinfo.fullN()) {
    _activeCols = null;
    _activeData._adaptedFrame = _dinfo._adaptedFrame;
    _activeData = _dinfo;
  } else {
    _activeCols = Arrays.copyOf(cols, selected);
    _activeData = _dinfo.filterExpandedColumns(_activeCols);
  }
  Log.info("GLM2 strong rule at lambda=" + l1 + ", got " + selected + " active cols out of "
      + _dinfo.fullN() + " total.");
  return _activeCols;
}
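// Illustrative sketch (not part of the original source): the strong-rule filter above keeps
// column i when |grad[i]| > alpha * (2*lambda_new - lambda_prev), plus every previously active
// column. A minimal standalone version of that predicate, with hypothetical names:
static int[] strongRuleFilter(double lambdaNew, double lambdaPrev, double alpha,
                              double[] grad, boolean[] wasActive) {
  final double rhs = alpha * (2 * lambdaNew - lambdaPrev);
  int[] cols = new int[grad.length];
  int selected = 0;
  for (int i = 0; i < grad.length; ++i)
    if (wasActive[i] || Math.abs(grad[i]) > rhs) // keep old actives to avoid line-search issues
      cols[selected++] = i;
  return java.util.Arrays.copyOf(cols, selected);
}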
// Slow-path append string
private void append2slowstr() {
  // In case of all NAs and then a string, convert NAs to string NAs
  if (_xs != null) {
    _xs = null;
    _ls = null;
    alloc_str_indices(sparseLen());
    Arrays.fill(_is, -1);
  }
  if (_is != null && _is.length > 0) {
    // Check for sparseness
    if (_id == null) {
      int nzs = 0; // assume one non-null for the element currently being stored
      for (int i : _is) if (i != -1) ++nzs;
      if ((nzs + 1) * _sparseRatio < _len) set_sparse(nzs);
    } else {
      if ((_sparseRatio * (_sparseLen) >> 1) > _len) cancel_sparse();
      else _id = MemoryManager.arrayCopyOf(_id, _sparseLen << 1);
    }
    _is = MemoryManager.arrayCopyOf(_is, sparseLen() << 1);
    /* initialize the memory extension with -1s */
    for (int i = sparseLen(); i < _is.length; i++) _is[i] = -1;
  } else {
    _is = MemoryManager.malloc4(4);
    /* initialize everything with -1s */
    for (int i = 0; i < _is.length; i++) _is[i] = -1;
    if (sparse()) alloc_indices(4);
  }
  assert sparseLen() == 0 || _is.length > sparseLen()
      : "_is.length = " + _is.length + ", sparseLen() = " + sparseLen();
}
public Submodel(double lambda, double[] beta, double[] norm_beta, long run_time, int iteration,
    boolean sparseCoef) {
  this.lambda_value = lambda;
  this.run_time = run_time;
  this.iteration = iteration;
  int r = 0;
  if (beta != null) {
    final double[] b = norm_beta != null ? norm_beta : beta;
    // grab the indices of non-zero coefficients
    for (double d : beta) if (d != 0) ++r;
    idxs = MemoryManager.malloc4(sparseCoef ? r : beta.length);
    int j = 0;
    for (int i = 0; i < beta.length; ++i) if (!sparseCoef || beta[i] != 0) idxs[j++] = i;
    j = 0;
    this.beta = MemoryManager.malloc8d(idxs.length);
    for (int i : idxs) this.beta[j++] = beta[i];
    if (norm_beta != null) {
      j = 0;
      this.norm_beta = MemoryManager.malloc8d(idxs.length);
      for (int i : idxs) this.norm_beta[j++] = norm_beta[i];
    }
  } else idxs = null;
  rank = r;
  this.sparseCoef = sparseCoef;
}
// Slow-path append data
private void append2slow() {
  if (_len > Vec.CHUNK_SZ) throw new ArrayIndexOutOfBoundsException(_len);
  assert _ds == null;
  if (_len2 == _len) { // Check for sparse-ness now & then
    int nzcnt = 0;
    for (int i = 0; i < _len; i++) {
      if (_ls[i] != 0) nzcnt++;
      if (_xs[i] != 0) { nzcnt = Vec.CHUNK_SZ; break; } // Only non-specials sparse
    }
    if (_len >= 32 && nzcnt * 8 <= _len) { // Heuristic for sparseness
      _len = 0;
      for (int i = 0; i < _len2; i++)
        if (_ls[i] != 0) {
          _xs[_len] = i;        // Row number in xs
          _ls[_len++] = _ls[i]; // Sparse value in ls
        }
      return; // Compressed, so lots of room now
    }
  }
  _xs = _ls == null ? MemoryManager.malloc4(4) : MemoryManager.arrayCopyOf(_xs, _len << 1);
  _ls = _ls == null ? MemoryManager.malloc8(4) : MemoryManager.arrayCopyOf(_ls, _len << 1);
}
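// Illustrative sketch (not in the original source): the heuristic above switches to the sparse
// encoding only once at least 32 rows have been seen and non-zeros are at most 1/8 of all rows
// (nzcnt * 8 <= _len). E.g. 100 rows with 12 non-zeros compresses; with 13 it stays dense:
static boolean worthCompressing(int len, int nzcnt) {
  return len >= 32 && nzcnt * 8 <= len; // density <= 12.5%
}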
private void cancel_sparse() {
  long[] ls = MemoryManager.malloc8(_len2 + 1);
  for (int i = 0; i < _len; i++) // Inflate ls to hold values
    ls[_xs[i]] = _ls[i];
  _ls = ls;
  _xs = MemoryManager.malloc4(_len2 + 1);
  _len = _len2; // Not compressed now!
}
// private constructor called by filterExpandedColumns
private DataInfo(DataInfo dinfo, Frame fr, double[] normMul, double[] normSub,
    int[][] catLevels, int[] catModes) {
  _fullCatOffsets = dinfo._catOffsets;
  if (!dinfo._useAllFactorLevels) {
    _fullCatOffsets = dinfo._catOffsets.clone();
    for (int i = 0; i < _fullCatOffsets.length; ++i)
      _fullCatOffsets[i] += i; // add for the skipped zeros
  }
  _offset = dinfo._offset;
  _weights = dinfo._weights;
  _fold = dinfo._fold;
  _valid = false;
  _interactions = dinfo._interactions;
  _interactionVecs = dinfo._interactionVecs;
  assert dinfo._predictor_transform != null;
  assert dinfo._response_transform != null;
  _predictor_transform = dinfo._predictor_transform;
  _response_transform = dinfo._response_transform;
  _skipMissing = dinfo._skipMissing;
  _imputeMissing = dinfo._imputeMissing;
  _adaptedFrame = fr;
  _catOffsets = MemoryManager.malloc4(catLevels.length + 1);
  _catMissing = new boolean[catLevels.length];
  Arrays.fill(_catMissing, !(dinfo._imputeMissing || dinfo._skipMissing));
  int s = 0;
  for (int i = 0; i < catLevels.length; ++i) {
    _catOffsets[i] = s;
    s += catLevels[i].length;
  }
  _catLvls = catLevels;
  _catOffsets[_catOffsets.length - 1] = s;
  _responses = dinfo._responses;
  _cats = catLevels.length;
  _nums = fr.numCols() - _cats - dinfo._responses
      - (_offset ? 1 : 0) - (_weights ? 1 : 0) - (_fold ? 1 : 0);
  _numOffsets = _nums == 0 ? new int[0] : dinfo._numOffsets.clone();
  int diff = _numOffsets.length > 0 ? _numOffsets[0] - s : 0;
  for (int i = 0; i < _numOffsets.length; i++) // need to shift everyone down by the offset!
    _numOffsets[i] -= diff;
  _useAllFactorLevels = true; // dinfo._useAllFactorLevels;
  _numMeans = new double[_nums];
  _normMul = normMul;
  _normSub = normSub;
  _catModes = catModes;
  for (int i = 0; i < _nums; i++) _numMeans[i] = _adaptedFrame.vec(_cats + i).mean();
}
public Row(boolean sparse, double[] numVals, int[] binIds, double[] response, double etaOffset) {
  int nNums = numVals == null ? 0 : numVals.length;
  this.numVals = numVals;
  if (sparse) numIds = MemoryManager.malloc4(nNums);
  this.etaOffset = etaOffset;
  this.nNums = sparse ? 0 : nNums;
  this.nBins = binIds == null ? 0 : binIds.length;
  this.binIds = binIds;
  this.response = response;
}
public Row(boolean sparse, double[] numVals, int[] binIds, double[] response, int i, long start) {
  int nNums = numVals == null ? 0 : numVals.length;
  this.numVals = numVals;
  if (sparse) numIds = MemoryManager.malloc4(nNums);
  this.nNums = sparse ? 0 : nNums;
  this.nBins = binIds == null ? 0 : binIds.length;
  this.binIds = binIds;
  this.response = response;
  cid = i;
  rid = start + i;
}
@Override
NewChunk inflate_impl(NewChunk nc) {
  double dx = Math.log10(_scale);
  assert DParseTask.fitsIntoInt(dx);
  Arrays.fill(nc._xs = MemoryManager.malloc4(_len), (int) dx);
  nc._ls = MemoryManager.malloc8(_len);
  for (int i = 0; i < _len; i++) {
    int res = UDP.get2(_mem, (i << 1) + OFF);
    if (res == C2Chunk._NA) nc.setNA_impl2(i);
    else nc._ls[i] = res + _bias;
  }
  return nc;
}
// private constructor called by filterExpandedColumns
private DataInfo(Key<DataInfo> selfKey, Frame fr, double[] normMul, double[] normSub,
    int[][] catLevels, int responses, TransformType predictor_transform,
    TransformType response_transform, boolean skipMissing, boolean imputeMissing,
    boolean weight, boolean offset, boolean fold) {
  super(selfKey);
  _offset = offset;
  _weights = weight;
  _fold = fold;
  _valid = false;
  assert predictor_transform != null;
  assert response_transform != null;
  _predictor_transform = predictor_transform;
  _response_transform = response_transform;
  _skipMissing = skipMissing;
  _imputeMissing = imputeMissing;
  _adaptedFrame = fr;
  _catOffsets = MemoryManager.malloc4(catLevels.length + 1);
  _catMissing = new int[catLevels.length];
  int s = 0;
  for (int i = 0; i < catLevels.length; ++i) {
    _catOffsets[i] = s;
    s += catLevels[i].length;
  }
  _catLvls = catLevels;
  _catOffsets[_catOffsets.length - 1] = s;
  _responses = responses;
  _cats = catLevels.length;
  _nums = fr.numCols() - _cats - responses
      - (_offset ? 1 : 0) - (_weights ? 1 : 0) - (_fold ? 1 : 0);
  _useAllFactorLevels = true;
  _catModes = new int[_cats];
  _numMeans = new double[_nums];
  _normMul = normMul;
  _normSub = normSub;
  for (int i = 0; i < _cats; i++) _catModes[i] = imputeCat(_adaptedFrame.vec(i));
  for (int i = 0; i < _nums; i++) _numMeans[i] = _adaptedFrame.vec(_cats + i).mean();
}
public Submodel(double lambda, double[] beta, int iteration, double devTrain, double devTest) {
  this.lambda_value = lambda;
  this.iteration = iteration;
  this.devianceTrain = devTrain;
  this.devianceTest = devTest;
  this.betaMultinomial = null;
  int r = 0;
  if (beta != null) {
    // grab the indices of non-zero coefficients
    for (int i = 0; i < beta.length; ++i) if (beta[i] != 0) ++r;
    if (r < beta.length) {
      idxs = MemoryManager.malloc4(r);
      int j = 0;
      for (int i = 0; i < beta.length; ++i) if (beta[i] != 0) idxs[j++] = i;
      this.beta = ArrayUtils.select(beta, idxs);
    } else {
      this.beta = beta.clone();
      idxs = null;
    }
  } else {
    this.beta = null;
    idxs = null;
  }
}
public DataInfo(Frame fr, int nResponses, boolean standardize, boolean standardize_response) {
  _nfolds = _foldId = 0;
  _standardize = standardize;
  _standardize_response = standardize_response;
  _responses = nResponses;
  final Vec[] vecs = fr.vecs();
  final int n = vecs.length - _responses;
  if (n < 1) throw new IllegalArgumentException("Training data must have at least one column.");
  int[] nums = MemoryManager.malloc4(n);
  int[] cats = MemoryManager.malloc4(n);
  int nnums = 0, ncats = 0;
  for (int i = 0; i < n; ++i) {
    if (vecs[i].isEnum()) cats[ncats++] = i;
    else nums[nnums++] = i;
  }
  _nums = nnums;
  _cats = ncats;
  // sort the cats in decreasing order of their cardinality
  for (int i = 0; i < ncats; ++i)
    for (int j = i + 1; j < ncats; ++j)
      if (vecs[cats[i]].domain().length < vecs[cats[j]].domain().length) {
        int x = cats[i];
        cats[i] = cats[j];
        cats[j] = x;
      }
  Vec[] vecs2 = vecs.clone();
  String[] names = fr._names.clone();
  _catOffsets = MemoryManager.malloc4(ncats + 1);
  int len = _catOffsets[0] = 0;
  for (int i = 0; i < ncats; ++i) {
    Vec v = (vecs2[i] = vecs[cats[i]]);
    names[i] = fr._names[cats[i]];
    _catOffsets[i + 1] = (len += v.domain().length - 1);
  }
  if (standardize) {
    _normSub = MemoryManager.malloc8d(nnums);
    _normMul = MemoryManager.malloc8d(nnums);
    Arrays.fill(_normMul, 1);
  } else _normSub = _normMul = null;
  for (int i = 0; i < nnums; ++i) {
    Vec v = (vecs2[i + ncats] = vecs[nums[i]]);
    names[i + ncats] = fr._names[nums[i]];
    if (standardize) {
      _normSub[i] = v.mean();
      _normMul[i] = v.sigma() != 0 ? 1.0 / v.sigma() : 1.0;
    }
  }
  if (standardize_response) {
    _normRespSub = MemoryManager.malloc8d(_responses);
    _normRespMul = MemoryManager.malloc8d(_responses);
    Arrays.fill(_normRespMul, 1);
  } else _normRespSub = _normRespMul = null;
  for (int i = 0; i < _responses; ++i) {
    Vec v = (vecs2[nnums + ncats + i] = vecs[nnums + ncats + i]);
    if (standardize_response) {
      _normRespSub[i] = v.mean();
      _normRespMul[i] = v.sigma() != 0 ? 1.0 / v.sigma() : 1.0;
      // Log.info("normalization for response[" + i + "]: mul " + _normRespMul[i] + ", sub " + _normRespSub[i]);
    }
  }
  _adaptedFrame = new Frame(names, vecs2);
  _adaptedFrame.reloadVecs();
}
/**
 * The train/valid Frame instances are sorted by categorical (themselves sorted by cardinality
 * greatest to least) with all numerical columns following. The response column(s) are placed
 * at the end.
 *
 * <p>Interactions:
 *   1. Num-Num (Note: N(0,1) * N(0,1) ~ N(0,1))
 *   2. Num-Enum
 *   3. Enum-Enum
 *
 * <p>Interactions are produced on the fly and are dense (in all 3 cases). Consumers of DataInfo
 * should not have to care how these interactions are generated. Any heuristic using the fullN
 * value should continue functioning the same.
 *
 * <p>Interactions are specified in two ways:
 *   A. As a list of pairs of column indices.
 *   B. As a list of pairs of column indices with limited enums.
 */
public DataInfo(Frame train, Frame valid, int nResponses, boolean useAllFactorLevels,
    TransformType predictor_transform, TransformType response_transform, boolean skipMissing,
    boolean imputeMissing, boolean missingBucket, boolean weight, boolean offset, boolean fold,
    Model.InteractionPair[] interactions) {
  super(Key.<DataInfo>make());
  _valid = valid != null;
  assert predictor_transform != null;
  assert response_transform != null;
  _offset = offset;
  _weights = weight;
  _fold = fold;
  assert !(skipMissing && imputeMissing) : "skipMissing and imputeMissing cannot both be true";
  _skipMissing = skipMissing;
  _imputeMissing = imputeMissing;
  _predictor_transform = predictor_transform;
  _response_transform = response_transform;
  _responses = nResponses;
  _useAllFactorLevels = useAllFactorLevels;
  _interactions = interactions;
  // create dummy InteractionWrappedVecs and shove them onto the front
  if (_interactions != null) {
    _interactionVecs = new int[_interactions.length];
    train = Model.makeInteractions(train, false, _interactions, _useAllFactorLevels,
        _skipMissing, predictor_transform == TransformType.STANDARDIZE).add(train);
    if (valid != null)
      valid = Model.makeInteractions(valid, true, _interactions, _useAllFactorLevels,
          _skipMissing, predictor_transform == TransformType.STANDARDIZE)
          .add(valid); // FIXME: should be using the training subs/muls!
  }
  _permutation = new int[train.numCols()];
  final Vec[] tvecs = train.vecs();
  // Count categorical-vs-numerical
  final int n = tvecs.length - _responses
      - (offset ? 1 : 0) - (weight ? 1 : 0) - (fold ? 1 : 0);
  int[] nums = MemoryManager.malloc4(n);
  int[] cats = MemoryManager.malloc4(n);
  int nnums = 0, ncats = 0;
  for (int i = 0; i < n; ++i)
    if (tvecs[i].isCategorical()) cats[ncats++] = i;
    else nums[nnums++] = i;
  _nums = nnums;
  _cats = ncats;
  _catLvls = new int[ncats][];
  // sort the cats in decreasing order of their cardinality
  for (int i = 0; i < ncats; ++i)
    for (int j = i + 1; j < ncats; ++j)
      if (tvecs[cats[i]].domain().length < tvecs[cats[j]].domain().length) {
        int x = cats[i];
        cats[i] = cats[j];
        cats[j] = x;
      }
  String[] names = new String[train.numCols()];
  Vec[] tvecs2 = new Vec[train.numCols()];
  // Compute the cardinality of each cat
  _catModes = new int[ncats];
  _catOffsets = MemoryManager.malloc4(ncats + 1);
  _catMissing = new boolean[ncats];
  int len = _catOffsets[0] = 0;
  int interactionIdx = 0; // simple index into the _interactionVecs array
  ArrayList<Integer> interactionIds;
  if (_interactions == null) {
    interactionIds = new ArrayList<>();
    for (int i = 0; i < tvecs.length; ++i)
      if (tvecs[i] instanceof InteractionWrappedVec) {
        interactionIds.add(i);
      }
    _interactionVecs = new int[interactionIds.size()];
    for (int i = 0; i < _interactionVecs.length; ++i)
      _interactionVecs[i] = interactionIds.get(i);
  }
  for (int i = 0; i < ncats; ++i) {
    names[i] = train._names[cats[i]];
    Vec v = (tvecs2[i] = tvecs[cats[i]]);
    _catMissing[i] = missingBucket; // needed for test time
    if (v instanceof InteractionWrappedVec) {
      if (_interactions != null) _interactions[interactionIdx].vecIdx = i;
      _interactionVecs[interactionIdx++] = i; // i (and not cats[i]) because this is the index in _adaptedFrame
      _catOffsets[i + 1] = (len += v.domain().length + (missingBucket ? 1 : 0));
    } else
      _catOffsets[i + 1] = (len += v.domain().length - (useAllFactorLevels ? 0 : 1)
          + (missingBucket ? 1 : 0)); // missing values turn into a new factor level
    _catModes[i] = imputeMissing ? imputeCat(train.vec(cats[i]))
        : _catMissing[i] ? v.domain().length : -100;
    _permutation[i] = cats[i];
  }
  _numMeans = new double[nnums];
  _numOffsets = MemoryManager.malloc4(nnums + 1);
  _numOffsets[0] = len;
  boolean isIWV; // is InteractionWrappedVec?
  for (int i = 0; i < nnums; ++i) {
    names[i + ncats] = train._names[nums[i]];
    Vec v = train.vec(nums[i]);
    tvecs2[i + ncats] = v;
    isIWV = v instanceof InteractionWrappedVec;
    if (isIWV) {
      if (null != _interactions) _interactions[interactionIdx].vecIdx = i + ncats;
      _interactionVecs[interactionIdx++] = i + ncats;
    }
    _numOffsets[i + 1] = (len += (isIWV ? ((InteractionWrappedVec) v).expandedLength() : 1));
    _numMeans[i] = train.vec(nums[i]).mean();
    _permutation[i + ncats] = nums[i];
  }
  for (int i = names.length - nResponses - (weight ? 1 : 0) - (offset ? 1 : 0) - (fold ? 1 : 0);
      i < names.length; ++i) {
    names[i] = train._names[i];
    tvecs2[i] = train.vec(i);
  }
  _adaptedFrame = new Frame(names, tvecs2);
  train.restructure(names, tvecs2);
  if (valid != null) valid.restructure(names, valid.vecs(names));
  // _adaptedFrame = train;
  setPredictorTransform(predictor_transform);
  if (_responses > 0) setResponseTransform(response_transform);
}
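// Illustrative sketch (hypothetical helper, not part of DataInfo): after the constructor above,
// the adapted frame is laid out as [categoricals sorted by decreasing cardinality][numerics]
// [weight][offset][fold][responses], and _catOffsets records where each categorical lands in
// the expanded (one-hot) column space. A minimal offset computation under those assumptions,
// for the non-interaction case:
static int[] expandedCatOffsets(int[] cardinality, boolean useAllFactorLevels,
                                boolean missingBucket) {
  int[] offsets = new int[cardinality.length + 1];
  int len = 0;
  for (int i = 0; i < cardinality.length; ++i) {
    offsets[i] = len;
    // drop the reference level unless all factor levels are used; NA may add one extra level
    len += cardinality[i] - (useAllFactorLevels ? 0 : 1) + (missingBucket ? 1 : 0);
  }
  offsets[cardinality.length] = len; // == start of the numeric block (numStart())
  return offsets;
}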
public DataInfo filterExpandedColumns(int[] cols) {
  assert _predictor_transform != null;
  assert _response_transform != null;
  if (cols == null) return deep_clone();
  int hasIcpt = (cols.length > 0 && cols[cols.length - 1] == fullN()) ? 1 : 0;
  int i = 0, j = 0, ignoredCnt = 0;
  // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub,
  //   double [] normMul, double [] normRespSub, double [] normRespMul){
  int[][] catLvls = new int[_cats][];
  int[] ignoredCols = MemoryManager.malloc4(_nums + _cats);
  // first do categoricals...
  if (_catOffsets != null) {
    int coff = _useAllFactorLevels ? 0 : 1;
    while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) {
      int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]);
      int k = 0;
      while (i < cols.length && cols[i] < _catOffsets[j + 1])
        levels[k++] = (cols[i++] - _catOffsets[j]) + coff;
      if (k > 0) catLvls[j] = Arrays.copyOf(levels, k);
      ++j;
    }
  }
  int[] catModes = _catModes;
  for (int k = 0; k < catLvls.length; ++k)
    if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
  if (ignoredCnt > 0) {
    int[][] cs = new int[_cats - ignoredCnt][];
    catModes = new int[_cats - ignoredCnt];
    int y = 0;
    for (int c = 0; c < catLvls.length; ++c)
      if (catLvls[c] != null) {
        catModes[y] = _catModes[c];
        cs[y++] = catLvls[c];
      }
    assert y == cs.length;
    catLvls = cs;
  }
  // now numerics
  int prev = j = 0;
  for (; i < cols.length; ++i) {
    for (int k = prev; k < (cols[i] - numStart()); ++k) {
      ignoredCols[ignoredCnt++] = k + _cats;
      ++j;
    }
    prev = ++j;
  }
  for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats;
  Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone());
  if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt));
  assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();
  double[] normSub = null;
  double[] normMul = null;
  int id = Arrays.binarySearch(cols, numStart());
  if (id < 0) id = -id - 1;
  int nnums = cols.length - id - hasIcpt;
  int off = numStart();
  if (_normSub != null) {
    normSub = new double[nnums];
    for (int k = id; k < (id + nnums); ++k) normSub[k - id] = _normSub[cols[k] - off];
  }
  if (_normMul != null) {
    normMul = new double[nnums];
    for (int k = id; k < (id + nnums); ++k) normMul[k - id] = _normMul[cols[k] - off];
  }
  // public DataInfo(Frame train, Frame valid, int nResponses, boolean useAllFactorLevels,
  //   TransformType predictor_transform, TransformType response_transform, boolean skipMissing,
  //   boolean imputeMissing, boolean missingBucket, boolean weight, boolean offset, boolean fold) {
  DataInfo dinfo = new DataInfo(this, f, normMul, normSub, catLvls, catModes);
  dinfo._activeCols = cols;
  return dinfo;
}
protected void set_sparse(int nzeros) {
  if (sparseLen() == nzeros && _len != 0) return;
  if (_id != null) { // we have sparse representation but some 0s in it!
    int[] id = MemoryManager.malloc4(nzeros);
    int j = 0;
    if (_ds != null) {
      double[] ds = MemoryManager.malloc8d(nzeros);
      for (int i = 0; i < sparseLen(); ++i) {
        if (_ds[i] != 0) {
          ds[j] = _ds[i];
          id[j] = _id[i];
          ++j;
        }
      }
      _ds = ds;
    } else if (_is != null) {
      int[] is = MemoryManager.malloc4(nzeros);
      for (int i = 0; i < sparseLen(); i++) {
        if (_is[i] != -1) {
          is[j] = _is[i];
          id[j] = _id[i]; // copy the old row id, not a slot of the freshly allocated array
          ++j;
        }
      }
      _is = is;
    } else {
      long[] ls = MemoryManager.malloc8(nzeros);
      int[] xs = MemoryManager.malloc4(nzeros);
      for (int i = 0; i < sparseLen(); ++i) {
        if (_ls[i] != 0) {
          ls[j] = _ls[i];
          xs[j] = _xs[i];
          id[j] = _id[i];
          ++j;
        }
      }
      _ls = ls;
      _xs = xs;
    }
    _id = id;
    assert j == nzeros;
    set_sparseLen(nzeros);
    return;
  }
  assert sparseLen() == _len
      : "sparseLen() = " + sparseLen() + ", _len = " + _len + ", nzeros = " + nzeros;
  int zs = 0;
  if (_is != null) {
    assert nzeros < _is.length;
    _id = MemoryManager.malloc4(_is.length);
    for (int i = 0; i < sparseLen(); i++) {
      if (_is[i] == -1) zs++;
      else {
        _is[i - zs] = _is[i];
        _id[i - zs] = i;
      }
    }
  } else if (_ds == null) {
    if (_len == 0) {
      _ls = new long[0];
      _xs = new int[0];
      _id = new int[0];
      set_sparseLen(0);
      return;
    } else {
      assert nzeros < sparseLen();
      _id = alloc_indices(_ls.length);
      for (int i = 0; i < sparseLen(); ++i) {
        if (_ls[i] == 0 && _xs[i] == 0) ++zs;
        else {
          _ls[i - zs] = _ls[i];
          _xs[i - zs] = _xs[i];
          _id[i - zs] = i;
        }
      }
    }
  } else {
    assert nzeros < _ds.length;
    _id = alloc_indices(_ds.length);
    for (int i = 0; i < sparseLen(); ++i) {
      if (_ds[i] == 0) ++zs;
      else {
        _ds[i - zs] = _ds[i];
        _id[i - zs] = i;
      }
    }
  }
  assert zs == (sparseLen() - nzeros);
  set_sparseLen(nzeros);
}
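// Illustrative sketch (not in the original source): the dense->sparse branches above compact
// non-zero values toward the front and record each value's original row number in _id. A
// standalone equivalent for a double column, with hypothetical names:
static int compactDense(double[] vals, int[] ids) {
  int nz = 0;
  for (int i = 0; i < vals.length; ++i)
    if (vals[i] != 0) {
      vals[nz] = vals[i]; // keep the non-zero value
      ids[nz] = i;        // remember which row it came from
      ++nz;
    }
  return nz; // the new sparse length
}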
public DataInfo(Frame train, Frame valid, int nResponses, boolean useAllFactorLevels,
    TransformType predictor_transform, TransformType response_transform, boolean skipMissing,
    boolean imputeMissing, boolean missingBucket, boolean weight, boolean offset, boolean fold) {
  super(Key.<DataInfo>make());
  _valid = false;
  assert predictor_transform != null;
  assert response_transform != null;
  _offset = offset;
  _weights = weight;
  _fold = fold;
  assert !(skipMissing && imputeMissing) : "skipMissing and imputeMissing cannot both be true";
  _skipMissing = skipMissing;
  _imputeMissing = imputeMissing;
  _predictor_transform = predictor_transform;
  _response_transform = response_transform;
  _responses = nResponses;
  _useAllFactorLevels = useAllFactorLevels;
  _permutation = new int[train.numCols()];
  final Vec[] tvecs = train.vecs();
  // Count categorical-vs-numerical
  final int n = tvecs.length - _responses
      - (offset ? 1 : 0) - (weight ? 1 : 0) - (fold ? 1 : 0);
  int[] nums = MemoryManager.malloc4(n);
  int[] cats = MemoryManager.malloc4(n);
  int nnums = 0, ncats = 0;
  for (int i = 0; i < n; ++i)
    if (tvecs[i].isCategorical()) cats[ncats++] = i;
    else nums[nnums++] = i;
  _nums = nnums;
  _cats = ncats;
  _catLvls = new int[_cats][];
  // sort the cats in decreasing order of their cardinality
  for (int i = 0; i < ncats; ++i)
    for (int j = i + 1; j < ncats; ++j)
      if (tvecs[cats[i]].domain().length < tvecs[cats[j]].domain().length) {
        int x = cats[i];
        cats[i] = cats[j];
        cats[j] = x;
      }
  String[] names = new String[train.numCols()];
  Vec[] tvecs2 = new Vec[train.numCols()];
  // Compute the cardinality of each cat
  _catModes = new int[_cats];
  _catOffsets = MemoryManager.malloc4(ncats + 1);
  _catMissing = new int[ncats];
  int len = _catOffsets[0] = 0;
  for (int i = 0; i < ncats; ++i) {
    _catModes[i] = imputeCat(train.vec(cats[i]));
    _permutation[i] = cats[i];
    names[i] = train._names[cats[i]];
    Vec v = (tvecs2[i] = tvecs[cats[i]]);
    _catMissing[i] = missingBucket ? 1 : 0; // needed for test time
    _catOffsets[i + 1] = (len += v.domain().length - (useAllFactorLevels ? 0 : 1)
        + (missingBucket ? 1 : 0)); // missing values turn into a new factor level
  }
  _numMeans = new double[_nums];
  for (int i = 0; i < _nums; ++i) {
    names[i + _cats] = train._names[nums[i]];
    tvecs2[i + _cats] = train.vec(nums[i]);
    _numMeans[i] = train.vec(nums[i]).mean();
    _permutation[i + _cats] = nums[i];
  }
  for (int i = names.length - nResponses - (weight ? 1 : 0) - (offset ? 1 : 0) - (fold ? 1 : 0);
      i < names.length; ++i) {
    names[i] = train._names[i];
    tvecs2[i] = train.vec(i);
  }
  _adaptedFrame = new Frame(names, tvecs2);
  train.restructure(names, tvecs2);
  if (valid != null) valid.restructure(names, valid.vecs(names));
  // _adaptedFrame = train;
  setPredictorTransform(predictor_transform);
  if (_responses > 0) setResponseTransform(response_transform);
}
int[] alloc_str_indices(int l) { return _is = MemoryManager.malloc4(l); }
int[] alloc_exponent(int l) { return _xs = MemoryManager.malloc4(l); }
/**
 * Extracts the values, applies standardization to numerics, adds appropriate offsets to
 * categoricals, and adapts response according to the CaseMode/CaseValue if set.
 */
@Override
public final void map(Chunk[] chunks, NewChunk[] outputs) {
  if (_job != null && _job.self() != null && !Job.isRunning(_job.self()))
    throw new JobCancelledException();
  final int nrows = chunks[0]._len;
  final long offset = chunks[0]._start;
  chunkInit();
  double[] nums = MemoryManager.malloc8d(_dinfo._nums);
  int[] cats = MemoryManager.malloc4(_dinfo._cats);
  double[] response = MemoryManager.malloc8d(_dinfo._responses);
  int start = 0;
  int end = nrows;
  boolean contiguous = false;
  Random skip_rng = null; // random generator for skipping rows
  if (_useFraction < 1.0) {
    skip_rng = water.util.Utils.getDeterRNG(new Random().nextLong());
    if (contiguous) {
      final int howmany = (int) Math.ceil(_useFraction * nrows);
      if (howmany > 0) {
        start = skip_rng.nextInt(nrows - howmany);
        end = start + howmany;
      }
      assert (start < nrows);
      assert (end <= nrows);
    }
  }
  long[] shuf_map = null;
  if (_shuffle) {
    shuf_map = new long[end - start];
    for (int i = 0; i < shuf_map.length; ++i) shuf_map[i] = start + i;
    Utils.shuffleArray(shuf_map, new Random().nextLong());
  }
  OUTER:
  for (int rr = start; rr < end; ++rr) {
    final int r = shuf_map != null ? (int) shuf_map[rr - start] : rr;
    if ((_dinfo._nfolds > 0 && (r % _dinfo._nfolds) == _dinfo._foldId)
        || (skip_rng != null && skip_rng.nextFloat() > _useFraction)) continue;
    for (Chunk c : chunks) if (c.isNA0(r)) continue OUTER; // skip rows with NAs!
    int i = 0, ncats = 0;
    for (; i < _dinfo._cats; ++i) {
      int c = (int) chunks[i].at80(r);
      if (c != 0) cats[ncats++] = c + _dinfo._catOffsets[i] - 1;
    }
    final int n = chunks.length - _dinfo._responses;
    for (; i < n; ++i) {
      double d = chunks[i].at0(r);
      if (_dinfo._normMul != null)
        d = (d - _dinfo._normSub[i - _dinfo._cats]) * _dinfo._normMul[i - _dinfo._cats];
      nums[i - _dinfo._cats] = d;
    }
    for (i = 0; i < _dinfo._responses; ++i) {
      response[i] = chunks[chunks.length - _dinfo._responses + i].at0(r);
      if (_dinfo._normRespMul != null)
        response[i] = (response[i] - _dinfo._normRespSub[i]) * _dinfo._normRespMul[i];
    }
    if (outputs != null && outputs.length > 0)
      processRow(offset + r, nums, ncats, cats, response, outputs);
    else processRow(offset + r, nums, ncats, cats, response);
  }
  chunkDone();
}
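// Illustrative sketch (not in the original source): the numeric path above applies the usual
// standardization d' = (d - mean) * (1 / sigma), with _normSub holding the means and _normMul
// the reciprocal sigmas. A standalone equivalent, guarding constant columns the same way the
// DataInfo constructors do:
static double standardize(double d, double mean, double sigma) {
  return (d - mean) * (sigma != 0 ? 1.0 / sigma : 1.0); // constant column: shift only
}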
public DataInfo filterExpandedColumns(int[] cols) {
  assert _predictor_transform != null;
  assert _response_transform != null;
  if (cols == null) return this;
  int i = 0, j = 0, ignoredCnt = 0;
  // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub,
  //   double [] normMul, double [] normRespSub, double [] normRespMul){
  int[][] catLvls = new int[_cats][];
  int[] ignoredCols = MemoryManager.malloc4(_nums + _cats);
  // first do categoricals...
  if (_catOffsets != null) {
    int coff = _useAllFactorLevels ? 0 : 1;
    while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) {
      int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]);
      int k = 0;
      while (i < cols.length && cols[i] < _catOffsets[j + 1])
        levels[k++] = cols[i++] - _catOffsets[j] + coff;
      if (k > 0) catLvls[j] = Arrays.copyOf(levels, k);
      ++j;
    }
  }
  for (int k = 0; k < catLvls.length; ++k)
    if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
  if (ignoredCnt > 0) {
    int[][] c = new int[_cats - ignoredCnt][];
    int y = 0;
    for (int[] catLvl : catLvls) if (catLvl != null) c[y++] = catLvl;
    assert y == c.length;
    catLvls = c;
  }
  // now numerics
  int prev = j = 0;
  for (; i < cols.length; ++i) {
    for (int k = prev; k < (cols[i] - numStart()); ++k) {
      ignoredCols[ignoredCnt++] = k + _cats;
      ++j;
    }
    prev = ++j;
  }
  for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats;
  Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone());
  if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt));
  assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();
  double[] normSub = null;
  double[] normMul = null;
  int id = Arrays.binarySearch(cols, numStart());
  if (id < 0) id = -id - 1;
  int nnums = cols.length - id;
  int off = numStart();
  if (_normSub != null) {
    normSub = new double[nnums];
    for (int k = id; k < cols.length; ++k) normSub[k - id] = _normSub[cols[k] - off];
  }
  if (_normMul != null) {
    normMul = new double[nnums];
    for (int k = id; k < cols.length; ++k) normMul[k - id] = _normMul[cols[k] - off];
  }
  DataInfo dinfo = new DataInfo(_key, f, normMul, normSub, catLvls, _responses,
      _predictor_transform, _response_transform, _skipMissing, _imputeMissing,
      _weights, _offset, _fold);
  // do not put activeData into K/V - active data is recreated on each node based on active columns
  dinfo._activeCols = cols;
  return dinfo;
}