/**
 * Creates a column descriptor and allocates its raw value buffer.
 *
 * @param s        value stored as the column name
 * @param rows     size passed to {@code MemoryManager.malloc4f} for the raw buffer
 * @param isClass  stored in {@code _isClass}
 * @param binLimit stored in {@code _colBinLimit}; used by {@code shrink()} as the bin-count cap
 * @param isFloat  stored in {@code _isFloat}
 */
Col(String s, int rows, boolean isClass, int binLimit, boolean isFloat) {
  // Descriptive attributes supplied by the caller.
  this._name = s;
  this._isClass = isClass;
  this._isFloat = isFloat;
  this._colBinLimit = binLimit;

  // A column built through this constructor starts as a non-byte, non-ignored column.
  this._isByte = false;
  this._ignored = false;

  // Raw values are collected here until shrink() replaces them with binned values.
  this._raw = MemoryManager.malloc4f(rows);
}
/** For all columns - encode all floats as unique shorts. */ void shrink() { if (_isByte) { _arity = 256; return; // do not shrink byte columns } float[] vs = _raw.clone(); Arrays.sort(vs); // Sort puts all Float.NaN at the end of the array (according Float.NaN doc) int ndups = 0, i = 0, nans = 0; // Counter of all NaNs while (i < vs.length - 1) { // count dups int j = i + 1; if (isBadRaw(vs[i])) { nans = vs.length - i; break; } // skip all NaNs if (isBadRaw(vs[j])) { nans = vs.length - j; break; } // there is only one remaining NaN (do not forget on it) while (j < vs.length && vs[i] == vs[j]) { ++ndups; ++j; } i = j; } _invalidValues = nans; if (vs.length <= nans) { // to many NaNs in the column => ignore it _ignored = true; _raw = null; Log.info(Sys.RANDF, "Ignore column: " + this); return; } int n = vs.length - ndups - nans; int rem = n % _colBinLimit; int maxBinSize = (n > _colBinLimit) ? (n / _colBinLimit + Math.min(rem, 1)) : 1; // Assign shorts to floats, with binning. _binned2raw = MemoryManager.malloc4f( Math.min(n, _colBinLimit)); // if n is smaller than bin limit no need to compact int smax = 0, cntCurBin = 1; i = 0; _binned2raw[0] = vs[i]; for (; i < vs.length; ++i) { if (isBadRaw(vs[i])) break; // the first NaN, there are only NaN in the rest of vs[] array if (vs[i] == _binned2raw[smax]) continue; // remove dups if (++cntCurBin > maxBinSize) { if (rem > 0 && --rem == 0) --maxBinSize; // check if we can reduce the bin size ++smax; cntCurBin = 1; } _binned2raw[smax] = vs[i]; } ++smax; // for(i = 0; i< vs.length; i++) if (!isBadRaw(vs[i])) break; // All Float.NaN are at the end of vs => min is stored in vs[0] _min = vs[0]; for (i = vs.length - 1; i >= 0; i--) if (!isBadRaw(vs[i])) break; _max = vs[i]; vs = null; // GCed _binned = MemoryManager.malloc2(_raw.length); // Find the bin value by lookup in bin2raw array which is sorted so we can do binary lookup. 
for (i = 0; i < _raw.length; i++) if (isBadRaw(_raw[i])) _binned[i] = BAD; else { short idx = (short) Arrays.binarySearch(_binned2raw, _raw[i]); if (idx >= 0) _binned[i] = idx; else _binned[i] = (short) (-idx - 1); // this occurs when we are looking for a binned value, we return the // smaller value in the array. assert _binned[i] < _binned2raw.length; } if (n > _colBinLimit) Log.info(Sys.RANDF, this + " this column's arity was cut from " + n + " to " + smax); _arity = _binned2raw.length; _raw = null; // GCced }