Beispiel #1
0
 /**
  * Creates a column and allocates its raw value buffer.
  *
  * <p>The column starts out flagged as neither a byte column nor an ignored column.
  *
  * @param s name stored for this column
  * @param rows size passed to {@code MemoryManager.malloc4f} for the raw buffer (presumably the
  *     number of rows the column will hold; confirm against callers)
  * @param isClass value stored in the {@code _isClass} flag
  * @param binLimit value stored as the column's bin limit ({@code _colBinLimit})
  * @param isFloat value stored in the {@code _isFloat} flag
  */
 Col(String s, int rows, boolean isClass, int binLimit, boolean isFloat) {
   // Fixed initial state.
   _isByte = false;
   _ignored = false;

   // State taken from the arguments.
   _name = s;
   _isClass = isClass;
   _isFloat = isFloat;
   _colBinLimit = binLimit;

   // Backing storage for the raw values.
   _raw = MemoryManager.malloc4f(rows);
 }
Beispiel #2
0
 /**
  * Encodes the raw floats of this column as short bin indices.
  *
  * <p>Steps:
  *
  * <ol>
  *   <li>Byte columns are left untouched; their arity is fixed to 256.
  *   <li>A sorted copy of the raw values is used to count bad values (see {@code isBadRaw}) and
  *       duplicates. The code relies on bad values being sorted to the end of the array, which
  *       holds for {@code Float.NaN} under {@code Arrays.sort}.
  *   <li>If the column contains no valid value it is marked as ignored and its raw data dropped.
  *   <li>Otherwise the distinct valid values are grouped into at most {@code _colBinLimit} bins;
  *       {@code _binned2raw[b]} holds the largest raw value that fell into bin {@code b}.
  *   <li>Every raw value is replaced by its bin index in {@code _binned} (bad values become
  *       {@code BAD}), {@code _min}/{@code _max}/{@code _arity} are recorded and {@code _raw} is
  *       released.
  * </ol>
  */
 void shrink() {
   if (_isByte) {
     _arity = 256;
     return; // do not shrink byte columns
   }
   float[] vs = _raw.clone();
   Arrays.sort(vs); // Sort puts all Float.NaN at the end of the array (see Arrays.sort doc)

   // Number of valid values == index of the first bad value. The whole prefix is scanned so
   // that a single bad value directly following a run of duplicates is not missed, and so that
   // a one-element column is inspected as well.
   int valid = 0;
   while (valid < vs.length && !isBadRaw(vs[valid])) {
     ++valid;
   }
   int nans = vs.length - valid;

   // In a sorted array duplicates are adjacent: count every repeat within the valid prefix.
   int ndups = 0;
   for (int k = 1; k < valid; ++k) {
     if (vs[k] == vs[k - 1]) {
       ++ndups;
     }
   }

   _invalidValues = nans;
   if (vs.length <= nans) {
     // no valid value in the column => ignore it
     _ignored = true;
     _raw = null;
     Log.info(Sys.RANDF, "Ignore column: " + this);
     return;
   }

   // n = number of distinct valid values (at least 1 at this point).
   int n = vs.length - ndups - nans;
   int rem = n % _colBinLimit;
   // When n exceeds the limit, the first 'rem' bins take one extra value so that all
   // n distinct values fit into exactly _colBinLimit bins.
   int maxBinSize = (n > _colBinLimit) ? (n / _colBinLimit + Math.min(rem, 1)) : 1;
   // Assign shorts to floats, with binning.
   // If n is smaller than the bin limit there is no need to compact.
   _binned2raw = MemoryManager.malloc4f(Math.min(n, _colBinLimit));
   int smax = 0, cntCurBin = 1;
   int i = 0;
   _binned2raw[0] = vs[i];
   for (; i < vs.length; ++i) {
     if (isBadRaw(vs[i])) {
       break; // the first bad value; only bad values remain in vs[]
     }
     if (vs[i] == _binned2raw[smax]) {
       continue; // remove dups
     }
     if (++cntCurBin > maxBinSize) {
       if (rem > 0 && --rem == 0) {
         --maxBinSize; // all oversized bins are used up, continue with the smaller size
       }
       ++smax;
       cntCurBin = 1;
     }
     // Keep overwriting: the slot ends up holding the largest value of the bin.
     _binned2raw[smax] = vs[i];
   }
   ++smax;

   // Bad values are at the end of vs => min is vs[0], max is the last valid slot.
   _min = vs[0];
   _max = vs[valid - 1];
   vs = null; // GCed

   _binned = MemoryManager.malloc2(_raw.length);
   // Find the bin by lookup in _binned2raw, which is sorted so we can do a binary search.
   for (i = 0; i < _raw.length; i++) {
     if (isBadRaw(_raw[i])) {
       _binned[i] = BAD;
     } else {
       short idx = (short) Arrays.binarySearch(_binned2raw, _raw[i]);
       if (idx >= 0) {
         _binned[i] = idx;
       } else {
         // Value is not a bin representative: (-idx - 1) is the insertion point, i.e. the index
         // of the first representative greater than the value, which is the bin containing it.
         _binned[i] = (short) (-idx - 1);
       }
       assert _binned[i] < _binned2raw.length;
     }
   }
   if (n > _colBinLimit) {
     Log.info(Sys.RANDF, this + " this column's arity was cut from " + n + " to " + smax);
   }
   _arity = _binned2raw.length;
   _raw = null; // GCced
 }