Exemple #1
0
  /**
   * Slow-path append for string data: makes room in the string-offset array {@code _is} before
   * another string is stored.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>If numeric exponent storage ({@code _xs}) exists, it and {@code _ls} are dropped and a
   *       fresh {@code _is} of {@code sparseLen()} slots is allocated and filled with -1 (the
   *       value this code uses for "no string").
   *   <li>If {@code _is} already has slots, the sparse/dense decision is re-evaluated and
   *       {@code _is} is re-sized to {@code sparseLen() << 1}; the newly added tail is set to -1.
   *   <li>Otherwise a 4-slot {@code _is} filled with -1 is allocated, plus 4 index slots when
   *       {@code sparse()} is true.
   * </ol>
   */
  private void append2slowstr() {
    // In case of all NAs and then a string, convert NAs to string NAs
    if (_xs != null) {
      _xs = null;
      _ls = null;
      alloc_str_indices(sparseLen());
      Arrays.fill(_is, -1);
    }

    if (_is != null && _is.length > 0) {
      // Check for sparseness
      if (_id == null) {
        int nzs = 0;
        for (int offset : _is) {
          if (offset != -1) {
            ++nzs;
          }
        }
        // The "+ 1" assumes one more non-null: the element currently being stored.
        if ((nzs + 1) * _sparseRatio < _len) {
          set_sparse(nzs);
        }
      } else if ((_sparseRatio * (_sparseLen) >> 1) > _len) {
        // Already sparse but no longer sparse enough.
        cancel_sparse();
      } else {
        // Still sparse: grow the row-index array alongside the offsets.
        _id = MemoryManager.arrayCopyOf(_id, _sparseLen << 1);
      }

      _is = MemoryManager.arrayCopyOf(_is, sparseLen() << 1);
      /* initialize the memory extension with -1s */
      for (int i = sparseLen(); i < _is.length; i++) {
        _is[i] = -1;
      }
    } else {
      _is = MemoryManager.malloc4(4);
      /* initialize everything with -1s */
      Arrays.fill(_is, -1);
      if (sparse()) {
        alloc_indices(4);
      }
    }
    // The message names the array and counter actually being checked (_is / sparseLen()).
    assert sparseLen() == 0 || _is.length > sparseLen()
        : "_is.length = " + _is.length + ", sparseLen = " + sparseLen();
  }
Exemple #2
0
 /**
  * Marks row {@code i} as missing.
  *
  * <p>Returns {@code true} straight away when {@code isNA_impl(i)} already holds. Otherwise, when
  * {@code sparseLen() != _len}, the row is looked up in {@code _id[0 .. sparseLen())}: a hit
  * replaces {@code i} with the storage slot, a miss calls {@code cancel_sparse()} and keeps
  * {@code i} as is. The actual write is delegated to {@code setNA_impl2}.
  *
  * @param i row to mark as NA
  * @return {@code true} if the row was already NA, else the result of {@code setNA_impl2}
  */
 @Override
 boolean setNA_impl(int i) {
   if (isNA_impl(i)) {
     return true;
   }
   final boolean storedSparse = sparseLen() != _len;
   if (storedSparse) {
     final int slot = Arrays.binarySearch(_id, 0, sparseLen(), i);
     if (slot < 0) {
       cancel_sparse(); // todo - do not necessarily cancel sparse here
     } else {
       i = slot;
     }
   }
   return setNA_impl2(i);
 }
Exemple #3
0
 // Set & At on NewChunks are unusual: they are meant to be used only after
 // some other chunk has been inflated into this one.  By then the NewChunk
 // is at full size, appending is over, and the xs exponent array is expected
 // to hold nothing but zeros.  Every access must be in range and refers to
 // the inflated values of the original Chunk.
 //
 // Stores the long 'l' at row 'i'.  When double storage (_ds) is active the
 // call is forwarded to the double overload.  When sparseLen() != _len the row
 // is looked up in _id; a hit remaps 'i' to the storage slot, a miss calls
 // cancel_sparse().  Always returns true on the long path.
 @Override
 boolean set_impl(int i, long l) {
   if (_ds != null) {
     return set_impl(i, (double) l);
   }
   if (sparseLen() != _len) { // stored sparsely
     final int slot = Arrays.binarySearch(_id, 0, sparseLen(), i);
     if (slot < 0) {
       cancel_sparse(); // for now don't bother setting the sparse value
     } else {
       i = slot;
     }
   }
   _ls[i] = l;
   _xs[i] = 0;
   _naCnt = -1; // reset the NA counter to -1
   return true;
 }
Exemple #4
0
 // Fast-path append of one (mantissa, exponent) pair.
 //
 // append2slow() is called first when _ls is missing or already full.  While
 // _len2 != _len (the sparse case, per the original comment):
 //   - a non-zero exponent 'x' calls cancel_sparse() and the pair is then stored;
 //   - a zero mantissa with zero exponent only bumps _len2 and nothing is stored;
 //   - any other value is stored with its _xs slot holding the current _len2
 //     (the row) instead of an exponent.
 void append2(long l, int x) {
   final boolean outOfRoom = _ls == null || _len >= _ls.length;
   if (outOfRoom) {
     append2slow();
   }
   if (_len2 != _len) { // Sparse?
     if (x != 0) {
       cancel_sparse(); // NA?  Give it up!
     } else if (l != 0) {
       x = _len2; // NZ: set the row over the xs field
     } else {
       _len2++; // Just One More Zero
       return;
     }
   }
   _ls[_len] = l;
   _xs[_len++] = x;
   _len2++;
 }
Exemple #5
0
 /**
  * Stores string {@code str} at row {@code i}.
  *
  * <p>If no string-offset array exists yet and {@code _len > 0}, one with {@code _len} slots is
  * allocated and filled with -1. When {@code sparseLen() != _len} the row is looked up in
  * {@code _id[0 .. sparseLen())}: a hit remaps {@code i} to the storage slot, a miss calls
  * {@code cancel_sparse()}. The slot then records the current {@code _sslen}, and the string is
  * handed to {@code append_ss}.
  *
  * @param i row to write
  * @param str value to store
  * @return always {@code true}
  */
 @Override
 boolean set_impl(int i, String str) {
   final boolean needsOffsets = _is == null && _len > 0;
   if (needsOffsets) {
     assert sparseLen() == 0;
     alloc_str_indices(_len);
     Arrays.fill(_is, -1);
   }
   if (sparseLen() != _len) { // stored sparsely
     final int slot = Arrays.binarySearch(_id, 0, sparseLen(), i);
     if (slot < 0) {
       cancel_sparse(); // for now don't bother setting the sparse value
     } else {
       i = slot;
     }
   }
   _is[i] = _sslen;
   append_ss(str);
   return true;
 }
Exemple #6
0
 /**
  * Stores double {@code d} at row {@code i}.
  *
  * <p>When double storage ({@code _ds}) is absent, {@code switch_to_doubles()} is called first.
  * When {@code sparseLen() != _len} the row is looked up in {@code _id[0 .. sparseLen())}: a hit
  * remaps {@code i} to the storage slot, a miss calls {@code cancel_sparse()}. The NA counter is
  * reset to -1 after the write.
  *
  * @param i row to write
  * @param d value to store
  * @return always {@code true}
  */
 @Override
 public boolean set_impl(int i, double d) {
   if (_ds == null) {
     assert sparseLen() == 0 || _ls != null;
     switch_to_doubles();
   }
   final boolean storedSparse = sparseLen() != _len;
   if (storedSparse) {
     final int slot = Arrays.binarySearch(_id, 0, sparseLen(), i);
     if (slot < 0) {
       cancel_sparse(); // for now don't bother setting the sparse value
     } else {
       i = slot;
     }
   }
   assert i < sparseLen();
   _ds[i] = d;
   _naCnt = -1;
   return true;
 }
Exemple #7
0
  /**
   * Appends every element of {@code nc} to this NewChunk; {@code nc} gives up its arrays.
   *
   * <ul>
   *   <li>Nothing happens when {@code nc} is empty ({@code nc._len == 0}).
   *   <li>When this chunk is empty, the arrays of {@code nc} ({@code _ls, _xs, _id, _ds, _is,
   *       _ss}) are taken over as-is together with its two lengths.
   *   <li>Otherwise, if exactly one of the two is sparse, {@code cancel_sparse()} is called on
   *       both; the mantissa/exponent (and, when present, row-index) arrays are grown and the
   *       contents of {@code nc} copied behind the existing elements. Copied row indices are
   *       shifted by this chunk's current {@code _len}.
   * </ul>
   *
   * <p>The merge path throws {@code H2O.fail()} when this chunk holds doubles ({@code _ds}).
   *
   * @param nc chunk to absorb
   */
  public void add(NewChunk nc) {
    assert _cidx >= 0;
    assert sparseLen() <= _len;
    assert nc.sparseLen() <= nc._len : "_len = " + nc.sparseLen() + ", _len2 = " + nc._len;
    if (nc._len == 0) {
      return;
    }

    if (_len == 0) {
      // Nothing of our own yet: take over nc's storage wholesale.
      _ls = nc._ls;
      _xs = nc._xs;
      _id = nc._id;
      _ds = nc._ds;
      _is = nc._is;
      _ss = nc._ss;
      nc._ls = null;
      nc._xs = null;
      nc._id = null;
      nc._ds = null;
      nc._is = null;
      nc._ss = null;
      set_sparseLen(nc.sparseLen());
      set_len(nc._len);
      return;
    }

    if (nc.sparse() != sparse()) { // for now, just make it dense
      cancel_sparse();
      nc.cancel_sparse();
    }
    if (_ds != null) {
      throw H2O.fail();
    }

    // Stored-element counts of both sides, read after any densification above.
    final int ownCount = sparseLen();
    final int otherCount = nc.sparseLen();
    final int mergedCount = ownCount + otherCount;

    // Double the exponent array until the merged data fits, then size the mantissas to match.
    while (mergedCount >= _xs.length) {
      _xs = MemoryManager.arrayCopyOf(_xs, _xs.length << 1);
    }
    _ls = MemoryManager.arrayCopyOf(_ls, _xs.length);
    System.arraycopy(nc._ls, 0, _ls, ownCount, otherCount);
    System.arraycopy(nc._xs, 0, _xs, ownCount, otherCount);

    if (_id == null) {
      assert nc._id == null;
    } else {
      assert nc._id != null;
      _id = MemoryManager.arrayCopyOf(_id, _xs.length);
      System.arraycopy(nc._id, 0, _id, ownCount, otherCount);
      // Rebase the copied row indices so they follow our existing rows.
      for (int k = ownCount; k < mergedCount; ++k) {
        _id[k] += _len;
      }
    }

    set_sparseLen(mergedCount);
    set_len(_len + nc._len);

    // Empty out nc.
    nc._ls = null;
    nc._xs = null;
    nc._id = null;
    nc.set_sparseLen(nc.set_len(0));
    assert sparseLen() <= _len;
  }
Exemple #8
0
 // Builds the byte buffer for a fixed-width integer encoding.
 //
 //   bias  - subtracted from every rescaled value before it is written
 //   scale - exponent the values are normalized to (each element's own
 //           exponent minus 'scale' gives the power of ten applied)
 //   off   - number of leading bytes left untouched in front of the data
 //   log   - log2 of the element width: 0, 1, 2 or 3 for 1, 2, 4 or 8 bytes;
 //           any other value reaches H2O.fail()
 //
 // cancel_sparse() is called first when _len2 != _len.  NA rows are written
 // as the _NA constant of the chunk class matching the width.
 private byte[] bufX(long bias, int scale, int off, int log) {
   if (_len2 != _len) {
     cancel_sparse();
   }
   final byte[] buf = new byte[(_len2 << log) + off];
   for (int row = 0; row < _len; row++) {
     // Byte position of this row's element: header bytes plus row * width.
     final int pos = (row << log) + off;

     if (isNA(row)) {
       switch (log) {
         case 0:
           buf[pos] = (byte) (C1Chunk._NA);
           break;
         case 1:
           UDP.set2(buf, pos, (short) C2Chunk._NA);
           break;
         case 2:
           UDP.set4(buf, pos, (int) C4Chunk._NA);
           break;
         case 3:
           UDP.set8(buf, pos, C8Chunk._NA);
           break;
         default:
           H2O.fail();
       }
       continue;
     }

     // An exponent of Integer.MIN_VALUE + 1 is treated as 0 here.
     final int rawExp = _xs[row];
     final int exp = (rawExp == Integer.MIN_VALUE + 1 ? 0 : rawExp) - scale;
     long scaled;
     if (exp >= 0) {
       scaled = _ls[row] * DParseTask.pow10i(exp);
     } else {
       scaled = _ls[row] / DParseTask.pow10i(-exp);
     }
     scaled -= bias;

     switch (log) {
       case 0:
         buf[pos] = (byte) scaled;
         break;
       case 1:
         UDP.set2(buf, pos, (short) scaled);
         break;
       case 2:
         UDP.set4(buf, pos, (int) scaled);
         break;
       case 3:
         UDP.set8(buf, pos, scaled);
         break;
       default:
         H2O.fail();
     }
   }
   return buf;
 }
Exemple #9
0
  /**
   * Slow-path append for mantissa/exponent data: makes room in {@code _ls}/{@code _xs} (and
   * {@code _id} when present) before another value is stored.
   *
   * <ul>
   *   <li>With no storage yet, 4 slots are allocated for each array.
   *   <li>With dense storage ({@code _id == null}), entries with a non-zero mantissa or exponent
   *       are counted over the whole {@code _ls} array; if {@code (count + 1) * _sparseRatio <
   *       _len}, {@code set_sparse} is called and the method returns without growing anything.
   *   <li>With sparse storage, {@code cancel_sparse()} is called when the sparseness test fails;
   *       otherwise {@code _id} is re-sized to {@code sparseLen() << 1}.
   *   <li>In the remaining cases {@code _ls} and {@code _xs} are re-sized to
   *       {@code sparseLen() << 1}.
   * </ul>
   *
   * @throws ArrayIndexOutOfBoundsException if {@code sparseLen()} exceeds
   *     {@code FileVec.DFLT_CHUNK_SIZE}
   */
  private void append2slow() {
    if (sparseLen() > FileVec.DFLT_CHUNK_SIZE) {
      throw new ArrayIndexOutOfBoundsException(sparseLen());
    }

    assert _ds == null;
    final boolean hasStorage = _ls != null && _ls.length > 0;
    if (!hasStorage) {
      // First allocation.
      alloc_mantissa(4);
      alloc_exponent(4);
      if (_id != null) {
        alloc_indices(4);
      }
    } else {
      if (_id != null) {
        // verify we're still sufficiently sparse
        if ((_sparseRatio * (sparseLen()) >> 1) > _len) {
          cancel_sparse();
        } else {
          _id = MemoryManager.arrayCopyOf(_id, sparseLen() << 1);
        }
      } else { // check for sparseness
        int nonZeros = 0;
        for (int k = 0; k < _ls.length; ++k) {
          if (!(_ls[k] == 0 && _xs[k] == 0)) {
            ++nonZeros;
          }
        }
        if ((nonZeros + 1) * _sparseRatio < _len) {
          set_sparse(nonZeros);
          assert sparseLen() == 0 || sparseLen() <= _ls.length
              : "_len = "
                  + sparseLen()
                  + ", _ls.length = "
                  + _ls.length
                  + ", nzs = "
                  + nonZeros
                  + ", len2 = "
                  + _len;
          assert _id.length == _ls.length;
          assert sparseLen() <= _len;
          return;
        }
      }
      _ls = MemoryManager.arrayCopyOf(_ls, sparseLen() << 1);
      _xs = MemoryManager.arrayCopyOf(_xs, sparseLen() << 1);
    }
    assert sparseLen() == 0 || sparseLen() < _ls.length
        : "_len = " + sparseLen() + ", _ls.length = " + _ls.length;
    assert _id == null || _id.length == _ls.length;
    assert sparseLen() <= _len;
  }
Exemple #10
0
  /**
   * Picks and builds a compressed {@code Chunk} for the data accumulated in this NewChunk.
   *
   * <p>Outline of the decision sequence, in the order the code performs it:
   *
   * <ol>
   *   <li>{@code type()} decides the column mode; all-NA and string columns return immediately.
   *   <li>For categorical or numeric columns, entries of the other kind are turned into NAs.
   *   <li>The sparse/dense representation is chosen from {@code _naCnt + _nzCnt} versus
   *       {@code _len}.
   *   <li>UUID data (both {@code _ds} and {@code _ls} set) and all-NA data return early.
   *   <li>Double data is returned as constant / double chunks, or converted to longs when every
   *       value is integral.
   *   <li>For mantissa/exponent data, min, max and the smallest exponent are gathered, then the
   *       narrowest fitting encoding is chosen: constant, boolean/bit-vector, sparse, scaled
   *       1/2/4-byte, plain 1/2/4/8-byte, or doubles as a fallback.
   * </ol>
   *
   * <p>The method mutates this NewChunk along the way (for example {@code _xs}, {@code _ls},
   * {@code _ds}, {@code _naCnt}, and the sparse layout).
   *
   * @return the compressed chunk
   */
  private Chunk compress2() {
    // Check for basic mode info: all missing or all strings or mixed stuff
    byte mode = type();
    // NOTE(review): this all-NA chunk is sized with sparseLen(), while the all-NA case further
    // down uses _len — confirm the two are equal whenever this branch is reached.
    if (mode == Vec.T_BAD) // ALL NAs, nothing to do
    return new C0DChunk(Double.NaN, sparseLen());
    if (mode == Vec.T_STR) return new CStrChunk(_sslen, _ss, sparseLen(), _len, _is, _isAllASCII);
    boolean rerun = false;
    if (mode == Vec.T_CAT) {
      // Categorical column: categorical entries get exponent 0; every other non-NA entry is
      // turned into an NA and counted.
      for (int i = 0; i < sparseLen(); i++)
        if (isCategorical2(i)) _xs[i] = 0;
        else if (!isNA2(i)) {
          setNA_impl2(i);
          ++_naCnt;
        }
      // Smack any mismatched string/numbers
    } else if (mode == Vec.T_NUM) {
      // Numeric column: categorical entries are turned into NAs; the counters are recomputed
      // below via the rerun flag.
      for (int i = 0; i < sparseLen(); i++)
        if (isCategorical2(i)) {
          setNA_impl2(i);
          rerun = true;
        }
    }
    if (rerun) {
      _naCnt = -1;
      type();
    } // Re-run rollups after dropping all numbers/categoricals

    boolean sparse = false;
    // sparse? treat as sparse iff we have at least MIN_SPARSE_RATIOx more zeros than nonzeros
    if (_sparseRatio * (_naCnt + _nzCnt) < _len) {
      set_sparse(_naCnt + _nzCnt);
      sparse = true;
    } else if (sparseLen() != _len) cancel_sparse();

    // If the data is UUIDs there's not much compression going on
    if (_ds != null && _ls != null) return chunkUUID();
    // cut out the easy all NaNs case
    if (_naCnt == _len) return new C0DChunk(Double.NaN, _len);
    // If the data was set8 as doubles, we do a quick check to see if it's
    // plain longs.  If not, we give up and use doubles.
    if (_ds != null) {
      int i; // check if we can flip to ints
      // Stops at the first non-NaN value that does not survive a round trip through long.
      for (i = 0; i < sparseLen(); ++i)
        if (!Double.isNaN(_ds[i]) && (double) (long) _ds[i] != _ds[i]) break;
      boolean isInteger = i == sparseLen();
      boolean isConstant = !sparse || sparseLen() == 0;
      double constVal = 0;
      if (!sparse) { // check the values, sparse with some nonzeros can not be constant - has 0s and
        // (at least 1) nonzero
        constVal = _ds[0];
        for (int j = 1; j < _len; ++j)
          if (_ds[j] != constVal) {
            isConstant = false;
            break;
          }
      }
      if (isConstant)
        return isInteger ? new C0LChunk((long) constVal, _len) : new C0DChunk(constVal, _len);
      if (!isInteger) return sparse ? new CXDChunk(_len, sparseLen(), 8, bufD(8)) : chunkD();
      // Else flip to longs
      _ls = new long[_ds.length];
      _xs = new int[_ds.length];
      double[] ds = _ds;
      _ds = null;
      final int naCnt = _naCnt;
      for (i = 0; i < sparseLen(); i++) // Inject all doubles into longs
      if (Double.isNaN(ds[i])) setNA_impl2(i);
        else _ls[i] = (long) ds[i];
      // setNA_impl2 will set _naCnt to -1!
      // we already know what the naCnt is (it did not change!) so set it back to correct value
      _naCnt = naCnt;
    }

    // IF (_len > _sparseLen) THEN Sparse
    // Check for compressed *during appends*.  Here we know:
    // - No specials; _xs[]==0.
    // - No floats; _ds==null
    // - NZ length in _sparseLen, actual length in _len.
    // - Huge ratio between _len and _sparseLen, and we do NOT want to inflate to
    //   the larger size; we need to keep it all small all the time.
    // - Rows in _xs

    // Data in some fixed-point format, not doubles
    // See if we can sanely normalize all the data to the same fixed-point.
    int xmin = Integer.MAX_VALUE; // min exponent found
    boolean floatOverflow = false;
    double min = Double.POSITIVE_INFINITY;
    double max = Double.NEGATIVE_INFINITY;
    int p10iLength = PrettyPrint.powers10i.length;
    // Mantissa (llo/lhi) and exponent (xlo/xhi) of the elements that produced min and max.
    long llo = Long.MAX_VALUE, lhi = Long.MIN_VALUE;
    int xlo = Integer.MAX_VALUE, xhi = Integer.MIN_VALUE;

    for (int i = 0; i < sparseLen(); i++) {
      if (isNA2(i)) continue;
      long l = _ls[i];
      int x = _xs[i];
      assert x != Integer.MIN_VALUE : "l = " + l + ", x = " + x;
      if (x == Integer.MIN_VALUE + 1) x = 0; // Replace categorical flag with no scaling
      assert l != 0 || x == 0
          : "l == 0 while x = "
              + x
              + " ls = "
              + Arrays.toString(_ls); // Exponent of zero is always zero
      long t; // Remove extra scaling
      // Strip trailing decimal zeros from the mantissa, moving them into the exponent.
      while (l != 0 && (t = l / 10) * 10 == l) {
        l = t;
        x++;
      }
      // Compute per-chunk min/max
      double d = l * PrettyPrint.pow10(x);
      if (d < min) {
        min = d;
        llo = l;
        xlo = x;
      }
      if (d > max) {
        max = d;
        lhi = l;
        xhi = x;
      }
      // NOTE(review): plain assignment, so after the loop this reflects only the last non-NA
      // element — confirm whether accumulating across elements (|=) was intended.
      floatOverflow = l < Integer.MIN_VALUE + 1 || l > Integer.MAX_VALUE;
      xmin = Math.min(xmin, x);
    }
    if (sparse) { // sparse?  then compare vs implied 0s
      if (min > 0) {
        min = 0;
        llo = 0;
        xlo = 0;
      }
      if (max < 0) {
        max = 0;
        lhi = 0;
        xhi = 0;
      }
      xmin = Math.min(xmin, 0);
    }
    // Constant column?
    if (_naCnt == 0 && (min == max)) {
      if (llo == lhi && xlo == 0 && xhi == 0) return new C0LChunk(llo, _len);
      else if ((long) min == min) return new C0LChunk((long) min, _len);
      else return new C0DChunk(min, _len);
    }

    // Compute min & max, as scaled integers in the xmin scale.
    // Check for overflow along the way
    boolean overflow = ((xhi - xmin) >= p10iLength) || ((xlo - xmin) >= p10iLength);
    long lemax = 0, lemin = 0;
    if (!overflow) { // Can at least get the power-of-10 without overflow
      long pow10 = PrettyPrint.pow10i(xhi - xmin);
      lemax = lhi * pow10;
      // Hacker's Delight, Section 2-13, checking overflow.
      // Note that the power-10 is always positive, so the test devolves this:
      if ((lemax / pow10) != lhi) overflow = true;
      // Note that xlo might be > xmin; e.g. { 101e-49 , 1e-48}.
      long pow10lo = PrettyPrint.pow10i(xlo - xmin);
      lemin = llo * pow10lo;
      if ((lemin / pow10lo) != llo) overflow = true;
    }

    // Boolean column?
    if (max == 1 && min == 0 && xmin == 0 && !overflow) {
      if (sparse) { // Very sparse?
        return _naCnt == 0
            ? new CX0Chunk(_len, sparseLen(), bufS(0)) // No NAs, can store as sparse bitvector
            : new CXIChunk(_len, sparseLen(), 1, bufS(1)); // have NAs, store as sparse 1byte values
      }

      // Two bits per value when there are categoricals or NAs, otherwise one.
      int bpv = _catCnt + _naCnt > 0 ? 2 : 1; // Bit-vector
      byte[] cbuf = bufB(bpv);
      return new CBSChunk(cbuf, cbuf[0], cbuf[1]);
    }

    // Treated as floating point when any exponent is negative or the range exceeds long.
    final boolean fpoint = xmin < 0 || min < Long.MIN_VALUE || max > Long.MAX_VALUE;

    if (sparse) {
      if (fpoint) return new CXDChunk(_len, sparseLen(), 8, bufD(8));
      // Smallest of 2, 4 or 8 bytes per stored value that covers [min, max].
      int sz = 8;
      if (Short.MIN_VALUE <= min && max <= Short.MAX_VALUE) sz = 2;
      else if (Integer.MIN_VALUE <= min && max <= Integer.MAX_VALUE) sz = 4;
      return new CXIChunk(_len, sparseLen(), sz, bufS(sz));
    }
    // Exponent scaling: replacing numbers like 1.3 with 13e-1.  '13' fits in a
    // byte and we scale the column by 0.1.  A set of numbers like
    // {1.2,23,0.34} then is normalized to always be represented with 2 digits
    // to the right: {1.20,23.00,0.34} and we scale by 100: {120,2300,34}.
    // This set fits in a 2-byte short.

    // We use exponent-scaling for bytes & shorts only; it's uncommon (and not
    // worth it) for larger numbers.  We need to get the exponents to be
    // uniform, so we scale up the largest lmax by the largest scale we need
    // and if that fits in a byte/short - then it's worth compressing.  Other
    // wise we just flip to a float or double representation.
    if (overflow || (fpoint && floatOverflow) || -35 > xmin || xmin > 35) return chunkD();
    final long leRange = leRange(lemin, lemax);
    if (fpoint) {
      // Scaled encodings: stored value is the rescaled mantissa minus a bias, with
      // 10^xmin passed along as the scale.
      if ((int) lemin == lemin && (int) lemax == lemax) {
        if (leRange < 255) // Fits in scaled biased byte?
        return new C1SChunk(bufX(lemin, xmin, C1SChunk._OFF, 0), lemin, PrettyPrint.pow10(xmin));
        if (leRange < 65535) { // we use signed 2B short, add -32k to the bias!
          long bias = 32767 + lemin;
          return new C2SChunk(bufX(bias, xmin, C2SChunk._OFF, 1), bias, PrettyPrint.pow10(xmin));
        }
      }
      if (leRange < 4294967295l) {
        long bias = 2147483647l + lemin;
        return new C4SChunk(bufX(bias, xmin, C4SChunk._OFF, 2), bias, PrettyPrint.pow10(xmin));
      }
      return chunkD();
    } // else an integer column

    // Compress column into a byte
    if (xmin == 0 && 0 <= lemin && lemax <= 255 && ((_naCnt + _catCnt) == 0))
      return new C1NChunk(bufX(0, 0, C1NChunk._OFF, 0));
    if (lemin < Integer.MIN_VALUE) return new C8Chunk(bufX(0, 0, 0, 3));
    if (leRange < 255) { // Span fits in a byte?
      if (0 <= min && max < 255) // Span fits in an unbiased byte?
      return new C1Chunk(bufX(0, 0, C1Chunk._OFF, 0));
      return new C1SChunk(bufX(lemin, xmin, C1SChunk._OFF, 0), lemin, PrettyPrint.pow10i(xmin));
    }

    // Compress column into a short
    if (leRange < 65535) { // Span fits in a biased short?
      if (xmin == 0
          && Short.MIN_VALUE < lemin
          && lemax <= Short.MAX_VALUE) // Span fits in an unbiased short?
      return new C2Chunk(bufX(0, 0, C2Chunk._OFF, 1));
      long bias = (lemin - (Short.MIN_VALUE + 1));
      return new C2SChunk(bufX(bias, xmin, C2SChunk._OFF, 1), bias, PrettyPrint.pow10i(xmin));
    }
    // Compress column into ints
    if (Integer.MIN_VALUE < min && max <= Integer.MAX_VALUE) return new C4Chunk(bufX(0, 0, 0, 2));
    return new C8Chunk(bufX(0, 0, 0, 3));
  }