// Slow-path append string private void append2slowstr() { // In case of all NAs and then a string, convert NAs to string NAs if (_xs != null) { _xs = null; _ls = null; alloc_str_indices(sparseLen()); Arrays.fill(_is, -1); } if (_is != null && _is.length > 0) { // Check for sparseness if (_id == null) { int nzs = 0; // assume one non-null for the element currently being stored for (int i : _is) if (i != -1) ++nzs; if ((nzs + 1) * _sparseRatio < _len) set_sparse(nzs); } else { if ((_sparseRatio * (_sparseLen) >> 1) > _len) cancel_sparse(); else _id = MemoryManager.arrayCopyOf(_id, _sparseLen << 1); } _is = MemoryManager.arrayCopyOf(_is, sparseLen() << 1); /* initialize the memory extension with -1s */ for (int i = sparseLen(); i < _is.length; i++) _is[i] = -1; } else { _is = MemoryManager.malloc4(4); /* initialize everything with -1s */ for (int i = 0; i < _is.length; i++) _is[i] = -1; if (sparse()) alloc_indices(4); } assert sparseLen() == 0 || _is.length > sparseLen() : "_ls.length = " + _is.length + ", _len = " + sparseLen(); }
@Override boolean setNA_impl(int i) { if (isNA_impl(i)) return true; if (sparseLen() != _len) { int idx = Arrays.binarySearch(_id, 0, sparseLen(), i); if (idx >= 0) i = idx; else cancel_sparse(); // todo - do not necessarily cancel sparse here } return setNA_impl2(i); }
// Set & At on NewChunks are weird: only used after inflating some other // chunk. At this point the NewChunk is full size, no more appends allowed, // and the xs exponent array should be only full of zeros. Accesses must be // in-range and refer to the inflated values of the original Chunk. @Override boolean set_impl(int i, long l) { if (_ds != null) return set_impl(i, (double) l); if (sparseLen() != _len) { // sparse? int idx = Arrays.binarySearch(_id, 0, sparseLen(), i); if (idx >= 0) i = idx; else cancel_sparse(); // for now don't bother setting the sparse value } _ls[i] = l; _xs[i] = 0; _naCnt = -1; return true; }
// Fast-path append long data void append2(long l, int x) { if (_ls == null || _len >= _ls.length) append2slow(); if (_len2 != _len) { // Sparse? if (x != 0) cancel_sparse(); // NA? Give it up! else if (l == 0) { _len2++; return; } // Just One More Zero else x = _len2; // NZ: set the row over the xs field } _ls[_len] = l; _xs[_len++] = x; _len2++; }
@Override boolean set_impl(int i, String str) { if (_is == null && _len > 0) { assert sparseLen() == 0; alloc_str_indices(_len); Arrays.fill(_is, -1); } if (sparseLen() != _len) { // sparse? int idx = Arrays.binarySearch(_id, 0, sparseLen(), i); if (idx >= 0) i = idx; else cancel_sparse(); // for now don't bother setting the sparse value } _is[i] = _sslen; append_ss(str); return true; }
@Override public boolean set_impl(int i, double d) { if (_ds == null) { assert sparseLen() == 0 || _ls != null; switch_to_doubles(); } if (sparseLen() != _len) { // sparse? int idx = Arrays.binarySearch(_id, 0, sparseLen(), i); if (idx >= 0) i = idx; else cancel_sparse(); // for now don't bother setting the sparse value } assert i < sparseLen(); _ds[i] = d; _naCnt = -1; return true; }
// Append all of 'nc' onto the current NewChunk. Kill nc. public void add(NewChunk nc) { assert _cidx >= 0; assert sparseLen() <= _len; assert nc.sparseLen() <= nc._len : "_len = " + nc.sparseLen() + ", _len2 = " + nc._len; if (nc._len == 0) return; if (_len == 0) { _ls = nc._ls; nc._ls = null; _xs = nc._xs; nc._xs = null; _id = nc._id; nc._id = null; _ds = nc._ds; nc._ds = null; _is = nc._is; nc._is = null; _ss = nc._ss; nc._ss = null; set_sparseLen(nc.sparseLen()); set_len(nc._len); return; } if (nc.sparse() != sparse()) { // for now, just make it dense cancel_sparse(); nc.cancel_sparse(); } if (_ds != null) throw H2O.fail(); while (sparseLen() + nc.sparseLen() >= _xs.length) _xs = MemoryManager.arrayCopyOf(_xs, _xs.length << 1); _ls = MemoryManager.arrayCopyOf(_ls, _xs.length); System.arraycopy(nc._ls, 0, _ls, sparseLen(), nc.sparseLen()); System.arraycopy(nc._xs, 0, _xs, sparseLen(), nc.sparseLen()); if (_id != null) { assert nc._id != null; _id = MemoryManager.arrayCopyOf(_id, _xs.length); System.arraycopy(nc._id, 0, _id, sparseLen(), nc.sparseLen()); for (int i = sparseLen(); i < sparseLen() + nc.sparseLen(); ++i) _id[i] += _len; } else assert nc._id == null; set_sparseLen(sparseLen() + nc.sparseLen()); set_len(_len + nc._len); nc._ls = null; nc._xs = null; nc._id = null; nc.set_sparseLen(nc.set_len(0)); assert sparseLen() <= _len; }
// Compute a compressed integer buffer private byte[] bufX(long bias, int scale, int off, int log) { if (_len2 != _len) cancel_sparse(); byte[] bs = new byte[(_len2 << log) + off]; for (int i = 0; i < _len; i++) { if (isNA(i)) { switch (log) { case 0: bs[i + off] = (byte) (C1Chunk._NA); break; case 1: UDP.set2(bs, (i << 1) + off, (short) C2Chunk._NA); break; case 2: UDP.set4(bs, (i << 2) + off, (int) C4Chunk._NA); break; case 3: UDP.set8(bs, (i << 3) + off, C8Chunk._NA); break; default: H2O.fail(); } } else { int x = (_xs[i] == Integer.MIN_VALUE + 1 ? 0 : _xs[i]) - scale; long le = x >= 0 ? _ls[i] * DParseTask.pow10i(x) : _ls[i] / DParseTask.pow10i(-x); le -= bias; switch (log) { case 0: bs[i + off] = (byte) le; break; case 1: UDP.set2(bs, (i << 1) + off, (short) le); break; case 2: UDP.set4(bs, (i << 2) + off, (int) le); break; case 3: UDP.set8(bs, (i << 3) + off, le); break; default: H2O.fail(); } } } return bs; }
// Slow-path append data private void append2slow() { if (sparseLen() > FileVec.DFLT_CHUNK_SIZE) throw new ArrayIndexOutOfBoundsException(sparseLen()); assert _ds == null; if (_ls != null && _ls.length > 0) { if (_id == null) { // check for sparseness int nzs = 0; for (int i = 0; i < _ls.length; ++i) if (_ls[i] != 0 || _xs[i] != 0) ++nzs; if ((nzs + 1) * _sparseRatio < _len) { set_sparse(nzs); assert sparseLen() == 0 || sparseLen() <= _ls.length : "_len = " + sparseLen() + ", _ls.length = " + _ls.length + ", nzs = " + nzs + ", len2 = " + _len; assert _id.length == _ls.length; assert sparseLen() <= _len; return; } } else { // verify we're still sufficiently sparse if ((_sparseRatio * (sparseLen()) >> 1) > _len) cancel_sparse(); else _id = MemoryManager.arrayCopyOf(_id, sparseLen() << 1); } _ls = MemoryManager.arrayCopyOf(_ls, sparseLen() << 1); _xs = MemoryManager.arrayCopyOf(_xs, sparseLen() << 1); } else { alloc_mantissa(4); alloc_exponent(4); if (_id != null) alloc_indices(4); } assert sparseLen() == 0 || sparseLen() < _ls.length : "_len = " + sparseLen() + ", _ls.length = " + _ls.length; assert _id == null || _id.length == _ls.length; assert sparseLen() <= _len; }
// Pick and build the most compact concrete Chunk representation for the
// accumulated data: constant, boolean bit-vector, sparse, byte/short/int/long
// widths with optional bias+scale, or plain doubles as the fallback.
private Chunk compress2() {
  // Check for basic mode info: all missing or all strings or mixed stuff
  byte mode = type();
  if (mode == Vec.T_BAD) // ALL NAs, nothing to do
    return new C0DChunk(Double.NaN, sparseLen());
  if (mode == Vec.T_STR)
    return new CStrChunk(_sslen, _ss, sparseLen(), _len, _is, _isAllASCII);
  boolean rerun = false;
  if (mode == Vec.T_CAT) {
    // Categorical column: keep categorical rows, NA-out stray numbers.
    for (int i = 0; i < sparseLen(); i++)
      if (isCategorical2(i)) _xs[i] = 0;
      else if (!isNA2(i)) { setNA_impl2(i); ++_naCnt; }
    // Smack any mismatched string/numbers
  } else if (mode == Vec.T_NUM) {
    // Numeric column: NA-out stray categoricals.
    for (int i = 0; i < sparseLen(); i++)
      if (isCategorical2(i)) { setNA_impl2(i); rerun = true; }
  }
  if (rerun) { _naCnt = -1; type(); } // Re-run rollups after dropping all numbers/categoricals
  boolean sparse = false;
  // sparse? treat as sparse iff we have at least MIN_SPARSE_RATIOx more zeros than nonzeros
  if (_sparseRatio * (_naCnt + _nzCnt) < _len) {
    set_sparse(_naCnt + _nzCnt);
    sparse = true;
  } else if (sparseLen() != _len)
    cancel_sparse();
  // If the data is UUIDs there's not much compression going on
  if (_ds != null && _ls != null) return chunkUUID();
  // cut out the easy all NaNs case
  if (_naCnt == _len) return new C0DChunk(Double.NaN, _len);
  // If the data was set8 as doubles, we do a quick check to see if it's
  // plain longs. If not, we give up and use doubles.
  if (_ds != null) {
    int i; // check if we can flip to ints
    for (i = 0; i < sparseLen(); ++i)
      if (!Double.isNaN(_ds[i]) && (double) (long) _ds[i] != _ds[i]) break;
    boolean isInteger = i == sparseLen();
    boolean isConstant = !sparse || sparseLen() == 0;
    double constVal = 0;
    if (!sparse) {
      // check the values, sparse with some nonzeros can not be constant - has 0s and
      // (at least 1) nonzero
      constVal = _ds[0];
      for (int j = 1; j < _len; ++j)
        if (_ds[j] != constVal) { isConstant = false; break; }
    }
    if (isConstant)
      return isInteger ? new C0LChunk((long) constVal, _len) : new C0DChunk(constVal, _len);
    if (!isInteger)
      return sparse ? new CXDChunk(_len, sparseLen(), 8, bufD(8)) : chunkD();
    // Else flip to longs
    _ls = new long[_ds.length];
    _xs = new int[_ds.length];
    double[] ds = _ds;
    _ds = null;
    final int naCnt = _naCnt;
    for (i = 0; i < sparseLen(); i++) // Inject all doubles into longs
      if (Double.isNaN(ds[i])) setNA_impl2(i);
      else _ls[i] = (long) ds[i];
    // setNA_impl2 will set _naCnt to -1!
    // we already know what the naCnt is (it did not change!) so set it back to correct value
    _naCnt = naCnt;
  }
  // IF (_len > _sparseLen) THEN Sparse
  // Check for compressed *during appends*. Here we know:
  // - No specials; _xs[]==0.
  // - No floats; _ds==null
  // - NZ length in _sparseLen, actual length in _len.
  // - Huge ratio between _len and _sparseLen, and we do NOT want to inflate to
  //   the larger size; we need to keep it all small all the time.
  // - Rows in _xs
  // Data in some fixed-point format, not doubles
  // See if we can sanely normalize all the data to the same fixed-point.
  int xmin = Integer.MAX_VALUE; // min exponent found
  boolean floatOverflow = false;
  double min = Double.POSITIVE_INFINITY;
  double max = Double.NEGATIVE_INFINITY;
  int p10iLength = PrettyPrint.powers10i.length;
  long llo = Long.MAX_VALUE, lhi = Long.MIN_VALUE;
  int xlo = Integer.MAX_VALUE, xhi = Integer.MIN_VALUE;
  for (int i = 0; i < sparseLen(); i++) {
    if (isNA2(i)) continue;
    long l = _ls[i];
    int x = _xs[i];
    assert x != Integer.MIN_VALUE : "l = " + l + ", x = " + x;
    if (x == Integer.MIN_VALUE + 1) x = 0; // Replace categorical flag with no scaling
    assert l != 0 || x == 0 : "l == 0 while x = " + x + " ls = " + Arrays.toString(_ls); // Exponent of zero is always zero
    long t; // Remove extra scaling
    while (l != 0 && (t = l / 10) * 10 == l) { l = t; x++; }
    // Compute per-chunk min/max
    double d = l * PrettyPrint.pow10(x);
    if (d < min) { min = d; llo = l; xlo = x; }
    if (d > max) { max = d; lhi = l; xhi = x; }
    floatOverflow = l < Integer.MIN_VALUE + 1 || l > Integer.MAX_VALUE;
    xmin = Math.min(xmin, x);
  }
  if (sparse) { // sparse? then compare vs implied 0s
    if (min > 0) { min = 0; llo = 0; xlo = 0; }
    if (max < 0) { max = 0; lhi = 0; xhi = 0; }
    xmin = Math.min(xmin, 0);
  }
  // Constant column?
  if (_naCnt == 0 && (min == max)) {
    if (llo == lhi && xlo == 0 && xhi == 0) return new C0LChunk(llo, _len);
    else if ((long) min == min) return new C0LChunk((long) min, _len);
    else return new C0DChunk(min, _len);
  }
  // Compute min & max, as scaled integers in the xmin scale.
  // Check for overflow along the way
  boolean overflow = ((xhi - xmin) >= p10iLength) || ((xlo - xmin) >= p10iLength);
  long lemax = 0, lemin = 0;
  if (!overflow) { // Can at least get the power-of-10 without overflow
    long pow10 = PrettyPrint.pow10i(xhi - xmin);
    lemax = lhi * pow10;
    // Hacker's Delight, Section 2-13, checking overflow.
    // Note that the power-10 is always positive, so the test devolves this:
    if ((lemax / pow10) != lhi) overflow = true;
    // Note that xlo might be > xmin; e.g. { 101e-49 , 1e-48}.
    long pow10lo = PrettyPrint.pow10i(xlo - xmin);
    lemin = llo * pow10lo;
    if ((lemin / pow10lo) != llo) overflow = true;
  }
  // Boolean column?
  if (max == 1 && min == 0 && xmin == 0 && !overflow) {
    if (sparse) { // Very sparse?
      return _naCnt == 0
          ? new CX0Chunk(_len, sparseLen(), bufS(0))      // No NAs, can store as sparse bitvector
          : new CXIChunk(_len, sparseLen(), 1, bufS(1));  // have NAs, store as sparse 1byte values
    }
    int bpv = _catCnt + _naCnt > 0 ? 2 : 1; // Bit-vector
    byte[] cbuf = bufB(bpv);
    return new CBSChunk(cbuf, cbuf[0], cbuf[1]);
  }
  final boolean fpoint = xmin < 0 || min < Long.MIN_VALUE || max > Long.MAX_VALUE;
  if (sparse) {
    if (fpoint) return new CXDChunk(_len, sparseLen(), 8, bufD(8));
    // Choose the narrowest integer element width that spans [min, max].
    int sz = 8;
    if (Short.MIN_VALUE <= min && max <= Short.MAX_VALUE) sz = 2;
    else if (Integer.MIN_VALUE <= min && max <= Integer.MAX_VALUE) sz = 4;
    return new CXIChunk(_len, sparseLen(), sz, bufS(sz));
  }
  // Exponent scaling: replacing numbers like 1.3 with 13e-1. '13' fits in a
  // byte and we scale the column by 0.1. A set of numbers like
  // {1.2,23,0.34} then is normalized to always be represented with 2 digits
  // to the right: {1.20,23.00,0.34} and we scale by 100: {120,2300,34}.
  // This set fits in a 2-byte short.
  // We use exponent-scaling for bytes & shorts only; it's uncommon (and not
  // worth it) for larger numbers. We need to get the exponents to be
  // uniform, so we scale up the largest lmax by the largest scale we need
  // and if that fits in a byte/short - then it's worth compressing. Other
  // wise we just flip to a float or double representation.
  if (overflow || (fpoint && floatOverflow) || -35 > xmin || xmin > 35) return chunkD();
  final long leRange = leRange(lemin, lemax);
  if (fpoint) {
    if ((int) lemin == lemin && (int) lemax == lemax) {
      if (leRange < 255) // Fits in scaled biased byte?
        return new C1SChunk(bufX(lemin, xmin, C1SChunk._OFF, 0), lemin, PrettyPrint.pow10(xmin));
      if (leRange < 65535) { // we use signed 2B short, add -32k to the bias!
        long bias = 32767 + lemin;
        return new C2SChunk(bufX(bias, xmin, C2SChunk._OFF, 1), bias, PrettyPrint.pow10(xmin));
      }
    }
    if (leRange < 4294967295l) {
      long bias = 2147483647l + lemin;
      return new C4SChunk(bufX(bias, xmin, C4SChunk._OFF, 2), bias, PrettyPrint.pow10(xmin));
    }
    return chunkD(); // Fixed-point does not fit: fall back to doubles.
  } // else an integer column
  // Compress column into a byte
  if (xmin == 0 && 0 <= lemin && lemax <= 255 && ((_naCnt + _catCnt) == 0))
    return new C1NChunk(bufX(0, 0, C1NChunk._OFF, 0));
  if (lemin < Integer.MIN_VALUE) return new C8Chunk(bufX(0, 0, 0, 3));
  if (leRange < 255) { // Span fits in a byte?
    if (0 <= min && max < 255) // Span fits in an unbiased byte?
      return new C1Chunk(bufX(0, 0, C1Chunk._OFF, 0));
    return new C1SChunk(bufX(lemin, xmin, C1SChunk._OFF, 0), lemin, PrettyPrint.pow10i(xmin));
  }
  // Compress column into a short
  if (leRange < 65535) { // Span fits in a biased short?
    if (xmin == 0 && Short.MIN_VALUE < lemin && lemax <= Short.MAX_VALUE) // Span fits in an unbiased short?
      return new C2Chunk(bufX(0, 0, C2Chunk._OFF, 1));
    long bias = (lemin - (Short.MIN_VALUE + 1));
    return new C2SChunk(bufX(bias, xmin, C2SChunk._OFF, 1), bias, PrettyPrint.pow10i(xmin));
  }
  // Compress column into ints
  if (Integer.MIN_VALUE < min && max <= Integer.MAX_VALUE)
    return new C4Chunk(bufX(0, 0, 0, 2));
  return new C8Chunk(bufX(0, 0, 0, 3));
}