/** Creates the second pass dparse task from a first phase one. */ public static DParseTask createPassTwo(DParseTask phaseOneTask) { DParseTask t = new DParseTask(phaseOneTask); // create new data for phase two t._colDomains = new String[t._ncolumns][]; t._bases = new int[t._ncolumns]; t._phase = Pass.TWO; // calculate the column domains for (int i = 0; i < t._colTypes.length; ++i) { if (t._colTypes[i] == ECOL && t._enums[i] != null && !t._enums[i].isKilled()) t._colDomains[i] = t._enums[i].computeColumnDomain(); else t._enums[i] = null; } t.calculateColumnEncodings(); return t; }
// Compute a compressed integer buffer private byte[] bufX(long bias, int scale, int off, int log) { if (_len2 != _len) cancel_sparse(); byte[] bs = new byte[(_len2 << log) + off]; for (int i = 0; i < _len; i++) { if (isNA(i)) { switch (log) { case 0: bs[i + off] = (byte) (C1Chunk._NA); break; case 1: UDP.set2(bs, (i << 1) + off, (short) C2Chunk._NA); break; case 2: UDP.set4(bs, (i << 2) + off, (int) C4Chunk._NA); break; case 3: UDP.set8(bs, (i << 3) + off, C8Chunk._NA); break; default: H2O.fail(); } } else { int x = (_xs[i] == Integer.MIN_VALUE + 1 ? 0 : _xs[i]) - scale; long le = x >= 0 ? _ls[i] * DParseTask.pow10i(x) : _ls[i] / DParseTask.pow10i(-x); le -= bias; switch (log) { case 0: bs[i + off] = (byte) le; break; case 1: UDP.set2(bs, (i << 1) + off, (short) le); break; case 2: UDP.set4(bs, (i << 2) + off, (int) le); break; case 3: UDP.set8(bs, (i << 3) + off, le); break; default: H2O.fail(); } } } return bs; }
// Compute a compressed double buffer private Chunk chunkD() { assert _len2 == _len; final byte[] bs = MemoryManager.malloc1(_len * 8); for (int i = 0; i < _len; ++i) UDP.set8d( bs, 8 * i, _ds != null ? _ds[i] : (isNA(i) || isEnum(i)) ? Double.NaN : _ls[i] * DParseTask.pow10(_xs[i])); return new C8DChunk(bs); }
/**
 * Inflates this compressed 2-byte scaled chunk back into a NewChunk:
 * every row becomes (raw + _bias) at the decimal exponent implied by _scale.
 */
@Override NewChunk inflate_impl(NewChunk nc) {
  // Recover the decimal exponent from the chunk's scale factor.
  final double exp = Math.log10(_scale);
  assert DParseTask.fitsIntoInt(exp);
  nc._xs = MemoryManager.malloc4(_len);
  Arrays.fill(nc._xs, (int) exp);
  nc._ls = MemoryManager.malloc8(_len);
  for (int row = 0; row < _len; row++) {
    final int raw = UDP.get2(_mem, (row << 1) + OFF);
    if (raw == C2Chunk._NA) {
      nc.setNA_impl2(row);
    } else {
      nc._ls[row] = raw + _bias; // undo the bias applied at compression time
    }
  }
  return nc;
}
/** Returns row i as a long; sparse NewChunks are not row-addressable yet. */
@Override public long at8_impl(int i) {
  if (_len2 != _len) throw H2O.unimpl(); // sparse form unsupported
  // Doubles truncate toward zero; longs are rescaled by their exponent.
  return _ls == null ? (long) _ds[i] : _ls[i] * DParseTask.pow10i(_xs[i]);
}
/**
 * Chooses and builds the smallest Chunk encoding that losslessly represents
 * this NewChunk's data: constant, boolean/bit-set, sparse, byte/short/int/long
 * (optionally biased and decimal-scaled), or raw doubles as a last resort.
 */
Chunk compress() {
  // Check for basic mode info: all missing or all strings or mixed stuff
  byte mode = type();
  if (mode == AppendableVec.NA) // ALL NAs, nothing to do
    return new C0DChunk(Double.NaN, _len);
  // Minority rows of the "wrong" kind (enum in a number column or vice versa)
  // are forced to NA rather than poisoning the chosen encoding.
  for (int i = 0; i < _len; i++)
    if (mode == AppendableVec.ENUM && !isEnum(i) || mode == AppendableVec.NUMBER && isEnum(i))
      setNA_impl(i);
  _naCnt = -1;
  type(); // Re-run rollups after dropping all numbers/enums
  // If the data was set8 as doubles, we do a quick check to see if it's
  // plain longs. If not, we give up and use doubles.
  if (_ds != null) {
    int i = 0;
    for (; i < _len; i++) // Attempt to inject all doubles into longs
      if (!Double.isNaN(_ds[i]) && (double) (long) _ds[i] != _ds[i]) break;
    if (i < _len) return chunkD();
    _ls = new long[_ds.length]; // Else flip to longs
    _xs = new int[_ds.length];
    for (i = 0; i < _len; i++) // Inject all doubles into longs
      if (Double.isNaN(_ds[i])) _xs[i] = Integer.MIN_VALUE;
      else _ls[i] = (long) _ds[i];
    _ds = null;
  }
  // IF (_len2 > _len) THEN Sparse
  // Check for compressed *during appends*. Here we know:
  // - No specials; _xs[]==0.
  // - No floats; _ds==null
  // - NZ length in _len, actual length in _len2.
  // - Huge ratio between _len2 and _len, and we do NOT want to inflate to
  //   the larger size; we need to keep it all small all the time.
  // - Rows in _xs
  // Data in some fixed-point format, not doubles
  // See if we can sanely normalize all the data to the same fixed-point.
  int xmin = Integer.MAX_VALUE; // min exponent found
  long lemin = 0, lemax = lemin; // min/max at xmin fixed-point
  boolean overflow = false;
  boolean floatOverflow = false;
  boolean first = true;
  // In the sparse case the implicit zeros force min/max to include 0.
  double min = _len2 == _len ? Double.MAX_VALUE : 0;
  double max = _len2 == _len ? -Double.MAX_VALUE : 0;
  for (int i = 0; i < _len; i++) {
    if (isNA(i)) continue;
    long l = _ls[i];
    int x = _xs[i];
    if (x == Integer.MIN_VALUE + 1 || _len2 != _len) x = 0; // Replace enum flag with no scaling
    assert l != 0 || x == 0; // Exponent of zero is always zero
    // Compute per-chunk min/max
    double d = l * DParseTask.pow10(x);
    if (d < min) min = d;
    if (d > max) max = d;
    long t; // Remove extra scaling
    while (l != 0 && (t = l / 10) * 10 == l) { l = t; x++; }
    floatOverflow = Math.abs(l) > MAX_FLOAT_MANTISSA;
    if (first) { first = false; xmin = x; lemin = lemax = l; continue; }
    // Remove any trailing zeros / powers-of-10
    // Exponent spread >= 10 decades cannot share one fixed-point scale.
    if (overflow || (overflow = (Math.abs(xmin - x)) >= 10)) continue;
    // Track largest/smallest values at xmin scale. Note overflow.
    if (x < xmin) {
      lemin *= DParseTask.pow10i(xmin - x);
      lemax *= DParseTask.pow10i(xmin - x);
      xmin = x; // Smaller xmin
    }
    // *this* value, as a long scaled at the smallest scale
    long le = l * DParseTask.pow10i(x - xmin);
    if (le < lemin) lemin = le;
    if (le > lemax) lemax = le;
  }
  if (_len2 != _len) { // sparse? compare xmin/lemin/lemax with 0
    lemin = Math.min(0, lemin);
    lemax = Math.max(0, lemax);
  }
  // Constant column?
  if (_naCnt == 0 && min == max) {
    return ((long) min == min)
        ? new C0LChunk((long) min, _len2)
        : new C0DChunk(min, _len2);
  }
  // Boolean column?
  if (max == 1 && min == 0 && xmin == 0) {
    if (_nzCnt * 32 < _len2 && _naCnt == 0 && _len2 < 65535 && xmin == 0) // Very sparse? (and not too big?)
      if (_len2 == _len) return new CX0Chunk(_ls, _len2, _nzCnt); // Dense constructor
      else return new CX0Chunk(_xs, _len2, _len); // Sparse constructor
    int bpv = _strCnt + _naCnt > 0 ? 2 : 1; // Bit-vector
    byte[] cbuf = bufB(bpv);
    return new CBSChunk(cbuf, cbuf[0], cbuf[1]);
  }
  final boolean fpoint =
      xmin < 0 || min < Long.MIN_VALUE || max > Long.MAX_VALUE; // Result column must hold floats?
  // Highly sparse but not a bitvector or constant?
  if (!fpoint && (_nzCnt + _naCnt) * 8 < _len2 && _len2 < 65535 && xmin == 0 && // (and not too big?)
      lemin > Short.MIN_VALUE && lemax <= Short.MAX_VALUE) // Only handling unbiased shorts here
    if (_len2 == _len) return new CX2Chunk(_ls, _xs, _len2, _nzCnt, _naCnt); // Sparse byte chunk
    else return new CX2Chunk(_ls, _xs, _len2, _len);
  // Exponent scaling: replacing numbers like 1.3 with 13e-1. '13' fits in a
  // byte and we scale the column by 0.1. A set of numbers like
  // {1.2,23,0.34} then is normalized to always be represented with 2 digits
  // to the right: {1.20,23.00,0.34} and we scale by 100: {120,2300,34}.
  // This set fits in a 2-byte short.
  // We use exponent-scaling for bytes & shorts only; it's uncommon (and not
  // worth it) for larger numbers. We need to get the exponents to be
  // uniform, so we scale up the largest lmax by the largest scale we need
  // and if that fits in a byte/short - then it's worth compressing. Other
  // wise we just flip to a float or double representation.
  if (overflow || fpoint && floatOverflow || -35 > xmin || xmin > 35) return chunkD();
  if (fpoint) {
    if (lemax - lemin < 255) // Fits in scaled biased byte?
      return new C1SChunk(bufX(lemin, xmin, C1SChunk.OFF, 0), (int) lemin, DParseTask.pow10(xmin));
    if (lemax - lemin < 65535) { // we use signed 2B short, add -32k to the bias!
      long bias = 32767 + lemin;
      return new C2SChunk(bufX(bias, xmin, C2SChunk.OFF, 1), (int) bias, DParseTask.pow10(xmin));
    }
    if (lemax - lemin < Integer.MAX_VALUE)
      return new C4SChunk(bufX(lemin, xmin, C4SChunk.OFF, 2), (int) lemin, DParseTask.pow10(xmin));
    return chunkD();
  }
  // else an integer column
  // Compress column into a byte
  if (xmin == 0 && 0 <= lemin && lemax <= 255 && ((_naCnt + _strCnt) == 0))
    return new C1NChunk(bufX(0, 0, C1NChunk.OFF, 0));
  if (lemax - lemin < 255) { // Span fits in a byte?
    if (0 <= min && max < 255) // Span fits in an unbiased byte?
      return new C1Chunk(bufX(0, 0, C1Chunk.OFF, 0));
    return new C1SChunk(bufX(lemin, xmin, C1SChunk.OFF, 0), (int) lemin, DParseTask.pow10i(xmin));
  }
  // Compress column into a short
  if (lemax - lemin < 65535) { // Span fits in a biased short?
    if (xmin == 0 && Short.MIN_VALUE < lemin && lemax <= Short.MAX_VALUE) // Span fits in an unbiased short?
      return new C2Chunk(bufX(0, 0, C2Chunk.OFF, 1));
    int bias = (int) (lemin - (Short.MIN_VALUE + 1));
    return new C2SChunk(bufX(bias, xmin, C2SChunk.OFF, 1), bias, DParseTask.pow10i(xmin));
  }
  // Compress column into ints
  if (Integer.MIN_VALUE < min && max <= Integer.MAX_VALUE)
    return new C4Chunk(bufX(0, 0, 0, 2));
  return new C8Chunk(bufX(0, 0, 0, 3));
}
/**
 * Streaming HSSF (.xls) event handler: translates each low-level POI record
 * into a (column, value) pair and forwards it to _callback. The first row is
 * collected as column names; subsequent rows are emitted as data.
 */
@Override public void processRecord(Record record) {
  int curCol = -1;            // column index of the cell in this record; -1 = no cell seen
  double curNum = Double.NaN; // numeric cell value; NaN means "not a number cell"
  ValueString curStr = null;  // string cell value; null means "not a string cell"
  switch (record.getSid()) {
    case BoundSheetRecord.sid:
    case BOFRecord.sid:
      // we just run together multiple sheets
      break;
    case SSTRecord.sid:
      // Shared-string table; retained to resolve later LabelSSTRecord indices.
      _sstRecord = (SSTRecord) record;
      break;
    case BlankRecord.sid:
      // Blank cells become empty strings.
      BlankRecord brec = (BlankRecord) record;
      curCol = brec.getColumn();
      curStr = _str.setTo("");
      break;
    case BoolErrRecord.sid:
      // Boolean/error cells are not decoded; emitted as empty strings.
      BoolErrRecord berec = (BoolErrRecord) record;
      curCol = berec.getColumn();
      curStr = _str.setTo("");
      break;
    case FormulaRecord.sid:
      FormulaRecord frec = (FormulaRecord) record;
      curCol = frec.getColumn();
      curNum = frec.getValue();
      if (Double.isNaN(curNum)) {
        // Formula result is a string
        // This is stored in the next record
        _outputNextStringRecord = true;
        _nextCol = frec.getColumn();
      }
      break;
    case StringRecord.sid:
      if (_outputNextStringRecord) {
        // String for formula
        StringRecord srec = (StringRecord) record;
        curStr = _str.setTo(srec.getString());
        curCol = _nextCol;
        _outputNextStringRecord = false;
      }
      break;
    case LabelRecord.sid:
      LabelRecord lrec = (LabelRecord) record;
      curCol = lrec.getColumn();
      curStr = _str.setTo(lrec.getValue());
      break;
    case LabelSSTRecord.sid:
      // Indexed string cell; requires the SST record captured earlier.
      LabelSSTRecord lsrec = (LabelSSTRecord) record;
      if (_sstRecord == null) {
        System.err.println("[ExcelParser] Missing SST record");
      } else {
        curCol = lsrec.getColumn();
        curStr = _str.setTo(_sstRecord.getString(lsrec.getSSTIndex()).toString());
      }
      break;
    case NoteRecord.sid:
      System.err.println("[ExcelParser] Warning cell notes are unsupported");
      break;
    case NumberRecord.sid:
      NumberRecord numrec = (NumberRecord) record;
      curCol = numrec.getColumn();
      curNum = numrec.getValue();
      break;
    case RKRecord.sid:
      System.err.println("[ExcelParser] Warning RK records are unsupported");
      break;
    default:
      break;
  }
  // Handle missing column
  // (dummy records come from POI's missing-record-aware stream, not the switch above)
  if (record instanceof MissingCellDummyRecord) {
    MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
    curCol = mc.getColumn();
    curNum = Double.NaN;
  }
  // Handle end of row
  if (record instanceof LastCellOfRowDummyRecord) {
    if (_firstRow) {
      // First row just ended: flush the accumulated header names.
      _firstRow = false;
      String[] arr = new String[_columnNames.size()];
      arr = _columnNames.toArray(arr);
      _callback.setColumnNames(arr);
    }
    _callback.newLine();
  }
  if (curCol == -1) return; // record carried no cell value
  if (_firstRow) {
    // Header row: every cell contributes a column name (empty if non-string).
    _columnNames.add(curStr == null ? "" : curStr.toString());
  } else {
    // Data row: prefer the string value; otherwise number; otherwise invalid.
    if (curStr == null)
      if (Double.isNaN(curNum)) _callback.addInvalidCol(curCol);
      else _callback.addCol(curCol, curNum);
    else _callback.addStrCol(curCol, curStr);
  }
}