Example 1
 /** Creates the second-pass dparse task from a first-pass one. */
 public static DParseTask createPassTwo(DParseTask phaseOneTask) {
   DParseTask t = new DParseTask(phaseOneTask);
   // create new data for phase two
   t._colDomains = new String[t._ncolumns][];
   t._bases = new int[t._ncolumns];
   t._phase = Pass.TWO;
   // calculate the column domains
   for (int i = 0; i < t._colTypes.length; ++i) {
     if (t._colTypes[i] == ECOL && t._enums[i] != null && !t._enums[i].isKilled())
       t._colDomains[i] = t._enums[i].computeColumnDomain();
     else t._enums[i] = null;
   }
   t.calculateColumnEncodings();
   return t;
 }
Example 2
 // Compute a compressed integer buffer
 private byte[] bufX(long bias, int scale, int off, int log) {
   if (_len2 != _len) cancel_sparse();
   byte[] bs = new byte[(_len2 << log) + off];
   for (int i = 0; i < _len; i++) {
     if (isNA(i)) {
       switch (log) {
         case 0:
           bs[i + off] = (byte) (C1Chunk._NA);
           break;
         case 1:
           UDP.set2(bs, (i << 1) + off, (short) C2Chunk._NA);
           break;
         case 2:
           UDP.set4(bs, (i << 2) + off, (int) C4Chunk._NA);
           break;
         case 3:
           UDP.set8(bs, (i << 3) + off, C8Chunk._NA);
           break;
         default:
           H2O.fail();
       }
     } else {
       int x = (_xs[i] == Integer.MIN_VALUE + 1 ? 0 : _xs[i]) - scale;
       long le = x >= 0 ? _ls[i] * DParseTask.pow10i(x) : _ls[i] / DParseTask.pow10i(-x);
       le -= bias;
       switch (log) {
         case 0:
           bs[i + off] = (byte) le;
           break;
         case 1:
           UDP.set2(bs, (i << 1) + off, (short) le);
           break;
         case 2:
           UDP.set4(bs, (i << 2) + off, (int) le);
           break;
         case 3:
           UDP.set8(bs, (i << 3) + off, le);
           break;
         default:
           H2O.fail();
       }
     }
   }
   return bs;
 }
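
The non-NA branch above boils down to two steps: rescale the parsed mantissa to the chunk's common exponent, then subtract the chunk bias so the result fits the chosen width. Below is a minimal standalone sketch of that arithmetic; pow10i stands in for DParseTask.pow10i, and the sample values are illustrative rather than taken from the original code.

// Sketch of the fixed-point encode used by bufX(); assumes a local pow10i
// helper in place of DParseTask.pow10i.
public class BufXEncodeSketch {
  static long pow10i(int x) {
    long p = 1;
    for (int i = 0; i < x; i++) p *= 10;
    return p;
  }

  // Encode one (mantissa, exponent) pair at the chunk's common scale and bias,
  // mirroring the non-NA branch of bufX().
  static long encode(long mantissa, int exponent, long bias, int scale) {
    int x = exponent - scale;                  // shift to the common scale
    long le = x >= 0 ? mantissa * pow10i(x)    // scale up ...
                     : mantissa / pow10i(-x);  // ... or down
    return le - bias;                          // remove the chunk bias
  }

  public static void main(String[] args) {
    // 1.75 parsed as mantissa 175, exponent -2; chunk scale -2, bias 0:
    System.out.println(encode(175, -2, 0, -2)); // 175 -> fits in a byte
    // 23 parsed as mantissa 23, exponent 0, at the same scale:
    System.out.println(encode(23, 0, 0, -2));   // 2300 -> needs a short
  }
}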
Example 3
 // Compute a compressed double buffer
 private Chunk chunkD() {
   assert _len2 == _len;
   final byte[] bs = MemoryManager.malloc1(_len * 8);
   for (int i = 0; i < _len; ++i)
     UDP.set8d(
         bs,
         8 * i,
         _ds != null
             ? _ds[i]
             : (isNA(i) || isEnum(i)) ? Double.NaN : _ls[i] * DParseTask.pow10(_xs[i]));
   return new C8DChunk(bs);
 }
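
For comparison, the 8-byte packing that chunkD() delegates to UDP.set8d can be reproduced with just the JDK. The sketch below is an assumed equivalent using java.nio.ByteBuffer; the byte order and the sample data are illustrative.

import java.nio.ByteBuffer;

public class DoubleBufferSketch {
  public static void main(String[] args) {
    double[] ds = {1.2, Double.NaN, 0.34};   // NaN marks a missing value
    byte[] bs = new byte[ds.length * 8];
    ByteBuffer bb = ByteBuffer.wrap(bs);     // big-endian by default (assumption vs. UDP)
    for (int i = 0; i < ds.length; i++) bb.putDouble(8 * i, ds[i]);
    System.out.println(bb.getDouble(8 * 2)); // 0.34 read back from the raw buffer
  }
}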
Example 4
  // Inflate this compressed, scaled two-byte chunk back into a NewChunk of
  // mantissas (_ls) and decimal exponents (_xs).
  @Override
  NewChunk inflate_impl(NewChunk nc) {
   double dx = Math.log10(_scale);
   assert DParseTask.fitsIntoInt(dx);
   Arrays.fill(nc._xs = MemoryManager.malloc4(_len), (int) dx);
   nc._ls = MemoryManager.malloc8(_len);
   for (int i = 0; i < _len; i++) {
     int res = UDP.get2(_mem, (i << 1) + OFF);
     if (res == C2Chunk._NA) nc.setNA_impl2(i);
     else nc._ls[i] = res + _bias;
   }
   return nc;
 }
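
Decoding goes the other way: the stored short plus the chunk bias recovers the mantissa, and every row shares the exponent log10(_scale). A tiny check of that round trip, with illustrative bias and scale values:

public class InflateSketch {
  public static void main(String[] args) {
    int dx = -2;                    // (int) Math.log10(_scale) for an assumed _scale of 0.01
    long bias = 0;                  // assumed chunk _bias
    int stored = 175;               // short read back with UDP.get2()
    long mantissa = stored + bias;  // nc._ls[i] = res + _bias
    System.out.println(mantissa * Math.pow(10, dx)); // 1.75
  }
}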
Example 5
  // Read row i back as a long; only supported for dense (non-sparse) chunks.
  @Override
  public long at8_impl(int i) {
   if (_len2 != _len) throw H2O.unimpl();
   if (_ls == null) return (long) _ds[i];
   return _ls[i] * DParseTask.pow10i(_xs[i]);
 }
Example 6
  Chunk compress() {
    // Check for basic mode info: all missing or all strings or mixed stuff
    byte mode = type();
     if (mode == AppendableVec.NA) // ALL NAs, nothing to do
       return new C0DChunk(Double.NaN, _len);
    for (int i = 0; i < _len; i++)
      if (mode == AppendableVec.ENUM && !isEnum(i) || mode == AppendableVec.NUMBER && isEnum(i))
        setNA_impl(i);
    _naCnt = -1;
    type(); // Re-run rollups after dropping all numbers/enums

    // If the data was set8 as doubles, we do a quick check to see if it's
    // plain longs.  If not, we give up and use doubles.
    if (_ds != null) {
      int i = 0;
       for (; i < _len; i++) // Attempt to inject all doubles into longs
         if (!Double.isNaN(_ds[i]) && (double) (long) _ds[i] != _ds[i]) break;
      if (i < _len) return chunkD();
      _ls = new long[_ds.length]; // Else flip to longs
      _xs = new int[_ds.length];
       for (i = 0; i < _len; i++) // Inject all doubles into longs
         if (Double.isNaN(_ds[i])) _xs[i] = Integer.MIN_VALUE;
         else _ls[i] = (long) _ds[i];
      _ds = null;
    }

    // IF (_len2 > _len) THEN Sparse
    // Check for compressed *during appends*.  Here we know:
    // - No specials; _xs[]==0.
    // - No floats; _ds==null
    // - NZ length in _len, actual length in _len2.
    // - Huge ratio between _len2 and _len, and we do NOT want to inflate to
    //   the larger size; we need to keep it all small all the time.
    // - Rows in _xs

    // Data in some fixed-point format, not doubles
    // See if we can sanely normalize all the data to the same fixed-point.
    int xmin = Integer.MAX_VALUE; // min exponent found
    long lemin = 0, lemax = lemin; // min/max at xmin fixed-point
    boolean overflow = false;
    boolean floatOverflow = false;
    boolean first = true;
    double min = _len2 == _len ? Double.MAX_VALUE : 0;
    double max = _len2 == _len ? -Double.MAX_VALUE : 0;

    for (int i = 0; i < _len; i++) {
      if (isNA(i)) continue;
      long l = _ls[i];
      int x = _xs[i];
      if (x == Integer.MIN_VALUE + 1 || _len2 != _len) x = 0; // Replace enum flag with no scaling
      assert l != 0 || x == 0; // Exponent of zero is always zero
      // Compute per-chunk min/max
      double d = l * DParseTask.pow10(x);
      if (d < min) min = d;
      if (d > max) max = d;
      long t; // Remove extra scaling
      while (l != 0 && (t = l / 10) * 10 == l) {
        l = t;
        x++;
      }
      floatOverflow = Math.abs(l) > MAX_FLOAT_MANTISSA;
      if (first) {
        first = false;
        xmin = x;
        lemin = lemax = l;
        continue;
      }
      // Remove any trailing zeros / powers-of-10
      if (overflow || (overflow = (Math.abs(xmin - x)) >= 10)) continue;
      // Track largest/smallest values at xmin scale.  Note overflow.
      if (x < xmin) {
        lemin *= DParseTask.pow10i(xmin - x);
        lemax *= DParseTask.pow10i(xmin - x);
        xmin = x; // Smaller xmin
      }
      // *this* value, as a long scaled at the smallest scale
      long le = l * DParseTask.pow10i(x - xmin);
      if (le < lemin) lemin = le;
      if (le > lemax) lemax = le;
    }
    if (_len2 != _len) { // sparse? compare xmin/lemin/lemax with 0
      lemin = Math.min(0, lemin);
      lemax = Math.max(0, lemax);
    }

    // Constant column?
    if (_naCnt == 0 && min == max) {
      return ((long) min == min) ? new C0LChunk((long) min, _len2) : new C0DChunk(min, _len2);
    }

    // Boolean column?
    if (max == 1 && min == 0 && xmin == 0) {
       // Very sparse? (and not too big?)
       if (_nzCnt * 32 < _len2 && _naCnt == 0 && _len2 < 65535 && xmin == 0) {
         if (_len2 == _len) return new CX0Chunk(_ls, _len2, _nzCnt); // Dense constructor
         else return new CX0Chunk(_xs, _len2, _len); // Sparse constructor
       }
      int bpv = _strCnt + _naCnt > 0 ? 2 : 1; // Bit-vector
      byte[] cbuf = bufB(bpv);
      return new CBSChunk(cbuf, cbuf[0], cbuf[1]);
    }

     // Result column must hold floats?
     final boolean fpoint = xmin < 0 || min < Long.MIN_VALUE || max > Long.MAX_VALUE;

     // Highly sparse but not a bitvector or constant?
     if (!fpoint
         && (_nzCnt + _naCnt) * 8 < _len2
         && _len2 < 65535 // (and not too big?)
         && xmin == 0
         && lemin > Short.MIN_VALUE
         && lemax <= Short.MAX_VALUE) { // Only handling unbiased shorts here
       if (_len2 == _len) return new CX2Chunk(_ls, _xs, _len2, _nzCnt, _naCnt); // Sparse byte chunk
       else return new CX2Chunk(_ls, _xs, _len2, _len);
     }

    // Exponent scaling: replacing numbers like 1.3 with 13e-1.  '13' fits in a
    // byte and we scale the column by 0.1.  A set of numbers like
    // {1.2,23,0.34} then is normalized to always be represented with 2 digits
    // to the right: {1.20,23.00,0.34} and we scale by 100: {120,2300,34}.
    // This set fits in a 2-byte short.

    // We use exponent-scaling for bytes & shorts only; it's uncommon (and not
    // worth it) for larger numbers.  We need to get the exponents to be
    // uniform, so we scale up the largest lmax by the largest scale we need
    // and if that fits in a byte/short - then it's worth compressing.  Other
    // wise we just flip to a float or double representation.
    if (overflow || fpoint && floatOverflow || -35 > xmin || xmin > 35) return chunkD();
    if (fpoint) {
       if (lemax - lemin < 255) // Fits in scaled biased byte?
         return new C1SChunk(bufX(lemin, xmin, C1SChunk.OFF, 0), (int) lemin, DParseTask.pow10(xmin));
      if (lemax - lemin < 65535) { // we use signed 2B short, add -32k to the bias!
        long bias = 32767 + lemin;
        return new C2SChunk(bufX(bias, xmin, C2SChunk.OFF, 1), (int) bias, DParseTask.pow10(xmin));
      }
      if (lemax - lemin < Integer.MAX_VALUE)
        return new C4SChunk(
            bufX(lemin, xmin, C4SChunk.OFF, 2), (int) lemin, DParseTask.pow10(xmin));
      return chunkD();
    } // else an integer column
    // Compress column into a byte
    if (xmin == 0 && 0 <= lemin && lemax <= 255 && ((_naCnt + _strCnt) == 0))
      return new C1NChunk(bufX(0, 0, C1NChunk.OFF, 0));
    if (lemax - lemin < 255) { // Span fits in a byte?
       if (0 <= min && max < 255) // Span fits in an unbiased byte?
         return new C1Chunk(bufX(0, 0, C1Chunk.OFF, 0));
      return new C1SChunk(bufX(lemin, xmin, C1SChunk.OFF, 0), (int) lemin, DParseTask.pow10i(xmin));
    }

    // Compress column into a short
    if (lemax - lemin < 65535) { // Span fits in a biased short?
       if (xmin == 0
           && Short.MIN_VALUE < lemin
           && lemax <= Short.MAX_VALUE) // Span fits in an unbiased short?
         return new C2Chunk(bufX(0, 0, C2Chunk.OFF, 1));
      int bias = (int) (lemin - (Short.MIN_VALUE + 1));
      return new C2SChunk(bufX(bias, xmin, C2SChunk.OFF, 1), bias, DParseTask.pow10i(xmin));
    }
    // Compress column into ints
    if (Integer.MIN_VALUE < min && max <= Integer.MAX_VALUE) return new C4Chunk(bufX(0, 0, 0, 2));
    return new C8Chunk(bufX(0, 0, 0, 3));
  }
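
The exponent-scaling comment above is easiest to follow with its own example set {1.2, 23, 0.34}. The sketch below repeats just the normalization bookkeeping from the loop (smallest exponent, min/max at that scale); pow10i stands in for DParseTask.pow10i, and the data is illustrative.

public class CompressScaleSketch {
  static long pow10i(int x) {
    long p = 1;
    for (int i = 0; i < x; i++) p *= 10;
    return p;
  }

  public static void main(String[] args) {
    long[] ls = {12, 23, 34};   // mantissas for 1.2, 23, 0.34
    int[] xs = {-1, 0, -2};     // their decimal exponents
    int xmin = Integer.MAX_VALUE;
    long lemin = 0, lemax = 0;
    boolean first = true;
    for (int i = 0; i < ls.length; i++) {
      long l = ls[i];
      int x = xs[i];
      if (first) { first = false; xmin = x; lemin = lemax = l; continue; }
      if (x < xmin) {           // rescale the running min/max to the smaller exponent
        lemin *= pow10i(xmin - x);
        lemax *= pow10i(xmin - x);
        xmin = x;
      }
      long le = l * pow10i(x - xmin); // this value at the smallest exponent
      if (le < lemin) lemin = le;
      if (le > lemax) lemax = le;
    }
    // {1.2, 23, 0.34} become {120, 2300, 34} at scale 10^-2; the span 2266 fits
    // a short, so compress() would pick a 2-byte scaled chunk (C2SChunk).
    System.out.println(xmin + " " + lemin + " " + lemax); // -2 34 2300
  }
}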
Example 7
   // Dispatch one HSSF record: buffer the current cell's column and value,
   // then forward completed cells and rows to the parser callback.
   @Override
   public void processRecord(Record record) {
    int curCol = -1;
    double curNum = Double.NaN;
    ValueString curStr = null;

    switch (record.getSid()) {
      case BoundSheetRecord.sid:
      case BOFRecord.sid:
        // we just run together multiple sheets
        break;
      case SSTRecord.sid:
        _sstRecord = (SSTRecord) record;
        break;
      case BlankRecord.sid:
        BlankRecord brec = (BlankRecord) record;

        curCol = brec.getColumn();
        curStr = _str.setTo("");
        break;
      case BoolErrRecord.sid:
        BoolErrRecord berec = (BoolErrRecord) record;

        curCol = berec.getColumn();
        curStr = _str.setTo("");
        break;

      case FormulaRecord.sid:
        FormulaRecord frec = (FormulaRecord) record;

        curCol = frec.getColumn();
        curNum = frec.getValue();

        if (Double.isNaN(curNum)) {
          // Formula result is a string
          // This is stored in the next record
          _outputNextStringRecord = true;
          _nextCol = frec.getColumn();
        }
        break;
      case StringRecord.sid:
        if (_outputNextStringRecord) {
          // String for formula
          StringRecord srec = (StringRecord) record;
          curStr = _str.setTo(srec.getString());
          curCol = _nextCol;
          _outputNextStringRecord = false;
        }
        break;
      case LabelRecord.sid:
        LabelRecord lrec = (LabelRecord) record;

        curCol = lrec.getColumn();
        curStr = _str.setTo(lrec.getValue());
        break;
      case LabelSSTRecord.sid:
        LabelSSTRecord lsrec = (LabelSSTRecord) record;
        if (_sstRecord == null) {
          System.err.println("[ExcelParser] Missing SST record");
        } else {
          curCol = lsrec.getColumn();
          curStr = _str.setTo(_sstRecord.getString(lsrec.getSSTIndex()).toString());
        }
        break;
      case NoteRecord.sid:
        System.err.println("[ExcelParser] Warning cell notes are unsupported");
        break;
      case NumberRecord.sid:
        NumberRecord numrec = (NumberRecord) record;
        curCol = numrec.getColumn();
        curNum = numrec.getValue();
        break;
      case RKRecord.sid:
        System.err.println("[ExcelParser] Warning RK records are unsupported");
        break;
      default:
        break;
    }

    // Handle missing column
    if (record instanceof MissingCellDummyRecord) {
      MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
      curCol = mc.getColumn();
      curNum = Double.NaN;
    }

    // Handle end of row
    if (record instanceof LastCellOfRowDummyRecord) {
      if (_firstRow) {
        _firstRow = false;
        String[] arr = new String[_columnNames.size()];
        arr = _columnNames.toArray(arr);
        _callback.setColumnNames(arr);
      }
      _callback.newLine();
    }

    if (curCol == -1) return;

    if (_firstRow) {
      _columnNames.add(curStr == null ? "" : curStr.toString());
    } else {
      if (curStr == null)
        if (Double.isNaN(curNum)) _callback.addInvalidCol(curCol);
        else _callback.addCol(curCol, curNum);
      else _callback.addStrCol(curCol, curStr);
    }
  }
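
processRecord() relies on MissingCellDummyRecord and LastCellOfRowDummyRecord, which Apache POI only synthesizes when the listener is wrapped in a MissingRecordAwareHSSFListener. Below is a minimal sketch of that wiring using POI's HSSF event API; the driver class and the print-only listener are illustrative stand-ins, not the original setup.

import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

public class XlsEventDriverSketch {
  public static void main(String[] args) throws Exception {
    // Stand-in listener; a real parser would dispatch on record.getSid() as above.
    HSSFListener cellHandler = new HSSFListener() {
      @Override public void processRecord(Record record) {
        System.out.println(record.getSid());
      }
    };
    try (InputStream in = new FileInputStream(args[0]);
         POIFSFileSystem fs = new POIFSFileSystem(in)) {
      // The wrapper emits the missing-cell and end-of-row dummy records handled above.
      HSSFRequest request = new HSSFRequest();
      request.addListenerForAllRecords(new MissingRecordAwareHSSFListener(cellHandler));
      new HSSFEventFactory().processWorkbookEvents(request, fs);
    }
  }
}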