private static void addFolder2( FileSystem fs, Path p, ArrayList<String> keys, ArrayList<String> failed) { try { if (fs == null) return; Futures futures = new Futures(); for (FileStatus file : fs.listStatus(p)) { Path pfs = file.getPath(); if (file.isDir()) { addFolder2(fs, pfs, keys, failed); } else { long size = file.getLen(); Key res; if (pfs.getName().endsWith(Extensions.JSON)) { throw H2O.unimpl(); } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file? throw H2O.unimpl(); } else { Key k = null; keys.add((k = HdfsFileVec.make(file, futures)).toString()); Log.info("PersistHdfs: DKV.put(" + k + ")"); } } } } catch (Exception e) { Log.err(e); failed.add(p.toString()); } }
/**
 * Copies a selection of rows, for the columns listed in {@code _cols}, from the input chunks
 * into the output {@code NewChunk}s.
 *
 * <p>When {@code _rows} is {@code null} every row of the chunk is copied once. Otherwise
 * {@code _rows} is walked in order: each selector has 1 subtracted from it; a negative result is
 * treated as a row exclusion and a non-negative result as a row to include. Runs of consecutive
 * selectors are merged into one {@code [rlo, rhi)} range before copying.
 *
 * <p>Several orderings of exclusion selectors are not supported and raise {@code H2O.unimpl()}.
 *
 * @param chks input chunks; {@code chks[0]} supplies the chunk's start row and length
 * @param nchks output chunks, one per entry of {@code _cols}
 */
@Override
public void map(Chunk chks[], NewChunk nchks[]) {
  long rstart = chks[0]._start; // Global row number of this chunk's first row
  int rlen = chks[0]._len; // Total row count
  int rx = 0; // Which row to in/ex-clude
  int rlo = 0; // Lo/Hi for this block of rows
  int rhi = rlen;
  while (true) { // Still got rows to include?
    if (_rows != null) { // Got a row selector?
      if (rx >= _rows.length) break; // All done with row selections
      long r = _rows[rx++] - 1; // Next row selector
      if (r < 0) { // Row exclusion
        // NOTE(review): rx was already advanced above, so rx > 0 always holds here and
        // _rows[rx] is read without a bounds check; this looks like it can index past the end
        // when the selector just consumed was the last one -- confirm.
        if (rx > 0 && _rows[rx - 1] < _rows[rx]) throw H2O.unimpl();
        long er = Math.abs(r) - 2;
        if (er < rstart) continue; // Selector lies before this chunk's first row
        // scoop up all of the rows before the first exclusion
        if (rx == 1 && ((int) (er + 1 - rstart)) > 0 && _ex) {
          rlo = (int) rstart;
          rhi = (int) (er - rstart);
          _ex = false; // Only do the leading scoop once
          rx--; // Revisit this same selector on the next pass
        } else {
          rlo = (int) (er + 1 - rstart);
          // TODO: handle jumbled row indices ( e.g. -c(1,5,3) )
          while (rx < _rows.length && (_rows[rx] + 1 == _rows[rx - 1] && rlo < rlen)) {
            if (rx < _rows.length - 1 && _rows[rx] < _rows[rx + 1]) throw H2O.unimpl();
            rx++;
            rlo++; // Exclude consecutive rows
          }
          // Copy up to the next exclusion, or to the end of the chunk if none remain
          rhi = rx >= _rows.length ? rlen : (int) Math.abs(_rows[rx] - 1) - 2;
          if (rx < _rows.length - 1 && _rows[rx] < _rows[rx + 1]) throw H2O.unimpl();
        }
      } else { // Positive row list?
        if (r < rstart) continue; // Selector lies before this chunk's first row
        rlo = (int) (r - rstart);
        rhi = rlo + 1; // Stop at the next row
        while (rx < _rows.length && (_rows[rx] - 1 - rstart) == rhi && rhi < rlen) {
          rx++;
          rhi++; // Grab sequential rows
        }
      }
    }
    // Process this next set of rows
    // For all cols in the new set
    for (int i = 0; i < _cols.length; i++) {
      Chunk oc = chks[_cols[i]];
      NewChunk nc = nchks[i];
      if (oc._vec.isInt()) { // Slice on integer columns
        for (int j = rlo; j < rhi; j++)
          if (oc.isNA0(j)) nc.addNA();
          else nc.addNum(oc.at80(j), 0);
      } else { // Slice on double columns
        for (int j = rlo; j < rhi; j++) nc.addNum(oc.at0(j));
      }
    }
    rlo = rhi;
    if (_rows == null) break; // No selector: the single full-range pass is done
  }
}
// Set & At on NewChunks are weird: only used after inflating some other // chunk. At this point the NewChunk is full size, no more appends allowed, // and the xs exponent array should be only full of zeros. Accesses must be // in-range and refer to the inflated values of the original Chunk. @Override boolean set_impl(int i, long l) { if (_ds != null) throw H2O.unimpl(); if (_len2 != _len) throw H2O.unimpl(); _ls[i] = l; _xs[i] = 0; return true; }
@Override Val apply(Env env, Env.StackHelp stk, AST asts[]) { // Compute the variable args. Find the common row count Val vals[] = new Val[asts.length]; Vec vec = null; for (int i = 1; i < asts.length; i++) { vals[i] = stk.track(asts[i].exec(env)); if (vals[i].isFrame()) { Vec anyvec = vals[i].getFrame().anyVec(); if (anyvec == null) continue; // Ignore the empty frame if (vec == null) vec = anyvec; else if (vec.length() != anyvec.length()) throw new IllegalArgumentException( "cbind frames must have all the same rows, found " + vec.length() + " and " + anyvec.length() + " rows."); } } boolean clean = false; if (vec == null) { vec = Vec.makeZero(1); clean = true; } // Default to length 1 // Populate the new Frame Frame fr = new Frame(); for (int i = 1; i < asts.length; i++) { switch (vals[i].type()) { case Val.FRM: fr.add(fr.makeCompatible(vals[i].getFrame())); break; case Val.FUN: throw H2O.unimpl(); case Val.STR: throw H2O.unimpl(); case Val.NUM: // Auto-expand scalars to fill every row double d = vals[i].getNum(); fr.add(Double.toString(d), vec.makeCon(d)); break; default: throw H2O.unimpl(); } } if (clean) vec.remove(); return new ValFrame(fr); }
/**
 * Reads element {@code i} as a double.
 *
 * <p>Uses the {@code _ds} array when present; otherwise defers to {@code at8_impl(i)}. Raises
 * {@code H2O.unimpl()} when {@code _len2} and {@code _len} differ.
 */
@Override
public double atd_impl(int i) {
  if (_len2 != _len) throw H2O.unimpl();
  if (_ds != null) {
    assert _xs == null; // The exponent array is expected to be gone when doubles are in use
    return _ds[i];
  }
  return at8_impl(i);
}
// ------------------------------------------------------------------------ // Zipped file; no parallel decompression; decompress into local chunks, // parse local chunks; distribute chunks later. ParseWriter streamParseZip(final InputStream is, final StreamParseWriter dout, InputStream bvs) throws IOException { // All output into a fresh pile of NewChunks, one per column if (!_setup._parse_type._parallelParseSupported) throw H2O.unimpl(); StreamData din = new StreamData(is); int cidx = 0; StreamParseWriter nextChunk = dout; int zidx = bvs.read(null, 0, 0); // Back-channel read of chunk index assert zidx == 1; while (is.available() > 0) { int xidx = bvs.read(null, 0, 0); // Back-channel read of chunk index if (xidx > zidx) { // Advanced chunk index of underlying ByteVec stream? zidx = xidx; // Record advancing of chunk nextChunk.close(); // Match output chunks to input zipfile chunks if (dout != nextChunk) { dout.reduce(nextChunk); if (_jobKey != null && ((Job) DKV.getGet(_jobKey)).isCancelledOrCrashed()) break; } nextChunk = nextChunk.nextChunk(); } parseChunk(cidx++, din, nextChunk); } parseChunk(cidx, din, nextChunk); // Parse the remaining partial 32K buffer nextChunk.close(); if (dout != nextChunk) dout.reduce(nextChunk); return dout; }
public final double linkDeriv(double x) { // note: compute an inverse of what R does switch (_link) { case logit: // case multinomial: double div = (x * (1 - x)); if (div < 1e-6) return 1e6; // avoid numerical instability return 1.0 / div; case identity: return 1; case log: return 1.0 / x; case inverse: return -1.0 / (x * x); case tweedie: // double res = _tweedie_link_power == 0 // ?Math.max(2e-16,Math.exp(x)) // // (1/lambda) * eta^(1/lambda - 1) // :(1.0/_tweedie_link_power) * Math.pow(link(x), 1.0/_tweedie_link_power - // 1.0); return _tweedie_link_power == 0 ? 1.0 / Math.max(2e-16, x) : _tweedie_link_power * Math.pow(x, _tweedie_link_power - 1); default: throw H2O.unimpl(); } }
/**
 * Fills {@code normMul} and {@code normSub} for {@code n} columns of {@code _adaptedFrame},
 * starting at column {@code vecStart}, according to transform {@code t}.
 *
 * <p>The multiplier falls back to 1 whenever the column's sigma (or max-min range) would make
 * the reciprocal undefined. Transform types without a case below raise {@code H2O.unimpl()}.
 *
 * @param t transform to apply
 * @param normMul receives the per-column multiplier at indices {@code 0..n-1}
 * @param normSub receives the per-column subtrahend at indices {@code 0..n-1}
 * @param vecStart index of the first column in {@code _adaptedFrame}
 * @param n number of columns to process
 */
private void setTransform(
    TransformType t, double[] normMul, double[] normSub, int vecStart, int n) {
  for (int col = 0; col < n; ++col) {
    Vec column = _adaptedFrame.vec(vecStart + col);
    switch (t) {
      case STANDARDIZE:
        if (column.sigma() != 0) normMul[col] = 1.0 / column.sigma();
        else normMul[col] = 1.0;
        normSub[col] = column.mean();
        break;
      case NORMALIZE:
        if (column.max() - column.min() > 0) normMul[col] = 1.0 / (column.max() - column.min());
        else normMul[col] = 1.0;
        normSub[col] = column.mean();
        break;
      case DEMEAN:
        normMul[col] = 1;
        normSub[col] = column.mean();
        break;
      case DESCALE:
        if (column.sigma() != 0) normMul[col] = 1.0 / column.sigma();
        else normMul[col] = 1.0;
        normSub[col] = 0;
        break;
      default:
        throw H2O.unimpl();
    }
    assert !Double.isNaN(normMul[col]);
    assert !Double.isNaN(normSub[col]);
  }
}
private void setTransform( TransformType t, double[] normMul, double[] normSub, int vecStart, int n) { int idx = 0; // idx!=i when interactions are in play, otherwise, it's just 'i' for (int i = 0; i < n; ++i) { Vec v = _adaptedFrame.vec(vecStart + i); boolean isIWV = isInteractionVec(vecStart + i); switch (t) { case STANDARDIZE: normMul[idx] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0; if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1; normSub[idx] = v.mean(); break; case NORMALIZE: normMul[idx] = (v.max() - v.min() > 0) ? 1.0 / (v.max() - v.min()) : 1.0; if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1; normSub[idx] = v.mean(); break; case DEMEAN: normMul[idx] = 1; if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1; normSub[idx] = v.mean(); break; case DESCALE: normMul[idx] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0; if (isIWV) for (int j = idx + 1; j < nextNumericIdx(i) + idx; j++) normMul[j] = 1; normSub[idx] = 0; break; default: throw H2O.unimpl(); } assert !Double.isNaN(normMul[idx]); assert !Double.isNaN(normSub[idx]); idx = isIWV ? (idx + nextNumericIdx(i)) : (idx + 1); } }
// TODO: Constant response shouldn't be regression. Need to override getModelCategory() @Override public ModelMetrics.MetricBuilder makeMetricBuilder(String[] domain) { switch (_output.getModelCategory()) { case Binomial: return new ModelMetricsBinomial.MetricBuilderBinomial(domain); case Multinomial: return new ModelMetricsMultinomial.MetricBuilderMultinomial(domain.length, domain); default: throw H2O.unimpl(); } }
// check if n is in this list of numbers // NB: all contiguous ranges have already been checked to have stride 1 boolean has(long v) { assert _isSort; // Only called when already sorted // do something special for negative indexing... that does not involve // allocating arrays, once per list element! if (v < 0) throw H2O.unimpl(); int idx = Arrays.binarySearch(_bases, v); if (idx >= 0) return true; idx = -idx - 2; // See Arrays.binarySearch; returns (-idx-1), we want +idx-1 assert _bases[idx] < v; // Sanity check binary search, AND idx >= 0 return v < _bases[idx] + _cnts[idx] * _strides[idx]; }
ParseWriter streamParse(final InputStream is, final ParseWriter dout) throws IOException { if (!_setup._parse_type._parallelParseSupported) throw H2O.unimpl(); StreamData din = new StreamData(is); int cidx = 0; // FIXME leaving _jobKey == null until sampling is done, this mean entire zip files // FIXME are parsed for parseSetup while (is.available() > 0 && (_jobKey == null || !((Job) DKV.getGet(_jobKey)).isCancelledOrCrashed())) parseChunk(cidx++, din, dout); parseChunk(cidx, din, dout); // Parse the remaining partial 32K buffer return dout; }
/**
 * Overwrites element {@code i} with the values this method uses for a missing entry.
 *
 * <p>Does nothing when {@code isNA(i)} already holds. Otherwise writes 0 / {@code
 * Integer.MIN_VALUE} into {@code _ls} / {@code _xs} when {@code _ls} is present, and NaN into
 * {@code _ds} when that is present. Raises {@code H2O.unimpl()} when {@code _len2} and {@code
 * _len} differ.
 *
 * @return always {@code true}
 */
@Override
boolean setNA_impl(int i) {
  if (!isNA(i)) {
    if (_len2 != _len) throw H2O.unimpl();
    if (_ls != null) {
      _ls[i] = 0;
      _xs[i] = Integer.MIN_VALUE;
    }
    if (_ds != null) {
      _ds[i] = Double.NaN;
    }
  }
  return true;
}
/**
 * Returns {@code true} for STANDARDIZE and DESCALE, and {@code false} for NONE, DEMEAN and
 * NORMALIZE. Any other constant raises {@code H2O.unimpl()}.
 */
public boolean isSigmaScaled() {
  if (this == STANDARDIZE || this == DESCALE) {
    return true;
  }
  if (this == NONE || this == DEMEAN || this == NORMALIZE) {
    return false;
  }
  throw H2O.unimpl();
}
@Override boolean set_impl(int i, double d) { if (_ls != null) { // Flip to using doubles if (_len2 != _len) throw H2O.unimpl(); double ds[] = MemoryManager.malloc8d(_len); for (int j = 0; j < _len; j++) ds[j] = (isNA(j) || isEnum(j)) ? Double.NaN : _ls[j] * Math.pow(10, _xs[j]); _ds = ds; _ls = null; _xs = null; } _ds[i] = d; return true; }
public final boolean canonical() { switch (_family) { case gaussian: return _link == Link.identity; case binomial: return _link == Link.logit; case poisson: return _link == Link.log; case gamma: return _link == Link.inverse; // case tweedie: // return false; default: throw H2O.unimpl(); } }
/**
 * Evaluates the derivative expression associated with the configured {@code link} at {@code x}.
 *
 * <p>No guarding against division by zero is done here. Links without a case below raise
 * {@code H2O.unimpl()}.
 */
public final double linkDeriv(double x) {
  switch (link) {
    case logit: {
      double denom = x * (1 - x);
      return 1 / denom;
    }
    case identity:
      return 1;
    case log:
      return 1.0 / x;
    case inverse: {
      double squared = x * x;
      return -1.0 / squared;
    }
    case tweedie: {
      double power = Math.pow(x, tweedie_link_power - 1);
      return tweedie_link_power * power;
    }
    default:
      throw H2O.unimpl();
  }
}
/**
 * Builds a dense array from this row's bin ids and numeric values.
 *
 * <p>Each position listed in {@code binIds} is set to 1. Numeric values are either scattered by
 * {@code numIds} or, when {@code numIds} is {@code null}, copied as a block starting at {@code
 * numStart()}. When {@code _intercept} is set, one extra trailing slot holding 1 is appended.
 *
 * <p>Raises {@code H2O.unimpl()} when {@code isSparse()} holds or {@code _responses} is positive.
 *
 * @return a new array of length {@code fullN()} plus one if {@code _intercept} is set
 */
public double[] expandCats() {
  if (isSparse() || _responses > 0) throw H2O.unimpl();
  int fullWidth = fullN();
  int numericOffset = numStart();
  int interceptSlots = _intercept ? 1 : 0;
  double[] dense = new double[fullWidth + interceptSlots];
  for (int b = 0; b < nBins; ++b) {
    dense[binIds[b]] = 1;
  }
  if (numIds != null) {
    for (int k = 0; k < nNums; ++k) {
      dense[numIds[k]] = numVals[k];
    }
  } else {
    System.arraycopy(numVals, 0, dense, numericOffset, numVals.length);
  }
  if (_intercept) {
    dense[dense.length - 1] = 1;
  }
  return dense;
}
public final double linkDeriv(double x) { // note: compute an inverse of what R does switch (_link) { case logit: // case multinomial: double div = (x * (1 - x)); if (div < 1e-6) return 1e6; // avoid numerical instability return 1.0 / div; case identity: return 1; case log: return 1.0 / x; case inverse: return -1.0 / (x * x); case tweedie: return _link_power == 0 ? 1.0 / Math.max(2e-16, x) : _link_power * Math.pow(x, _link_power - 1); default: throw H2O.unimpl(); } }
/**
 * Replaces each column's provisional type in {@code _colTypes} with a concrete storage type,
 * chosen from the column's observed {@code _min}/{@code _max} (and {@code _scale} for
 * floating-point columns), and sets {@code _bases} (and in some cases {@code _scale}, {@code
 * _min}, {@code _max}) to match.
 *
 * <p>Column types without a case below raise {@code H2O.unimpl()}.
 */
@SuppressWarnings("fallthrough")
private void calculateColumnEncodings() {
  assert (_bases != null);
  assert (_min != null);
  for (int i = 0; i < _ncolumns; ++i) {
    switch (_colTypes[i]) {
      case UCOL: // only missing values
        _colTypes[i] = BYTE;
        break;
      case ECOL: // enum
        if (_enums[i] == null || _enums[i].isKilled()) {
          // No usable enum for this column: it is typed as STRINGCOL instead
          _max[i] = 0;
          _min[i] = 0;
          _colTypes[i] = STRINGCOL;
        } else {
          // Values range over 0 .. domain size - 1; pick the narrowest type that holds them
          _max[i] = _colDomains[i].length - 1;
          _min[i] = 0;
          if (_max[i] < 256) _colTypes[i] = BYTE;
          else if (_max[i] < 65536) _colTypes[i] = SHORT;
          else _colTypes[i] = INT;
        }
        break;
      case ICOL: // number
        // Pick the narrowest type by value range; the minimum is stored as the base
        if (_max[i] - _min[i] < 255) {
          _colTypes[i] = BYTE;
          _bases[i] = (int) _min[i];
        } else if ((_max[i] - _min[i]) < 65535) {
          _colTypes[i] = SHORT;
          _bases[i] = (int) _min[i];
        } else if (_max[i] - _min[i] < (1l << 32)) {
          _colTypes[i] = INT;
          // NOTE(review): the (int) cast narrows _min[i]; confirm the minimum always fits in an
          // int whenever this branch is taken.
          _bases[i] = (int) _min[i];
        } else _colTypes[i] = LONG;
        break;
      case FCOL:
      case DCOL:
        // Try a scaled small-integer encoding first; each inner break below leaves the switch
        // with that encoding chosen.
        if (_scale[i] >= -4
            && (_max[i] <= powers10i[powers10i.length - 1])
            && (_min[i] >= -powers10i[powers10i.length - 1])) {
          double s = pow10(-_scale[i]);
          double range = s * (_max[i] - _min[i]);
          double base = s * _min[i];
          if (range < 256) {
            if (fitsIntoInt(base)) { // check if base fits into int!
              _colTypes[i] = DBYTE;
              _bases[i] = (int) base;
              break;
            }
          } else if (range < 65535) {
            if (fitsIntoInt(base)) {
              _colTypes[i] = DSHORT;
              _bases[i] = (int) (base);
              break;
            }
          }
        }
        // Scaled encoding not applicable: store as plain FLOAT or DOUBLE with no scale or base
        _scale[i] = 0;
        _bases[i] = 0;
        _colTypes[i] = (_colTypes[i] == FCOL) ? FLOAT : DOUBLE;
        break;
      case TCOL: // Time; millis since jan 1, 1970
        _scale[i] = -1;
        _bases[i] = 0;
        _min[i] = 0.0;
        _max[i] = System.currentTimeMillis();
        _colTypes[i] = LONG;
        break;
      default:
        throw H2O.unimpl();
    }
  }
}
/** Not implemented: always throws the exception produced by {@code H2O.unimpl()}. */
@Override
protected long checksum_impl() {
  // don't really need checksum
  throw H2O.unimpl();
}
/** Not implemented: always throws the exception produced by {@code H2O.unimpl()}. */
@Override
protected String errStr() {
  throw H2O.unimpl();
}
/** Not implemented: always throws the exception produced by {@code H2O.unimpl()}. */
@Override
public String str() {
  throw H2O.unimpl();
}
/**
 * Not implemented: always throws the exception produced by {@code H2O.unimpl()}.
 *
 * @param sb unused
 * @param jname unused
 */
public StringBuilder toJavaStr(StringBuilder sb, String jname) {
  throw H2O.unimpl();
}
/** Not implemented: always throws the exception produced by {@code H2O.unimpl()}. */
public String unique_name() {
  throw H2O.unimpl();
}
/**
 * Not implemented: always throws the exception produced by {@code H2O.unimpl()}.
 *
 * @param sb unused
 * @param jname unused
 */
@Override
public StringBuilder toJavaNum(StringBuilder sb, String jname) {
  throw H2O.unimpl();
}
/**
 * Reads element {@code i} as a long.
 *
 * <p>When {@code _ls} is present the result is {@code _ls[i]} multiplied by {@code
 * DParseTask.pow10i(_xs[i])}; otherwise the double in {@code _ds[i]} is cast to long. Raises
 * {@code H2O.unimpl()} when {@code _len2} and {@code _len} differ.
 */
@Override
public long at8_impl(int i) {
  if (_len2 != _len) throw H2O.unimpl();
  if (_ls != null) {
    return _ls[i] * DParseTask.pow10i(_xs[i]);
  }
  return (long) _ds[i];
}
/**
 * Intended to convert a collection of hyper-parameters into an array of doubles, filling missing
 * hyper-parameters with their default value and failing for values that cannot be represented as
 * a double.
 *
 * <p>Currently not implemented: always throws the exception produced by {@code H2O.unimpl()}.
 *
 * @param hypers a set of {hyper-parameter field name, value} pairs
 * @return the same set as a {@code double[]} (once implemented)
 */
private double[] hyper2double(Map<String, Object> hypers) {
  throw H2O.unimpl();
}
/**
 * Delegates to {@code isNA(i)}; raises {@code H2O.unimpl()} when {@code _len2} and {@code _len}
 * differ.
 */
@Override
public boolean isNA_impl(int i) {
  if (_len2 == _len) {
    return isNA(i);
  }
  throw H2O.unimpl();
}
/** Not implemented: always throws the exception produced by {@code H2O.unimpl()}. */
@Override
protected long checksum_impl() {
  throw H2O.unimpl();
}