private static void addFolder(FileSystem fs, Path p, JsonArray succeeded, JsonArray failed) {
  try {
    if (fs == null) return;
    for (FileStatus file : fs.listStatus(p)) {
      Path pfs = file.getPath();
      if (file.isDir()) {
        // Recurse into sub-directories.
        addFolder(fs, pfs, succeeded, failed);
      } else {
        Key k = Key.make(pfs.toString());
        long size = file.getLen();
        Value val = null;
        if (pfs.getName().endsWith(Extensions.JSON)) {
          // JSON file: re-instantiate the saved model it describes via reflection.
          JsonParser parser = new JsonParser();
          JsonObject json = parser.parse(new InputStreamReader(fs.open(pfs))).getAsJsonObject();
          JsonElement v = json.get(Constants.VERSION);
          if (v == null) throw new RuntimeException("Missing version");
          JsonElement type = json.get(Constants.TYPE);
          if (type == null) throw new RuntimeException("Missing type");
          Class c = Class.forName(type.getAsString());
          OldModel model = (OldModel) c.newInstance();
          model.fromJson(json);
        } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
          FSDataInputStream s = fs.open(pfs);
          int sz = (int) Math.min(1L << 20, size); // Read up to the 1st meg
          byte[] mem = MemoryManager.malloc1(sz);
          s.readFully(mem);
          // Convert to a ValueArray (hope it fits in 1Meg!)
          ValueArray ary = new ValueArray(k, 0).read(new AutoBuffer(mem));
          val = new Value(k, ary, Value.HDFS);
        } else if (size >= 2 * ValueArray.CHUNK_SZ) {
          // ValueArray byte wrapper over a large file
          val = new Value(k, new ValueArray(k, size), Value.HDFS);
        } else {
          val = new Value(k, (int) size, Value.HDFS); // Plain Value
          val.setdsk();
        }
        DKV.put(k, val);
        Log.info("PersistHdfs: DKV.put(" + k + ")");
        // Record the imported file for the response.
        JsonObject o = new JsonObject();
        o.addProperty(Constants.KEY, k.toString());
        o.addProperty(Constants.FILE, pfs.toString());
        o.addProperty(Constants.VALUE_SIZE, file.getLen());
        succeeded.add(o);
      }
    }
  } catch (Exception e) {
    // Report the failing path instead of aborting the whole import.
    Log.err(e);
    JsonObject o = new JsonObject();
    o.addProperty(Constants.FILE, p.toString());
    o.addProperty(Constants.ERROR, e.getMessage());
    failed.add(o);
  }
}
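Stripped of the H2O-specific Key/Value/DKV handling, the walk above is a plain recursive pass over Hadoop's FileSystem listing API. The standalone sketch below is not part of the source; the HdfsWalk class, the namenode URI and the /data path are placeholders.

import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsWalk {
  static void walk(FileSystem fs, Path p) throws IOException {
    for (FileStatus file : fs.listStatus(p)) {
      if (file.isDirectory()) {       // isDir() on older Hadoop versions
        walk(fs, file.getPath());     // recurse into sub-directories
      } else {
        System.out.println(file.getPath() + " (" + file.getLen() + " bytes)");
      }
    }
  }

  public static void main(String[] args) throws IOException {
    // Placeholder namenode URI and directory; adjust for a real cluster.
    FileSystem fs = FileSystem.get(URI.create("hdfs://namenode:8020/"), new Configuration());
    walk(fs, new Path("/data"));
  }
}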
static {
  InputStream resource = Boot._init.getResource2("/page.html");
  try {
    _htmlTemplate = new String(ByteStreams.toByteArray(resource)).replace("%cloud_name", H2O.NAME);
  } catch (NullPointerException e) {
    Log.err(e);
    Log.die("page.html not found in resources.");
  } catch (Exception e) {
    Log.err(e);
    Log.die(e.getMessage());
  } finally {
    Closeables.closeQuietly(resource);
  }
}
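The same idea, sketched outside the Boot loader: read a classpath resource with Guava's ByteStreams and substitute the cloud-name placeholder. The TemplateLoader class below is hypothetical and uses try-with-resources where the code above uses Closeables.closeQuietly; only the resource name and the %cloud_name placeholder are taken from the original.

import java.io.InputStream;
import com.google.common.io.ByteStreams;

public class TemplateLoader {
  // Load a classpath resource as a String and fill in the cloud name.
  static String load(String name, String cloudName) throws Exception {
    try (InputStream in = TemplateLoader.class.getResourceAsStream(name)) {
      if (in == null) throw new IllegalStateException(name + " not found in resources.");
      return new String(ByteStreams.toByteArray(in)).replace("%cloud_name", cloudName);
    }
  }

  public static void main(String[] args) throws Exception {
    System.out.println(load("/page.html", "my-cloud"));
  }
}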
@Override
public void reduce(DRemoteTask drt) {
  try {
    DParseTask other = (DParseTask) drt;
    if (_sigma == null) _sigma = other._sigma;
    if (_invalidValues == null) {
      // This task has no statistics yet: adopt the other task's arrays wholesale.
      _enums = other._enums;
      _min = other._min;
      _max = other._max;
      _mean = other._mean;
      _sigma = other._sigma;
      _scale = other._scale;
      _colTypes = other._colTypes;
      _nrows = other._nrows;
      _invalidValues = other._invalidValues;
    } else {
      if (_phase == Pass.ONE) {
        // Merge the per-chunk row counts and per-column statistics from pass one.
        if (_nrows != other._nrows)
          for (int i = 0; i < _nrows.length; ++i) _nrows[i] += other._nrows[i];
        for (int i = 0; i < _ncolumns; ++i) {
          if (_enums[i] != other._enums[i]) _enums[i].merge(other._enums[i]);
          if (other._min[i] < _min[i]) _min[i] = other._min[i];
          if (other._max[i] > _max[i]) _max[i] = other._max[i];
          if (other._scale[i] < _scale[i]) _scale[i] = other._scale[i];
          if (other._colTypes[i] > _colTypes[i]) _colTypes[i] = other._colTypes[i];
          _mean[i] += other._mean[i];
        }
      } else if (_phase == Pass.TWO) {
        // Pass two only accumulates the per-column variance sums.
        for (int i = 0; i < _ncolumns; ++i) _sigma[i] += other._sigma[i];
      } else
        assert false : "unexpected _phase value: " + _phase;
      for (int i = 0; i < _ncolumns; ++i) _invalidValues[i] += other._invalidValues[i];
    }
    _myrows += other._myrows;
    // Concatenate error messages from both tasks, if any.
    if (_error == null) _error = other._error;
    else if (other._error != null) _error = _error + "\n" + other._error;
  } catch (Exception e) {
    e.printStackTrace();
  }
}
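At its core, the reduce above is an associative merge of per-column statistics, so the result does not depend on the order in which chunk results are combined. The sketch below illustrates that merge in isolation; ColumnStats and its fields are hypothetical names, and it assumes (as the `_mean[i] += other._mean[i]` line suggests) that the mean slots hold running sums that are normalized only after all merges.

// Illustrative sketch, not DParseTask: an associative per-column statistics merge.
public class ColumnStats {
  final double[] min, max, sum;   // per-column minimum, maximum and running sum
  final long[] invalid;           // per-column count of unparseable values
  long rows;

  ColumnStats(int ncols) {
    min = new double[ncols]; max = new double[ncols]; sum = new double[ncols];
    invalid = new long[ncols];
    java.util.Arrays.fill(min, Double.POSITIVE_INFINITY);
    java.util.Arrays.fill(max, Double.NEGATIVE_INFINITY);
  }

  void merge(ColumnStats other) {
    for (int i = 0; i < min.length; ++i) {
      if (other.min[i] < min[i]) min[i] = other.min[i];
      if (other.max[i] > max[i]) max[i] = other.max[i];
      sum[i] += other.sum[i];            // sums add; the mean is finalized once at the end
      invalid[i] += other.invalid[i];
    }
    rows += other.rows;
  }

  double mean(int col) { return sum[col] / rows; }
}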
/**
 * Map function for distributed parsing of CSV files.
 *
 * <p>The first pass calculates the min, max, mean, encodings and other statistics of the
 * dataset, and determines the number of columns.
 *
 * <p>The second pass then encodes the parsed dataset into the result key, splitting it into
 * equal-sized chunks.
 */
@Override
public void map(Key key) {
  try {
    Key aryKey = null;
    boolean arraylet = key._kb[0] == Key.ARRAYLET_CHUNK;
    boolean skipFirstLine = _skipFirstLine;
    if (arraylet) {
      aryKey = ValueArray.getArrayKey(key);
      _chunkId = ValueArray.getChunkIndex(key);
      // Only the very first chunk keeps its header line.
      skipFirstLine = skipFirstLine || (ValueArray.getChunkIndex(key) != 0);
    }
    switch (_phase) {
      case ONE:
        assert (_ncolumns != 0);
        // initialize the column statistics
        phaseOneInitialize();
        // perform the parse
        CsvParser p = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
        p.parse(key);
        if (arraylet) {
          // Record how many rows this chunk contributed.
          long idx = ValueArray.getChunkIndex(key);
          int idx2 = (int) idx;
          assert idx2 == idx;
          assert (_nrows[idx2] == 0)
              : idx + ": " + Arrays.toString(_nrows) + " (" + _nrows[idx2] + " -- " + _myrows + ")";
          _nrows[idx2] = _myrows;
        }
        break;
      case TWO:
        assert (_ncolumns != 0);
        // initialize statistics - invalid rows, sigma and row size
        phaseTwoInitialize();
        // calculate the first row and the number of rows to parse
        int firstRow = 0;
        int lastRow = _myrows;
        _myrows = 0;
        if (arraylet) {
          long origChunkIdx = ValueArray.getChunkIndex(key);
          firstRow = (origChunkIdx == 0) ? 0 : _nrows[(int) origChunkIdx - 1];
          lastRow = _nrows[(int) origChunkIdx];
        }
        int rowsToParse = lastRow - firstRow;
        // create the output streams
        _outputStreams2 = createRecords(firstRow, rowsToParse);
        assert (_outputStreams2.length > 0);
        _ab = _outputStreams2[0].initialize();
        // perform the second parse pass
        CsvParser p2 = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
        p2.parse(key);
        // store the last stream if not stored during the parse
        if (_ab != null) _outputStreams2[_outputIdx].store();
        break;
      default:
        assert (false);
    }
    ParseStatus.update(_resultKey, DKV.get(key).length(), _phase);
  } catch (Exception e) {
    e.printStackTrace();
    _error = e.getMessage();
  }
}
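Pass two looks up its row window as `_nrows[chunk - 1]` .. `_nrows[chunk]`, which only works if the per-chunk counts recorded in pass one have been converted to cumulative (prefix-sum) counts before the second pass runs. The toy sketch below, with made-up values and the hypothetical ChunkOffsets class, shows that conversion and lookup.

// Illustrative sketch, not DParseTask: deriving each chunk's row window from per-chunk counts.
public class ChunkOffsets {
  public static void main(String[] args) {
    long[] nrows = {100, 250, 80};   // rows parsed per chunk in pass one (toy values)
    // Prefix-sum in place so nrows[i] becomes the number of rows up to and including chunk i,
    // which is what the pass-two lookup expects.
    for (int i = 1; i < nrows.length; ++i) nrows[i] += nrows[i - 1];
    int chunk = 1;                   // stands in for ValueArray.getChunkIndex(key)
    long firstRow = (chunk == 0) ? 0 : nrows[chunk - 1];
    long lastRow = nrows[chunk];
    System.out.println("chunk " + chunk + " writes rows [" + firstRow + ", " + lastRow + ")");
  }
}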