/**
 * Creates the value header based on the calculated columns.
 *
 * <p>Also stores the header to its appropriate key. This will be the VA header of the parsed
 * dataset.
 */
private void createValueArrayHeader() {
  assert (_phase == Pass.TWO);
  Column[] cols = new Column[_ncolumns];
  int off = 0;
  for (int i = 0; i < cols.length; ++i) {
    cols[i] = new Column();
    cols[i]._n = _numRows - _invalidValues[i];
    cols[i]._base = _bases[i];
    assert (char) pow10i(-_scale[i]) == pow10i(-_scale[i])
        : "scale out of bounds!, col = " + i + ", scale = " + _scale[i];
    cols[i]._scale = (char) pow10i(-_scale[i]);
    cols[i]._off = (char) off;
    cols[i]._size = (byte) COL_SIZES[_colTypes[i]];
    cols[i]._domain = _colDomains[i];
    cols[i]._max = _max[i];
    cols[i]._min = _min[i];
    cols[i]._mean = _mean[i];
    cols[i]._sigma = _sigma[i];
    cols[i]._name = _colNames[i];
    off += Math.abs(cols[i]._size);
  }
  // let any pending progress reports finish
  DKV.write_barrier();
  // finally make the value array header
  ValueArray ary = new ValueArray(_resultKey, _numRows, off, cols);
  UKV.put(_resultKey, ary.value());
}
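/*
 * A worked example of the offset packing above (a sketch only; the sizes are
 * illustrative, not taken from COL_SIZES): for three columns whose _size values
 * are {1, -4, 8}, the loop assigns _off = {0, 1, 5} and leaves off = 1 + 4 + 8 = 13,
 * i.e. a 13-byte row, since off always advances by Math.abs(_size) regardless of sign.
 */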
@Override
public Value lazyArrayChunk(final Key key) {
  final Key arykey = ValueArray.getArrayKey(key); // From the base file key
  final long off = (_iceRoot != null) ? 0 : ValueArray.getChunkOffset(key); // The offset
  final Path p =
      (_iceRoot != null)
          ? new Path(_iceRoot, getIceName(key, (byte) 'V'))
          : new Path(arykey.toString());
  final Size sz = new Size();
  run(
      new Callable() {
        @Override
        public Object call() throws Exception {
          FileSystem fs = FileSystem.get(p.toUri(), CONF);
          long rem = fs.getFileStatus(p).getLen() - off;
          sz._value = (rem > ValueArray.CHUNK_SZ * 2) ? (int) ValueArray.CHUNK_SZ : (int) rem;
          return null;
        }
      },
      true,
      0);
  Value val = new Value(key, sz._value, Value.HDFS);
  val.setdsk(); // But it's already on disk.
  return val;
}
// Read up to 'len' bytes of Value. The Value should already be persisted to
// disk. A racing delete can trigger a failure where we get a null return,
// but no crash (although one could argue that a racing load & delete is a bug
// no matter what).
@Override
public byte[] load(Value v) {
  long skip = 0;
  Key k = v._key;
  // Convert an arraylet chunk into a long-offset from the base file.
  if (k._kb[0] == Key.ARRAYLET_CHUNK) {
    skip = ValueArray.getChunkOffset(k); // The offset
    k = ValueArray.getArrayKey(k); // From the base file key
  }
  if (k._kb[0] == Key.DVEC) {
    skip = water.fvec.NFSFileVec.chunkOffset(k); // The offset
  }
  try {
    FileInputStream s = null;
    try {
      s = new FileInputStream(getFileForKey(k));
      FileChannel fc = s.getChannel();
      fc.position(skip);
      AutoBuffer ab = new AutoBuffer(fc, true, Value.NFS);
      byte[] b = ab.getA1(v._max);
      ab.close();
      assert v.isPersisted();
      return b;
    } finally {
      if (s != null) s.close();
    }
  } catch (IOException e) {
    // Broken disk / short file?
    H2O.ignore(e);
    return null;
  }
}
protected void testScalarExpression(String expr, double result) {
  Key key = executeExpression(expr);
  ValueArray va = ValueArray.value(key);
  assertEquals(va.numRows(), 1);
  assertEquals(va.numCols(), 1);
  assertEquals(result, va.datad(0, 0), 0.0);
  UKV.remove(key);
}
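// Hypothetical usage of the helper above. The expression string is an assumed
// example of the expression syntax, not taken from this file.
@org.junit.Test
public void testSimpleScalar() {
  testScalarExpression("1+2*3", 7);
}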
// Test kaggle/creditsample-test data
@org.junit.Test
public void kaggle_credit() {
  Key okey = loadAndParseFile("credit.hex", "smalldata/kaggle/creditsample-training.csv.gz");
  UKV.remove(Key.make("smalldata/kaggle/creditsample-training.csv.gz_UNZIPPED"));
  UKV.remove(Key.make("smalldata\\kaggle\\creditsample-training.csv.gz_UNZIPPED"));
  ValueArray val = DKV.get(okey).get();

  // Check the parsed dataset
  final int n = new int[] {4, 2, 1}[ValueArray.LOG_CHK - 20];
  assertEquals("Number of chunks", n, val.chunks());
  assertEquals("Number of rows", 150000, val.numRows());
  assertEquals("Number of cols", 12, val.numCols());

  // setup default values for DRF
  int ntrees = 3;
  int depth = 30;
  int gini = StatType.GINI.ordinal();
  int seed = 42;
  StatType statType = StatType.values()[gini];
  final int cols[] =
      new int[] {0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 1}; // ignore column 6, classify column 1

  // Start the distributed Random Forest
  final Key modelKey = Key.make("model");
  DRFJob result =
      hex.rf.DRF.execute(
          modelKey, cols, val, ntrees, depth, 1024, statType, seed, true, null, -1,
          Sampling.Strategy.RANDOM, 1.0f, null, 0, 0, false);
  // Wait for completion on all nodes
  RFModel model = result.get();

  assertEquals("Number of classes", 2, model.classes());
  assertEquals("Number of trees", ntrees, model.size());

  model.deleteKeys();
  UKV.remove(modelKey);
  UKV.remove(okey);
}
/**
 * Executes phase one of the parser.
 *
 * <p>The first phase detects the encoding and basic statistics of the parsed dataset.
 *
 * <p>For CSV parsers it detects the parser setup and then launches the distributed computation
 * on a per-chunk basis.
 *
 * <p>XLS and XLSX parsers do not work in a distributed way, so they parse the whole dataset.
 *
 * @throws Exception
 */
public void passOne(CsvParser.Setup setup) throws Exception {
  switch (_parserType) {
    case CSV:
      // precompute the parser setup, column setup and other settings
      byte[] bits = _sourceDataset.getFirstBytes(); // Can limit to e.g. 256*1024
      if (setup == null) setup = CsvParser.guessCsvSetup(bits);
      if (setup._data == null) {
        _error = "Unable to determine the separator or number of columns on the dataset";
        return;
      }
      _colNames = setup._data[0];
      setColumnNames(_colNames);
      _skipFirstLine = setup._header;
      // set the separator
      this._sep = setup._separator;
      // if parsing a value array, initialize the nrows array
      if (_sourceDataset._isArray != 0) {
        ValueArray ary = ValueArray.value(_sourceDataset);
        _nrows = new int[(int) ary.chunks()];
      }
      // launch the distributed parser on its chunks.
      this.invoke(_sourceDataset._key);
      break;
    case XLS:
      // XLS parsing is not distributed, just obtain the value stream and
      // run the parser
      CustomParser p = new XlsParser(this);
      p.parse(_sourceDataset._key);
      --_myrows; // do not count the header
      break;
    case XLSX:
      // XLSX parsing is not distributed, just obtain the value stream and
      // run the parser
      CustomParser px = new XlsxParser(this);
      px.parse(_sourceDataset._key);
      break;
    default:
      throw new Error("NOT IMPLEMENTED");
  }
  // convert the per-chunk row counts into cumulative row offsets
  if (_nrows != null) {
    _numRows = 0;
    for (int i = 0; i < _nrows.length; ++i) {
      _numRows += _nrows[i];
      _nrows[i] = _numRows;
    }
  } else {
    _numRows = _myrows;
  }
  // normalize the mean
  for (int i = 0; i < _ncolumns; ++i) _mean[i] = _mean[i] / (_numRows - _invalidValues[i]);
}
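/*
 * Minimal standalone sketch of the row-count normalization above, with
 * illustrative numbers: phase one fills _nrows with per-chunk row counts and
 * the loop converts them in place into cumulative offsets that phase two uses
 * to locate each chunk's first row.
 */
int[] nrows = {100, 250, 120}; // rows parsed per chunk in phase one
int numRows = 0;
for (int i = 0; i < nrows.length; ++i) {
  numRows += nrows[i];
  nrows[i] = numRows; // nrows becomes {100, 350, 470}
}
// in phase two, chunk i starts at row (i == 0 ? 0 : nrows[i - 1])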
@Override
public Value lazyArrayChunk(Key key) {
  Key arykey = ValueArray.getArrayKey(key); // From the base file key
  long off = ValueArray.getChunkOffset(key); // The offset
  long size = getFileForKey(arykey).length();
  long rem = size - off;
  // the last chunk can be fat, so it got packed into the earlier chunk
  if (rem < ValueArray.CHUNK_SZ && off > 0) return null;
  int sz = (rem >= ValueArray.CHUNK_SZ * 2) ? (int) ValueArray.CHUNK_SZ : (int) rem;
  Value val = new Value(key, sz, Value.NFS);
  val.setdsk(); // But it's already on disk.
  return val;
}
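/*
 * Worked example of the sizing rule above (assuming ValueArray.CHUNK_SZ is
 * 1 MB; the file length is illustrative). For a 2.5 MB file:
 *   chunk 0: off = 0.0 MB, rem = 2.5 MB >= 2 MB -> sz = 1.0 MB
 *   chunk 1: off = 1.0 MB, rem = 1.5 MB <  2 MB -> sz = 1.5 MB (the "fat" last chunk)
 *   chunk 2: off = 2.0 MB, rem = 0.5 MB <  1 MB -> null (packed into chunk 1)
 */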
public static Response redirect(JsonObject fromPageResponse, Key rfModelKey) {
  RFModel rfModel = DKV.get(rfModelKey).get();
  ValueArray data = DKV.get(rfModel._dataKey).get();
  return redirect(
      fromPageResponse,
      null,
      rfModelKey,
      rfModel._dataKey,
      rfModel._totalTrees,
      data.numCols() - 1,
      null,
      true,
      false);
}
/*@org.junit.Test*/
public void covtype() {
  // Key okey = loadAndParseFile("covtype.hex", "smalldata/covtype/covtype.20k.data");
  // Key okey = loadAndParseFile("covtype.hex", "../datasets/UCI/UCI-large/covtype/covtype.data");
  // Key okey = loadAndParseFile("covtype.hex", "/home/0xdiag/datasets/standard/covtype.data");
  Key okey = loadAndParseFile("mnist.hex", "smalldata/mnist/mnist8m.10k.csv.gz");
  // Key okey = loadAndParseFile("mnist.hex", "/home/0xdiag/datasets/mnist/mnist8m.csv");
  ValueArray val = UKV.get(okey);

  // setup default values for DRF
  int ntrees = 8;
  int depth = 999;
  int gini = StatType.ENTROPY.ordinal();
  int seed = 42;
  StatType statType = StatType.values()[gini];
  final int cols[] = new int[val.numCols()];
  for (int i = 1; i < cols.length; i++) cols[i] = i - 1;
  cols[cols.length - 1] = 0; // Class is in column 0 for mnist

  // Start the distributed Random Forest
  final Key modelKey = Key.make("model");
  DRFJob result =
      hex.rf.DRF.execute(
          modelKey, cols, val, ntrees, depth, 1024, statType, seed, true, null, -1,
          Sampling.Strategy.RANDOM, 1.0f, null, 0, 0, false);
  // Wait for completion on all nodes
  RFModel model = result.get();

  assertEquals("Number of classes", 10, model.classes());
  assertEquals("Number of trees", ntrees, model.size());

  model.deleteKeys();
  UKV.remove(modelKey);
  UKV.remove(okey);
}
@Override
public byte[] load(final Value v) {
  final byte[] b = MemoryManager.malloc1(v._max);
  long skip = 0;
  Key k = v._key;
  final Path p;
  if (_iceRoot != null) {
    p = new Path(_iceRoot, getIceName(v));
  } else {
    // Convert an arraylet chunk into a long-offset from the base file.
    if (k._kb[0] == Key.ARRAYLET_CHUNK) {
      skip = ValueArray.getChunkOffset(k); // The offset
      k = ValueArray.getArrayKey(k); // From the base file key
      if (k.toString().endsWith(Extensions.HEX)) { // Hex file?
        int value_len = DKV.get(k).memOrLoad().length; // How long is the ValueArray header?
        skip += value_len;
      }
    }
    p = new Path(k.toString());
  }
  final long skip_ = skip;
  run(
      new Callable() {
        @Override
        public Object call() throws Exception {
          FileSystem fs = FileSystem.get(p.toUri(), CONF);
          FSDataInputStream s = null;
          try {
            s = fs.open(p);
            // NOTE:
            // The following line degrades performance of HDFS load from the S3 API:
            //   s.readFully(skip, b, 0, b.length);
            // The Google API's simple seek has better performance: loading a 300MB file via the
            // Google API takes ~14 sec, via s.readFully ~5 min (under the same conditions).
            ByteStreams.skipFully(s, skip_);
            ByteStreams.readFully(s, b);
            assert v.isPersisted();
          } finally {
            Utils.close(s);
          }
          return null;
        }
      },
      true,
      v._max);
  return b;
}
/**
 * Stores the stream to its chunk using the atomic union. After the data from the stream is
 * stored, its memory is freed up.
 */
public void store() {
  assert _ab.eof();
  Key k = ValueArray.getChunkKey(_chunkIndex, _resultKey);
  AtomicUnion u = new AtomicUnion(_ab.bufClose(), _chunkOffset);
  alsoBlockFor(u.fork(k));
  _ab = null; // free mem
}
@Override
protected Response serve() {
  InputStream s = null;
  String urlStr = _url.value();
  try {
    // if( urlStr.startsWith("file://") ) {
    //   urlStr = urlStr.substring("file://".length());
    if (urlStr.startsWith("file:///")) {
      urlStr = urlStr.substring("file:///".length());
      File f = new File(urlStr);
      // urlStr = "file://" + f.getCanonicalPath();
      urlStr = "file:///" + f.getCanonicalPath();
    }
    URL url = new URL(urlStr);
    Key k = _key.value();
    if (k == null) k = Key.make(urlStr);
    s = url.openStream();
    if (s == null) return Response.error("Unable to open stream to URL " + url.toString());
    ValueArray.readPut(k, s);
    JsonObject json = new JsonObject();
    json.addProperty(KEY, k.toString());
    json.addProperty(URL, urlStr);
    Response r = Response.done(json);
    r.setBuilder(KEY, new KeyElementBuilder());
    return r;
  } catch (IllegalArgumentException e) {
    return Response.error("Not a valid key: " + urlStr);
  } catch (IOException e) {
    return Response.error(e);
  } finally {
    Closeables.closeQuietly(s);
  }
}
public static String store2Hdfs(Key srcKey) {
  assert srcKey._kb[0] != Key.ARRAYLET_CHUNK;
  assert PersistHdfs.getPathForKey(srcKey) != null; // Validate key name
  Value v = DKV.get(srcKey);
  if (v == null) return "Key " + srcKey + " not found";
  if (!v.isArray()) { // Simple chunk?
    v.setHdfs(); // Set to HDFS and be done
    return null; // Success
  }

  // For ValueArrays, make the .hex header
  ValueArray ary = v.get();
  String err = PersistHdfs.freeze(srcKey, ary);
  if (err != null) return err;

  // The task managing which chunks to write next,
  // stored in a known key
  TaskStore2HDFS ts = new TaskStore2HDFS(srcKey);
  Key selfKey = ts.selfKey();
  UKV.put(selfKey, ts);

  // Then start writing chunks in-order with the zero chunk
  H2ONode chk0_home = ValueArray.getChunkKey(0, srcKey).home_node();
  RPC.call(ts.chunkHome(), ts);

  // Watch the progress key until it gets removed or an error appears
  long idx = 0;
  while ((ts = UKV.get(selfKey, TaskStore2HDFS.class)) != null) {
    if (ts._indexFrom != idx) {
      System.out.print(" " + idx + "/" + ary.chunks());
      idx = ts._indexFrom;
    }
    if (ts._err != null) { // Found an error?
      UKV.remove(selfKey); // Cleanup & report
      return ts._err;
    }
    try {
      Thread.sleep(100);
    } catch (InterruptedException e) {
    }
  }
  System.out.println(" " + ary.chunks() + "/" + ary.chunks());
  // PersistHdfs.refreshHDFSKeys();
  return null;
}
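// Hypothetical caller sketch for the method above: push an already-parsed
// .hex key to HDFS and report the outcome. The key name is illustrative.
String err = store2Hdfs(Key.make("credit.hex"));
if (err != null) System.err.println("store2Hdfs failed: " + err);
else System.out.println("store2Hdfs finished");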
protected void testKeyValues(
    Key k, double n1, double n2, double n3, double nx3, double nx2, double nx1) {
  ValueArray v = ValueArray.value(k);
  assertEquals(v.datad(0, 0), n1, 0.0);
  assertEquals(v.datad(1, 0), n2, 0.0);
  assertEquals(v.datad(2, 0), n3, 0.0);
  assertEquals(v.datad(v.numRows() - 3, 0), nx3, 0.0);
  assertEquals(v.datad(v.numRows() - 2, 0), nx2, 0.0);
  assertEquals(v.datad(v.numRows() - 1, 0), nx1, 0.0);
}
public static KMeansScore score(KMeansModel model, ValueArray ary) {
  KMeansScore kms = new KMeansScore();
  kms._arykey = ary._key;
  kms._cols = model.columnMapping(ary.colNames());
  kms._clusters = model._clusters;
  kms._normalized = model._normalized;
  kms.invoke(ary._key);
  return kms;
}
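// Hypothetical usage sketch: score an existing KMeansModel against a parsed
// dataset. 'model' and 'ary' are assumed to already exist, and _rows/_dist
// are assumed to hold the reduced per-cluster totals after invoke() returns.
KMeansScore kms = KMeansScore.score(model, ary);
for (int c = 0; c < kms._rows.length; c++)
  System.out.println("cluster " + c + ": rows = " + kms._rows[c] + ", total dist = " + kms._dist[c]);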
@Override
public void map(Key key) {
  _rows = new long[_clusters.length];
  _dist = new double[_clusters.length];
  assert key.home();
  ValueArray va = DKV.get(_arykey).get();
  AutoBuffer bits = va.getChunk(key);
  int rows = va.rpc(ValueArray.getChunkIndex(key));
  double[] values = new double[_cols.length - 1];
  ClusterDist cd = new ClusterDist();
  for (int row = 0; row < rows; row++) {
    KMeans.datad(va, bits, row, _cols, _normalized, values);
    KMeans.closest(_clusters, values, cd);
    _rows[cd._cluster]++;
    _dist[cd._cluster] += cd._dist;
  }
  _arykey = null;
  _cols = null;
  _clusters = null;
}
@Override
public byte[] load(final Value v) {
  final byte[] b = MemoryManager.malloc1(v._max);
  long skip = 0;
  Key k = v._key;
  if (k._kb[0] == Key.ARRAYLET_CHUNK) {
    skip = ValueArray.getChunkOffset(k); // The offset
    k = ValueArray.getArrayKey(k); // From the base file key
  } else if (k._kb[0] == Key.DVEC) {
    skip = water.fvec.NFSFileVec.chunkOffset(k); // The offset
  }
  final Path p =
      _iceRoot == null ? new Path(getPathForKey(k)) : new Path(_iceRoot, getIceName(v));
  final long skip_ = skip;
  run(
      new Callable() {
        @Override
        public Object call() throws Exception {
          FileSystem fs = FileSystem.get(p.toUri(), CONF);
          FSDataInputStream s = null;
          try {
            s = fs.open(p);
            // NOTE:
            // The following line degrades performance of HDFS load from the S3 API:
            //   s.readFully(skip, b, 0, b.length);
            // The Google API's simple seek has better performance: loading a 300MB file via the
            // Google API takes ~14 sec, via s.readFully ~5 min (under the same conditions).
            ByteStreams.skipFully(s, skip_);
            ByteStreams.readFully(s, b);
            assert v.isPersisted();
          } finally {
            Utils.close(s);
          }
          return null;
        }
      },
      true,
      v._max);
  return b;
}
@Override
public void compute() {
  String path = null; // getPathFromValue(val);
  ValueArray ary = ValueArray.value(_arykey);
  Key self = selfKey();
  while (_indexFrom < ary.chunks()) {
    Key ckey = ary.getChunkKey(_indexFrom++);
    if (!ckey.home()) { // Next chunk not At Home?
      RPC.call(chunkHome(), this); // Hand the baton off to the next node/chunk
      return;
    }
    Value val = DKV.get(ckey); // It IS home, so get the data
    _err = PersistHdfs.appendChunk(_arykey, val);
    if (_err != null) return;
    UKV.put(self, this); // Update the progress/self key
  }
  // We did the last chunk. Removing the selfKey is the signal to the web
  // thread that we are All Done.
  UKV.remove(self);
}
private void updateClusters(
    int[] clusters, int count, long chunk, long numrows, int rpc, long updatedRow) {
  final int offset = (int) (updatedRow - (rpc * chunk));
  final Key chunkKey = ValueArray.getChunkKey(chunk, _job.dest());
  final int[] message;
  if (count == clusters.length) message = clusters;
  else {
    message = new int[count];
    System.arraycopy(clusters, 0, message, 0, message.length);
  }
  final int rows = ValueArray.rpc(chunk, rpc, numrows);
  new Atomic() {
    @Override
    public Value atomic(Value val) {
      assert val == null || val._key.equals(chunkKey);
      AutoBuffer b = new AutoBuffer(rows * ROW_SIZE);
      if (val != null) b._bb.put(val.memOrLoad());
      for (int i = 0; i < message.length; i++) b.put4((offset + i) * 4, message[i]);
      b.position(b.limit());
      return new Value(chunkKey, b.buf());
    }
  }.invoke(chunkKey);
}
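/*
 * Worked example of the offset arithmetic above (illustrative numbers,
 * assuming 4-byte rows to match the put4 call): for a call with chunk = 2,
 * rpc = 1000 rows per destination chunk and updatedRow = 2500,
 *   offset = 2500 - 1000 * 2 = 500
 * so message[i], the cluster of global row 2500 + i, lands at byte
 * (500 + i) * 4 of destination chunk 2.
 */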
public static Job run(final Key dest, final KMeansModel model, final ValueArray ary) {
  final ChunkProgressJob job = new ChunkProgressJob(ary.chunks(), dest);
  new ValueArray(dest, 0).delete_and_lock(job.self());
  final H2OCountedCompleter fjtask =
      new H2OCountedCompleter() {
        @Override
        public void compute2() {
          KMeansApply kms = new KMeansApply();
          kms._job = job;
          kms._arykey = ary._key;
          kms._cols = model.columnMapping(ary.colNames());
          kms._clusters = model._clusters;
          kms._normalized = model._normalized;
          kms.invoke(ary._key);

          Column c = new Column();
          c._name = Constants.RESPONSE;
          c._size = ROW_SIZE;
          c._scale = 1;
          c._min = 0;
          c._max = model._clusters.length;
          c._mean = Double.NaN;
          c._sigma = Double.NaN;
          c._domain = null;
          c._n = ary.numRows();
          ValueArray res = new ValueArray(dest, ary.numRows(), c._size, new Column[] {c});
          res.unlock(job.self());
          job.remove();
          tryComplete();
        }

        @Override
        public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
          job.onException(ex);
          return super.onExceptionalCompletion(ex, caller);
        }
      };
  job.start(fjtask);
  H2O.submitTask(fjtask);
  return job;
}
private H2ONode chunkHome() {
  return ValueArray.getChunkKey(_indexFrom, _arykey).home_node();
}
/**
 * Creates a new ValueArray with classes. Unfortunately the new ValueArray is not aligned with
 * the source one, so results have to be sent to each chunk owner using Atomic.
 */
@Override
public void map(Key key) {
  assert key.home();
  if (Job.isRunning(_job.self())) {
    ValueArray va = DKV.get(_arykey).get();
    AutoBuffer bits = va.getChunk(key);
    long startRow = va.startRow(ValueArray.getChunkIndex(key));
    int rows = va.rpc(ValueArray.getChunkIndex(key));
    int rpc = (int) (ValueArray.CHUNK_SZ / ROW_SIZE);
    long chunk = ValueArray.chknum(startRow, va.numRows(), ROW_SIZE);
    long updatedChk = chunk;
    long updatedRow = startRow;
    double[] values = new double[_cols.length - 1];
    ClusterDist cd = new ClusterDist();
    int[] clusters = new int[rows];
    int count = 0;
    for (int row = 0; row < rows; row++) {
      KMeans.datad(va, bits, row, _cols, _normalized, values);
      KMeans.closest(_clusters, values, cd);
      chunk = ValueArray.chknum(startRow + row, va.numRows(), ROW_SIZE);
      if (chunk != updatedChk) {
        updateClusters(clusters, count, updatedChk, va.numRows(), rpc, updatedRow);
        updatedChk = chunk;
        updatedRow = startRow + row;
        count = 0;
      }
      clusters[count++] = cd._cluster;
    }
    if (count > 0) updateClusters(clusters, count, chunk, va.numRows(), rpc, updatedRow);
    _job.updateProgress(1);
  }
  _job = null;
  _arykey = null;
  _cols = null;
  _clusters = null;
}
public void testDataFrameStructure(Key k, int rows, int cols) {
  ValueArray v = ValueArray.value(k);
  assertEquals(v.numRows(), rows);
  assertEquals(v.numCols(), cols);
}
/**
 * Simple GLM wrapper to enable launching GLM from the command line.
 *
 * <p>Example input: java -jar target/h2o.jar -name=test -runMethod water.util.GLMRunner
 * -file=smalldata/logreg/prostate.csv -y=CAPSULE -family=binomial
 *
 * @param args
 * @throws InterruptedException
 */
public static void main(String[] args) throws InterruptedException {
  try {
    GLMArgs ARGS = new GLMArgs();
    new Arguments(args).extract(ARGS);
    System.out.println("==================<GLMRunner START>===================");
    ValueArray ary = Utils.loadAndParseKey(ARGS.file);
    int ycol;
    try {
      ycol = Integer.parseInt(ARGS.y);
    } catch (NumberFormatException e) {
      ycol = ary.getColumnIds(new String[] {ARGS.y})[0];
    }
    int ncols = ary.numCols();
    if (ycol < 0 || ycol >= ary.numCols()) {
      System.err.println("invalid y column: " + ycol);
      H2O.exit(-1);
    }
    int[] xcols;
    if (ARGS.xs.equalsIgnoreCase("all")) {
      xcols = new int[ncols - 1];
      for (int i = 0; i < ycol; ++i) xcols[i] = i;
      for (int i = ycol; i < ncols - 1; ++i) xcols[i] = i + 1;
    } else {
      System.out.println("xs = " + ARGS.xs);
      String[] names = ARGS.xs.split(",");
      xcols = new int[names.length];
      try {
        for (int i = 0; i < names.length; ++i) xcols[i] = Integer.valueOf(names[i]);
      } catch (NumberFormatException e) {
        xcols = ary.getColumnIds(ARGS.xs.split(","));
      }
    }
    for (int x : xcols)
      if (x < 0) {
        System.err.println("Invalid predictor specification " + ARGS.xs);
        H2O.exit(-1);
      }
    GLMJob j =
        DGLM.startGLMJob(
            DGLM.getData(ary, xcols, ycol, null, true),
            new ADMMSolver(ARGS.lambda, ARGS._alpha),
            new GLMParams(Family.valueOf(ARGS.family)),
            null,
            ARGS.xval,
            true);
    System.out.print("[GLM] computing model...");
    int progress = 0;
    while (!j.isDone()) {
      int p = (int) (100 * j.progress());
      int dots = p - progress;
      progress = p;
      for (int i = 0; i < dots; ++i) System.out.print('.');
      Thread.sleep(250);
    }
    Log.debug(Sys.GENLM, "DONE.");
    GLMModel m = j.get();
    String[] colnames = ary.colNames();
    System.out.println("Intercept" + " = " + m._beta[ncols - 1]);
    for (int i = 0; i < xcols.length; ++i) {
      System.out.println(colnames[i] + " = " + m._beta[i]);
    }
  } catch (Throwable t) {
    Log.err(t);
  } finally {
    // we're done; shut down the cloud
    Log.debug(Sys.GENLM, "==================<GLMRunner DONE>===================");
    UDPRebooted.suicide(UDPRebooted.T.shutdown, H2O.SELF);
  }
}
/**
 * Map function for distributed parsing of the CSV files.
 *
 * <p>In the first phase it calculates the min, max, means, encodings and other statistics of
 * the dataset, and determines the number of columns.
 *
 * <p>The second pass then encodes the parsed dataset to the result key, splitting it into
 * equal-sized chunks.
 */
@Override
public void map(Key key) {
  try {
    Key aryKey = null;
    boolean arraylet = key._kb[0] == Key.ARRAYLET_CHUNK;
    boolean skipFirstLine = _skipFirstLine;
    if (arraylet) {
      aryKey = ValueArray.getArrayKey(key);
      _chunkId = ValueArray.getChunkIndex(key);
      skipFirstLine = skipFirstLine || (ValueArray.getChunkIndex(key) != 0);
    }
    switch (_phase) {
      case ONE:
        assert (_ncolumns != 0);
        // initialize the column statistics
        phaseOneInitialize();
        // perform the parse
        CsvParser p = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
        p.parse(key);
        if (arraylet) {
          long idx = ValueArray.getChunkIndex(key);
          int idx2 = (int) idx;
          assert idx2 == idx;
          assert (_nrows[idx2] == 0)
              : idx + ": " + Arrays.toString(_nrows) + " (" + _nrows[idx2] + " -- " + _myrows + ")";
          _nrows[idx2] = _myrows;
        }
        break;
      case TWO:
        assert (_ncolumns != 0);
        // initialize statistics - invalid rows, sigma and row size
        phaseTwoInitialize();
        // calculate the first row and the number of rows to parse
        int firstRow = 0;
        int lastRow = _myrows;
        _myrows = 0;
        if (arraylet) {
          long origChunkIdx = ValueArray.getChunkIndex(key);
          firstRow = (origChunkIdx == 0) ? 0 : _nrows[(int) origChunkIdx - 1];
          lastRow = _nrows[(int) origChunkIdx];
        }
        int rowsToParse = lastRow - firstRow;
        // create the output streams
        _outputStreams2 = createRecords(firstRow, rowsToParse);
        assert (_outputStreams2.length > 0);
        _ab = _outputStreams2[0].initialize();
        // perform the second parse pass
        CsvParser p2 = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
        p2.parse(key);
        // store the last stream if not stored during the parse
        if (_ab != null) _outputStreams2[_outputIdx].store();
        break;
      default:
        assert (false);
    }
    ParseStatus.update(_resultKey, DKV.get(key).length(), _phase);
  } catch (Exception e) {
    e.printStackTrace();
    _error = e.getMessage();
  }
}