// Read up to 'len' bytes of Value.  Value should already be persisted to
// disk.  A racing delete can trigger a failure where we get a null return,
// but no crash (although one could argue that a racing load&delete is a bug
// no matter what).
@Override
public byte[] load(Value v) {
  long skip = 0;
  Key k = v._key;
  // Convert an arraylet chunk into a long-offset from the base file.
  if (k._kb[0] == Key.ARRAYLET_CHUNK) {
    skip = ValueArray.getChunkOffset(k); // The offset
    k = ValueArray.getArrayKey(k);       // From the base file key
  }
  if (k._kb[0] == Key.DVEC) {
    skip = water.fvec.NFSFileVec.chunkOffset(k); // The offset
  }
  try {
    FileInputStream s = null;
    try {
      s = new FileInputStream(getFileForKey(k));
      FileChannel fc = s.getChannel();
      fc.position(skip);
      AutoBuffer ab = new AutoBuffer(fc, true, Value.NFS);
      byte[] b = ab.getA1(v._max);
      ab.close();
      assert v.isPersisted();
      return b;
    } finally {
      if (s != null) s.close();
    }
  } catch (IOException e) {
    // Broken disk / short-file???
    H2O.ignore(e);
    return null;
  }
}
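
// Illustrative sketch (not part of the original class): the position-then-read
// pattern used above, expressed with only standard java.io/java.nio types.
// The helper name and arguments are hypothetical; H2O's Key/Value/AutoBuffer
// machinery is replaced by a plain byte[] read so the chunk-offset handling is
// visible in isolation.
static byte[] readChunkSketch(java.io.File f, long offset, int maxLen) throws java.io.IOException {
  try (java.io.FileInputStream s = new java.io.FileInputStream(f)) {
    java.nio.channels.FileChannel fc = s.getChannel();
    fc.position(offset);                              // seek to the chunk's byte offset
    long rem = fc.size() - offset;                    // bytes left in the file from that offset
    int len = (int) Math.min(maxLen, Math.max(rem, 0));
    byte[] b = new byte[len];
    java.nio.ByteBuffer bb = java.nio.ByteBuffer.wrap(b);
    while (bb.hasRemaining() && fc.read(bb) >= 0) ;   // fill the buffer or stop at EOF
    return b;
  }
}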
@Override
public Value lazyArrayChunk(final Key key) {
  final Key arykey = ValueArray.getArrayKey(key); // From the base file key
  final long off = (_iceRoot != null) ? 0 : ValueArray.getChunkOffset(key); // The offset
  final Path p = (_iceRoot != null)
      ? new Path(_iceRoot, getIceName(key, (byte) 'V'))
      : new Path(arykey.toString());
  final Size sz = new Size();
  run(new Callable() {
    @Override
    public Object call() throws Exception {
      FileSystem fs = FileSystem.get(p.toUri(), CONF);
      long rem = fs.getFileStatus(p).getLen() - off;
      sz._value = (rem > ValueArray.CHUNK_SZ * 2) ? (int) ValueArray.CHUNK_SZ : (int) rem;
      return null;
    }
  }, true, 0);
  Value val = new Value(key, sz._value, Value.HDFS);
  val.setdsk(); // But it's already on disk.
  return val;
}
@Override
public Value lazyArrayChunk(Key key) {
  Key arykey = ValueArray.getArrayKey(key);  // From the base file key
  long off = ValueArray.getChunkOffset(key); // The offset
  long size = getFileForKey(arykey).length();
  long rem = size - off;
  // The last chunk can be fat, so it got packed into the earlier chunk.
  if (rem < ValueArray.CHUNK_SZ && off > 0) return null;
  int sz = (rem >= ValueArray.CHUNK_SZ * 2) ? (int) ValueArray.CHUNK_SZ : (int) rem;
  Value val = new Value(key, sz, Value.NFS);
  val.setdsk(); // But it's already on disk.
  return val;
}
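
// Illustrative sketch (hypothetical helper, not in the original source): the
// chunk-sizing rule used by both lazyArrayChunk variants above, isolated as a
// pure function.  A chunk gets the standard CHUNK_SZ unless fewer than two
// full chunks remain, in which case the tail is folded into one "fat" final
// chunk; a short tail at a non-zero offset returns -1 here (the method above
// returns null) because that data already belongs to the previous chunk.
static int chunkSizeSketch(long fileLen, long off, long chunkSz) {
  long rem = fileLen - off;                 // bytes from this offset to end-of-file
  if (rem < chunkSz && off > 0) return -1;  // short tail was packed into the prior chunk
  return (rem >= chunkSz * 2) ? (int) chunkSz : (int) rem; // otherwise a normal or fat chunk
}
// Example: with chunkSz = 4MB and a 9MB file, offsets 0 and 4MB each yield
// 4MB and 5MB respectively; there is no separate 1MB tail chunk.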
@Override
public byte[] load(final Value v) {
  final byte[] b = MemoryManager.malloc1(v._max);
  long skip = 0;
  Key k = v._key;
  final Path p;
  if (_iceRoot != null) {
    p = new Path(_iceRoot, getIceName(v));
  } else {
    // Convert an arraylet chunk into a long-offset from the base file.
    if (k._kb[0] == Key.ARRAYLET_CHUNK) {
      skip = ValueArray.getChunkOffset(k); // The offset
      k = ValueArray.getArrayKey(k);       // From the base file key
      if (k.toString().endsWith(Extensions.HEX)) { // Hex file?
        int value_len = DKV.get(k).memOrLoad().length; // How long is the ValueArray header?
        skip += value_len;
      }
    }
    p = new Path(k.toString());
  }
  final long skip_ = skip;
  run(new Callable() {
    @Override
    public Object call() throws Exception {
      FileSystem fs = FileSystem.get(p.toUri(), CONF);
      FSDataInputStream s = null;
      try {
        s = fs.open(p);
        // NOTE: the following line degrades performance of HDFS load from the S3 API:
        //   s.readFully(skip, b, 0, b.length);
        // Google API's simple seek has better performance.
        // Load of a 300MB file via Google API ~ 14sec, via s.readFully ~ 5min
        // (under the same conditions).
        ByteStreams.skipFully(s, skip_);
        ByteStreams.readFully(s, b);
        assert v.isPersisted();
      } finally {
        Utils.close(s);
      }
      return null;
    }
  }, true, v._max);
  return b;
}
@Override
public byte[] load(final Value v) {
  final byte[] b = MemoryManager.malloc1(v._max);
  long skip = 0;
  Key k = v._key;
  if (k._kb[0] == Key.ARRAYLET_CHUNK) {
    skip = ValueArray.getChunkOffset(k); // The offset
    k = ValueArray.getArrayKey(k);       // From the base file key
  } else if (k._kb[0] == Key.DVEC) {
    skip = water.fvec.NFSFileVec.chunkOffset(k); // The offset
  }
  final Path p = _iceRoot == null ? new Path(getPathForKey(k)) : new Path(_iceRoot, getIceName(v));
  final long skip_ = skip;
  run(new Callable() {
    @Override
    public Object call() throws Exception {
      FileSystem fs = FileSystem.get(p.toUri(), CONF);
      FSDataInputStream s = null;
      try {
        s = fs.open(p);
        // NOTE: the following line degrades performance of HDFS load from the S3 API:
        //   s.readFully(skip, b, 0, b.length);
        // Google API's simple seek has better performance.
        // Load of a 300MB file via Google API ~ 14sec, via s.readFully ~ 5min
        // (under the same conditions).
        ByteStreams.skipFully(s, skip_);
        ByteStreams.readFully(s, b);
        assert v.isPersisted();
      } finally {
        Utils.close(s);
      }
      return null;
    }
  }, true, v._max);
  return b;
}
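
// Illustrative sketch (hypothetical, JDK-only): the skip-then-sequential-read
// pattern used by both HDFS load methods above, without Guava's ByteStreams or
// the Hadoop FileSystem API.  The point mirrors the NOTE in those methods:
// skip forward on the already-open stream, then read the buffer fully, rather
// than issuing a positioned readFully.
static void skipThenReadFully(java.io.InputStream in, long skip, byte[] dst)
    throws java.io.IOException {
  long remaining = skip;
  while (remaining > 0) {                       // InputStream.skip may skip fewer bytes than asked
    long n = in.skip(remaining);
    if (n <= 0) {                               // fall back to a single-byte read near EOF
      if (in.read() < 0) throw new java.io.EOFException("hit EOF while skipping");
      n = 1;
    }
    remaining -= n;
  }
  int off = 0;
  while (off < dst.length) {                    // loop until the buffer is completely filled
    int n = in.read(dst, off, dst.length - off);
    if (n < 0) throw new java.io.EOFException("hit EOF while reading");
    off += n;
  }
}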
/**
 * Map function for distributed parsing of the CSV files.
 *
 * <p>The first pass calculates the min, max, means, encodings and other statistics about the
 * dataset, and determines the number of columns.
 *
 * <p>The second pass then encodes the parsed dataset to the result key, splitting it into
 * equal-sized chunks.
 */
@Override
public void map(Key key) {
  try {
    Key aryKey = null;
    boolean arraylet = key._kb[0] == Key.ARRAYLET_CHUNK;
    boolean skipFirstLine = _skipFirstLine;
    if (arraylet) {
      aryKey = ValueArray.getArrayKey(key);
      _chunkId = ValueArray.getChunkIndex(key);
      skipFirstLine = skipFirstLine || (ValueArray.getChunkIndex(key) != 0);
    }
    switch (_phase) {
      case ONE:
        assert (_ncolumns != 0);
        // Initialize the column statistics.
        phaseOneInitialize();
        // Perform the parse.
        CsvParser p = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
        p.parse(key);
        if (arraylet) {
          long idx = ValueArray.getChunkIndex(key);
          int idx2 = (int) idx;
          assert idx2 == idx;
          assert (_nrows[idx2] == 0)
              : idx + ": " + Arrays.toString(_nrows) + " (" + _nrows[idx2] + " -- " + _myrows + ")";
          _nrows[idx2] = _myrows;
        }
        break;
      case TWO:
        assert (_ncolumns != 0);
        // Initialize statistics - invalid rows, sigma and row size.
        phaseTwoInitialize();
        // Calculate the first row and the number of rows to parse.
        int firstRow = 0;
        int lastRow = _myrows;
        _myrows = 0;
        if (arraylet) {
          long origChunkIdx = ValueArray.getChunkIndex(key);
          firstRow = (origChunkIdx == 0) ? 0 : _nrows[(int) origChunkIdx - 1];
          lastRow = _nrows[(int) origChunkIdx];
        }
        int rowsToParse = lastRow - firstRow;
        // Create the output streams.
        _outputStreams2 = createRecords(firstRow, rowsToParse);
        assert (_outputStreams2.length > 0);
        _ab = _outputStreams2[0].initialize();
        // Perform the second parse pass.
        CsvParser p2 = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
        p2.parse(key);
        // Store the last stream if not stored during the parse.
        if (_ab != null) _outputStreams2[_outputIdx].store();
        break;
      default:
        assert (false);
    }
    ParseStatus.update(_resultKey, DKV.get(key).length(), _phase);
  } catch (Exception e) {
    e.printStackTrace();
    _error = e.getMessage();
  }
}
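
// Illustrative sketch (hypothetical, not H2O code): the row-accounting scheme
// implied by the two phases above.  Phase ONE records how many rows each chunk
// parsed; turning those counts into a running (cumulative) total gives each
// chunk its global row range, which is what phase TWO reads back as
// firstRow = _nrows[chunk - 1] and lastRow = _nrows[chunk].
static int[] toCumulativeRowCounts(int[] rowsPerChunk) {
  int[] cumulative = new int[rowsPerChunk.length];
  int total = 0;
  for (int i = 0; i < rowsPerChunk.length; i++) {
    total += rowsPerChunk[i]; // rows parsed by chunk i in the first pass
    cumulative[i] = total;    // rows up to and including chunk i
  }
  return cumulative;          // chunk i owns rows [cumulative[i-1], cumulative[i])
}
// Example: rowsPerChunk {100, 98, 7} -> cumulative {100, 198, 205};
// chunk 1 then writes rows [100, 198) in the second pass.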