/**
 * Folds the results of another {@code DParseTask} into this task.
 *
 * <p>Error messages of both tasks are concatenated (newline separated). If this task has no
 * invalid-value counters yet, it adopts the other task's statistics fields by reference.
 * Otherwise the statistics are combined according to the current phase:
 *
 * <ul>
 *   <li>phase one: per-chunk row counts are added (unless both tasks share the same array),
 *       enums are merged, and per column the smaller min, larger max, smaller scale and larger
 *       column type are kept while the means are added;
 *   <li>phase two: the per-column sigma values are added.
 * </ul>
 *
 * In both phases the per-column invalid-value counters and the row counter are added.
 *
 * <p>Any exception raised while merging is printed and recorded in {@code _error}, so that a
 * failed reduce is not mistaken for a successful one.
 *
 * @param drt the task to merge into this one; must be a {@code DParseTask}
 */
@Override
public void reduce(DRemoteTask drt) {
  try {
    DParseTask other = (DParseTask) drt;

    // Merge the error reports first: if combining the statistics below throws, the other
    // task's error must not be lost.
    if (_error == null) {
      _error = other._error;
    } else if (other._error != null) {
      _error = _error + "\n" + other._error;
    }

    if (_sigma == null) {
      _sigma = other._sigma;
    }
    if (_invalidValues == null) {
      // Nothing accumulated here yet: take over the other task's statistics as they are.
      _enums = other._enums;
      _min = other._min;
      _max = other._max;
      _mean = other._mean;
      _sigma = other._sigma;
      _scale = other._scale;
      _colTypes = other._colTypes;
      _nrows = other._nrows;
      _invalidValues = other._invalidValues;
    } else {
      if (_phase == Pass.ONE) {
        // Skip when both tasks reference the same array, otherwise counts would be doubled.
        if (_nrows != other._nrows) {
          for (int i = 0; i < _nrows.length; ++i) {
            _nrows[i] += other._nrows[i];
          }
        }
        for (int i = 0; i < _ncolumns; ++i) {
          if (_enums[i] != other._enums[i]) {
            _enums[i].merge(other._enums[i]);
          }
          if (other._min[i] < _min[i]) {
            _min[i] = other._min[i];
          }
          if (other._max[i] > _max[i]) {
            _max[i] = other._max[i];
          }
          if (other._scale[i] < _scale[i]) {
            _scale[i] = other._scale[i];
          }
          if (other._colTypes[i] > _colTypes[i]) {
            _colTypes[i] = other._colTypes[i];
          }
          _mean[i] += other._mean[i];
        }
      } else if (_phase == Pass.TWO) {
        // _sigma may have just been adopted from the other task above; adding an array to
        // itself would double every value, so only add when the arrays are distinct.
        if (_sigma != other._sigma) {
          for (int i = 0; i < _ncolumns; ++i) {
            _sigma[i] += other._sigma[i];
          }
        }
      } else {
        assert false : "unexpected _phase value:" + _phase;
      }
      for (int i = 0; i < _ncolumns; ++i) {
        _invalidValues[i] += other._invalidValues[i];
      }
    }
    _myrows += other._myrows;
  } catch (Exception e) {
    e.printStackTrace();
    // Surface the failure through _error as well; e.toString() is used because the message
    // alone may be null.
    String failure = "reduce failed: " + e;
    _error = (_error == null) ? failure : _error + "\n" + failure;
  }
}
/** * Map function for distributed parsing of the CSV files. * * <p>In first phase it calculates the min, max, means, encodings and other statistics about the * dataset, determines the number of columns. * * <p>The second pass then encodes the parsed dataset to the result key, splitting it into equal * sized chunks. */ @Override public void map(Key key) { try { Key aryKey = null; boolean arraylet = key._kb[0] == Key.ARRAYLET_CHUNK; boolean skipFirstLine = _skipFirstLine; if (arraylet) { aryKey = ValueArray.getArrayKey(key); _chunkId = ValueArray.getChunkIndex(key); skipFirstLine = skipFirstLine || (ValueArray.getChunkIndex(key) != 0); } switch (_phase) { case ONE: assert (_ncolumns != 0); // initialize the column statistics phaseOneInitialize(); // perform the parse CsvParser p = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine); p.parse(key); if (arraylet) { long idx = ValueArray.getChunkIndex(key); int idx2 = (int) idx; assert idx2 == idx; assert (_nrows[idx2] == 0) : idx + ": " + Arrays.toString(_nrows) + " (" + _nrows[idx2] + " -- " + _myrows + ")"; _nrows[idx2] = _myrows; } break; case TWO: assert (_ncolumns != 0); // initialize statistics - invalid rows, sigma and row size phaseTwoInitialize(); // calculate the first row and the number of rows to parse int firstRow = 0; int lastRow = _myrows; _myrows = 0; if (arraylet) { long origChunkIdx = ValueArray.getChunkIndex(key); firstRow = (origChunkIdx == 0) ? 
0 : _nrows[(int) origChunkIdx - 1]; lastRow = _nrows[(int) origChunkIdx]; } int rowsToParse = lastRow - firstRow; // create the output streams _outputStreams2 = createRecords(firstRow, rowsToParse); assert (_outputStreams2.length > 0); _ab = _outputStreams2[0].initialize(); // perform the second parse pass CsvParser p2 = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine); p2.parse(key); // store the last stream if not stored during the parse if (_ab != null) _outputStreams2[_outputIdx].store(); break; default: assert (false); } ParseStatus.update(_resultKey, DKV.get(key).length(), _phase); } catch (Exception e) { e.printStackTrace(); _error = e.getMessage(); } }