/** * Executes the phase one of the parser. * * <p>First phase detects the encoding and basic statistics of the parsed dataset. * * <p>For CSV parsers it detects the parser setup and then launches the distributed computation on * per chunk basis. * * <p>For XLS and XLSX parsers that do not work in distrubuted way parses the whole datasets. * * @throws Exception */ public void passOne(CsvParser.Setup setup) throws Exception { switch (_parserType) { case CSV: // precompute the parser setup, column setup and other settings byte[] bits = _sourceDataset.getFirstBytes(); // Can limit to eg 256*1024 if (setup == null) setup = CsvParser.guessCsvSetup(bits); if (setup._data == null) { _error = "Unable to determine the separator or number of columns on the dataset"; return; } _colNames = setup._data[0]; setColumnNames(_colNames); _skipFirstLine = setup._header; // set the separator this._sep = setup._separator; // if parsing value array, initialize the nrows array if (_sourceDataset._isArray != 0) { ValueArray ary = ValueArray.value(_sourceDataset); _nrows = new int[(int) ary.chunks()]; } // launch the distributed parser on its chunks. this.invoke(_sourceDataset._key); break; case XLS: // XLS parsing is not distributed, just obtain the value stream and // run the parser CustomParser p = new XlsParser(this); p.parse(_sourceDataset._key); --_myrows; // do not count the header break; case XLSX: // XLS parsing is not distributed, just obtain the value stream and // run the parser CustomParser px = new XlsxParser(this); px.parse(_sourceDataset._key); break; default: throw new Error("NOT IMPLEMENTED"); } // calculate proper numbers of rows for the chunks if (_nrows != null) { _numRows = 0; for (int i = 0; i < _nrows.length; ++i) { _numRows += _nrows[i]; _nrows[i] = _numRows; } } else { _numRows = _myrows; } // normalize mean for (int i = 0; i < _ncolumns; ++i) _mean[i] = _mean[i] / (_numRows - _invalidValues[i]); }
/** * Executes the phase two of the parser task. * * <p>In phase two the data is encoded to the final VA, which is then created properly at the end. * * <p>For CSV launches the distributed computation. * * <p>For XLS and XLSX parsers computes all the chunks itself as there is no option for their * distributed processing. */ public void passTwo() throws Exception { switch (_parserType) { case CSV: // for CSV parser just launch the distributed parser on the chunks // again this.invoke(_sourceDataset._key); break; case XLS: case XLSX: // initialize statistics - invalid rows, sigma and row size phaseTwoInitialize(); // create the output streams _outputStreams2 = createRecords(0, _myrows); assert (_outputStreams2.length > 0); _ab = _outputStreams2[0].initialize(); // perform the second parse pass CustomParser p = (_parserType == CustomParser.Type.XLS) ? new XlsParser(this) : new XlsxParser(this); p.parse(_sourceDataset._key); // store the last stream if not stored during the parse if (_ab != null) _outputStreams2[_outputIdx].store(); break; default: throw new Error("NOT IMPLEMENTED"); } // normalize sigma for (int i = 0; i < _ncolumns; ++i) _sigma[i] = Math.sqrt(_sigma[i] / (_numRows - _invalidValues[i])); createValueArrayHeader(); }