Esempio n. 1
0
 /**
  * Runs the first pass of the parser over the source dataset.
  *
  * <p>CSV: the first bytes of the dataset are fetched, the parser setup is guessed from them when
  * the caller did not supply one, and the column names, header flag and separator are recorded.
  * The distributed computation is then invoked on the dataset key.
  *
  * <p>XLS and XLSX: these formats are not parsed in a distributed way; a single parser instance
  * is handed the whole dataset.
  *
  * <p>After parsing, the total row count is derived (turning the per-chunk counts in
  * {@code _nrows} into running totals when that array exists) and every column mean is divided
  * by the number of rows minus the invalid values of that column.
  *
  * <p>If the CSV setup carries no data, {@code _error} is set and the method returns right away,
  * skipping the row-count and mean computations.
  *
  * @param setup CSV parser setup to use, or {@code null} to have it guessed from the first bytes
  *     of the dataset; only consulted for the CSV parser type
  * @throws Exception declared for failures of the underlying parsing steps
  */
 public void passOne(CsvParser.Setup setup) throws Exception {
   switch (_parserType) {
     case CSV: {
       // The leading bytes are always fetched, even when a setup was passed in.
       byte[] firstBytes = _sourceDataset.getFirstBytes(); // Can limit to eg 256*1024
       if (setup == null) {
         setup = CsvParser.guessCsvSetup(firstBytes);
       }
       if (setup._data == null) {
         // Nothing usable was detected: record the problem and stop before any statistics.
         _error = "Unable to determine the separator or number of columns on the dataset";
         return;
       }
       // Column names come from the first row of the setup data.
       _colNames = setup._data[0];
       setColumnNames(_colNames);
       _skipFirstLine = setup._header;
       _sep = setup._separator;
       // When the source is a value array, allocate one row counter per chunk.
       if (_sourceDataset._isArray != 0) {
         ValueArray sourceArray = ValueArray.value(_sourceDataset);
         _nrows = new int[(int) sourceArray.chunks()];
       }
       // Kick off the distributed parse over the dataset.
       invoke(_sourceDataset._key);
       break;
     }
     case XLS: {
       // XLS is parsed by a single, non-distributed parser run.
       CustomParser xlsParser = new XlsParser(this);
       xlsParser.parse(_sourceDataset._key);
       // The header must not be counted as a data row.
       // NOTE(review): the XLSX branch below applies no such adjustment - confirm this is intended.
       --_myrows;
       break;
     }
     case XLSX: {
       // XLSX is parsed by a single, non-distributed parser run as well.
       CustomParser xlsxParser = new XlsxParser(this);
       xlsxParser.parse(_sourceDataset._key);
       break;
     }
     default:
       throw new Error("NOT IMPLEMENTED");
   }

   // Derive the total number of rows.
   if (_nrows == null) {
     _numRows = _myrows;
   } else {
     // Replace each per-chunk count with the running total up to and including that chunk.
     _numRows = 0;
     for (int chunk = 0; chunk < _nrows.length; chunk++) {
       _numRows += _nrows[chunk];
       _nrows[chunk] = _numRows;
     }
   }

   // Normalize the means by the number of valid rows of each column.
   // NOTE(review): the divisor is zero when every row of a column is invalid - confirm the
   // resulting value is acceptable to consumers.
   for (int col = 0; col < _ncolumns; col++) {
     _mean[col] /= (_numRows - _invalidValues[col]);
   }
 }
Esempio n. 2
0
 /**
  * Runs the second pass of the parser task.
  *
  * <p>CSV: the distributed computation is invoked on the dataset key once more.
  *
  * <p>XLS and XLSX: there is no distributed processing for these formats, so this method
  * initializes the phase-two statistics, creates the output records for all rows, re-parses the
  * dataset with a single parser instance and stores the last output stream if it is still open.
  *
  * <p>Finally every column sigma is replaced by the square root of itself divided by the number
  * of rows minus the invalid values of that column, and the value array header is created.
  *
  * @throws Exception declared for failures of the underlying parsing steps
  */
 public void passTwo() throws Exception {
   switch (_parserType) {
     case CSV:
       // CSV only needs the distributed parser launched over the chunks a second time.
       invoke(_sourceDataset._key);
       break;
     case XLS:
     case XLSX: {
       // Set up statistics - invalid rows, sigma and row size.
       phaseTwoInitialize();
       // Create the output streams covering every row and open the first one.
       _outputStreams2 = createRecords(0, _myrows);
       assert (_outputStreams2.length > 0);
       _ab = _outputStreams2[0].initialize();
       // Second parse pass, using the parser that matches the type.
       final CustomParser parser;
       if (_parserType == CustomParser.Type.XLS) {
         parser = new XlsParser(this);
       } else {
         parser = new XlsxParser(this);
       }
       parser.parse(_sourceDataset._key);
       // Store the last stream if the parse did not already do so.
       if (_ab != null) {
         _outputStreams2[_outputIdx].store();
       }
       break;
     }
     default:
       throw new Error("NOT IMPLEMENTED");
   }

   // Normalize sigma by the number of valid rows of each column.
   // NOTE(review): the divisor is zero when every row of a column is invalid - confirm the
   // resulting value is acceptable to consumers.
   for (int col = 0; col < _ncolumns; col++) {
     _sigma[col] = Math.sqrt(_sigma[col] / (_numRows - _invalidValues[col]));
   }
   createValueArrayHeader();
 }