예제 #1
0
 /**
  * Runs the second pass of the parser task.
  *
  * <p>The second pass encodes the data into its final form. How the pass is carried out depends on
  * the parser type:
  *
  * <ul>
  *   <li>CSV: this task is invoked again on the source dataset key, so the chunks are handled by
  *       the distributed parser.
  *   <li>XLS and XLSX: everything is parsed by this call itself, into output records created for
  *       {@code _myrows} rows starting at row 0.
  * </ul>
  *
  * <p>After the parse the per-column sigma values are normalized and {@code
  * createValueArrayHeader()} is called.
  *
  * @throws Exception declared for the invoke/parse calls made by this method
  * @throws Error with the message "NOT IMPLEMENTED" for any other parser type
  */
 public void passTwo() throws Exception {
   switch (_parserType) {
     case CSV:
       {
         // CSV: simply run the distributed parser over the chunks once more
         this.invoke(_sourceDataset._key);
         break;
       }
     case XLS:
     case XLSX:
       {
         // initialize statistics - invalid rows, sigma and row size
         phaseTwoInitialize();
         // output records for all rows, starting from row 0
         _outputStreams2 = createRecords(0, _myrows);
         assert (_outputStreams2.length > 0);
         _ab = _outputStreams2[0].initialize();
         // pick the parser implementation matching the spreadsheet type
         final CustomParser spreadsheetParser;
         if (_parserType == CustomParser.Type.XLS) {
           spreadsheetParser = new XlsParser(this);
         } else {
           spreadsheetParser = new XlsxParser(this);
         }
         // second parse pass
         spreadsheetParser.parse(_sourceDataset._key);
         // the last stream may not have been stored during the parse; store it now
         if (_ab != null) {
           _outputStreams2[_outputIdx].store();
         }
         break;
       }
     default:
       throw new Error("NOT IMPLEMENTED");
   }
   // normalize sigma: divide by (rows - invalid values of the column), then take the square root
   for (int col = 0; col < _ncolumns; col++) {
     double scaled = _sigma[col] / (_numRows - _invalidValues[col]);
     _sigma[col] = Math.sqrt(scaled);
   }
   createValueArrayHeader();
 }
예제 #2
0
  /**
   * Map function for distributed parsing of the CSV files.
   *
   * <p>The number of columns must already be known ({@code _ncolumns != 0} is asserted in both
   * phases).
   *
   * <p>Phase ONE initializes the column statistics, parses the chunk and, for arraylet chunks,
   * records the number of rows seen ({@code _myrows}) in {@code _nrows} at the chunk's index.
   *
   * <p>Phase TWO initializes the second-pass statistics, works out which rows this chunk covers,
   * creates the output records for them, parses the chunk again and stores the last output stream
   * if the parse left it unstored.
   *
   * <p>Any {@link Exception} raised along the way is caught: its stack trace is printed and a
   * description is stored in {@code _error}. The description is the exception message or, when the
   * exception carries no message, its {@code toString()}, so a failure never leaves {@code _error}
   * null.
   *
   * @param key key of the chunk (or of the whole value) to parse
   */
  @Override
  public void map(Key key) {
    try {
      Key aryKey = null;
      boolean arraylet = key._kb[0] == Key.ARRAYLET_CHUNK;
      boolean skipFirstLine = _skipFirstLine;
      if (arraylet) {
        aryKey = ValueArray.getArrayKey(key);
        _chunkId = ValueArray.getChunkIndex(key);
        // every chunk except the first one always skips its first line
        skipFirstLine = skipFirstLine || (ValueArray.getChunkIndex(key) != 0);
      }
      switch (_phase) {
        case ONE:
          assert (_ncolumns != 0);
          // initialize the column statistics
          phaseOneInitialize();
          // perform the parse
          CsvParser p = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
          p.parse(key);
          if (arraylet) {
            long idx = ValueArray.getChunkIndex(key);
            int idx2 = (int) idx;
            // the chunk index must fit into an int to be usable as an array index
            assert idx2 == idx;
            // each chunk's slot is expected to be written only once
            assert (_nrows[idx2] == 0)
                : idx
                    + ": "
                    + Arrays.toString(_nrows)
                    + " ("
                    + _nrows[idx2]
                    + " -- "
                    + _myrows
                    + ")";
            _nrows[idx2] = _myrows;
          }
          break;
        case TWO:
          assert (_ncolumns != 0);
          // initialize statistics - invalid rows, sigma and row size
          phaseTwoInitialize();
          // calculate the first row and the number of rows to parse
          int firstRow = 0;
          int lastRow = _myrows;
          _myrows = 0;
          if (arraylet) {
            // NOTE(review): _nrows is read here as running totals (rows up to and including each
            // chunk), while phase ONE stores per-chunk counts -- presumably converted between the
            // phases outside this method; confirm.
            long origChunkIdx = ValueArray.getChunkIndex(key);
            firstRow = (origChunkIdx == 0) ? 0 : _nrows[(int) origChunkIdx - 1];
            lastRow = _nrows[(int) origChunkIdx];
          }
          int rowsToParse = lastRow - firstRow;
          // create the output streams
          _outputStreams2 = createRecords(firstRow, rowsToParse);
          assert (_outputStreams2.length > 0);
          _ab = _outputStreams2[0].initialize();
          // perform the second parse pass
          CsvParser p2 = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
          p2.parse(key);
          // store the last stream if not stored during the parse
          if (_ab != null) _outputStreams2[_outputIdx].store();
          break;
        default:
          assert (false);
      }

      ParseStatus.update(_resultKey, DKV.get(key).length(), _phase);
    } catch (Exception e) {
      e.printStackTrace();
      // getMessage() is null for exceptions created without a detail message (for example a bare
      // NullPointerException); fall back to toString() so the failure is still recorded.
      String message = e.getMessage();
      _error = (message != null) ? message : e.toString();
    }
  }