Exemple #1
0
 /**
  * Merges the partial results held by another {@code DParseTask} into this task.
  *
  * <p>When this task has no {@code _invalidValues} array, the other task's statistics arrays are
  * adopted by reference. Otherwise the statistics are combined according to {@code _phase}: in
  * pass one the row counts, enums, min, max, scale, column types and means are merged per column;
  * in pass two the sigma accumulators are summed. The invalid-value counters are summed in both
  * passes. {@code _myrows} is always added and {@code _error} messages are concatenated with a
  * newline.
  *
  * <p>An exception raised while merging is printed and also appended to {@code _error}, so that a
  * failed merge is not mistaken for a successful one.
  *
  * @param drt the task whose results are merged in; it is cast to {@code DParseTask}
  */
 @Override
 public void reduce(DRemoteTask drt) {
   try {
     DParseTask other = (DParseTask) drt;
     if (_sigma == null) _sigma = other._sigma;
     if (_invalidValues == null) {
       // No _invalidValues here: adopt the other task's arrays by reference (no copy is made).
       _enums = other._enums;
       _min = other._min;
       _max = other._max;
       _mean = other._mean;
       _sigma = other._sigma;
       _scale = other._scale;
       _colTypes = other._colTypes;
       _nrows = other._nrows;
       _invalidValues = other._invalidValues;
     } else {
       if (_phase == Pass.ONE) {
         // Identity check: when both tasks share one _nrows array, adding it to itself would
         // double every count.
         if (_nrows != other._nrows)
           for (int i = 0; i < _nrows.length; ++i) _nrows[i] += other._nrows[i];
         for (int i = 0; i < _ncolumns; ++i) {
           // Same identity guard for the per-column enum objects.
           if (_enums[i] != other._enums[i]) _enums[i].merge(other._enums[i]);
           if (other._min[i] < _min[i]) _min[i] = other._min[i];
           if (other._max[i] > _max[i]) _max[i] = other._max[i];
           if (other._scale[i] < _scale[i]) _scale[i] = other._scale[i];
           // Keep the larger column type code.
           if (other._colTypes[i] > _colTypes[i]) _colTypes[i] = other._colTypes[i];
           // NOTE(review): means are added, so _mean presumably holds running sums here - confirm.
           _mean[i] += other._mean[i];
         }
       } else if (_phase == Pass.TWO) {
         for (int i = 0; i < _ncolumns; ++i) _sigma[i] += other._sigma[i];
       } else assert false : "unexpected _phase value:" + _phase;
       for (int i = 0; i < _ncolumns; ++i) _invalidValues[i] += other._invalidValues[i];
     }
     _myrows += other._myrows;
     if (_error == null) _error = other._error;
     else if (other._error != null) _error = _error + "\n" + other._error;
   } catch (Exception e) {
     e.printStackTrace();
     // Record the failure instead of only printing it; getMessage() may be null, so fall back to
     // toString() to guarantee a non-null description.
     String msg = e.getMessage();
     if (msg == null) msg = e.toString();
     _error = (_error == null) ? msg : _error + "\n" + msg;
   }
 }
Exemple #2
0
  /**
   * Map function for distributed parsing of the CSV files.
   *
   * <p>In first phase it calculates the min, max, means, encodings and other statistics about the
   * dataset, determines the number of columns.
   *
   * <p>The second pass then encodes the parsed dataset to the result key, splitting it into equal
   * sized chunks.
   *
   * <p>After either pass the parse progress is reported through {@code ParseStatus.update}. Any
   * exception is printed and recorded in {@code _error}; the recorded text is never null, so a
   * failed map cannot be mistaken for a successful one.
   *
   * @param key key of the value to parse; when its first byte is {@code Key.ARRAYLET_CHUNK} it is
   *     treated as one chunk of a {@code ValueArray}
   */
  @Override
  public void map(Key key) {
    try {
      Key aryKey = null;
      boolean arraylet = key._kb[0] == Key.ARRAYLET_CHUNK;
      boolean skipFirstLine = _skipFirstLine;
      if (arraylet) {
        aryKey = ValueArray.getArrayKey(key);
        _chunkId = ValueArray.getChunkIndex(key);
        // Every chunk except the first always skips its first line.
        // NOTE(review): presumably because that line is the tail of a row started in the previous
        // chunk - confirm against CsvParser.
        skipFirstLine = skipFirstLine || (ValueArray.getChunkIndex(key) != 0);
      }
      switch (_phase) {
        case ONE:
          assert (_ncolumns != 0);
          // initialize the column statistics
          phaseOneInitialize();
          // perform the parse
          CsvParser p = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
          p.parse(key);
          if (arraylet) {
            // record the number of rows seen in this chunk at the chunk's index
            long idx = ValueArray.getChunkIndex(key);
            int idx2 = (int) idx;
            // the chunk index must fit into an int to be usable as an array index
            assert idx2 == idx;
            assert (_nrows[idx2] == 0)
                : idx
                    + ": "
                    + Arrays.toString(_nrows)
                    + " ("
                    + _nrows[idx2]
                    + " -- "
                    + _myrows
                    + ")";
            _nrows[idx2] = _myrows;
          }
          break;
        case TWO:
          assert (_ncolumns != 0);
          // initialize statistics - invalid rows, sigma and row size
          phaseTwoInitialize();
          // calculate the first row and the number of rows to parse
          int firstRow = 0;
          int lastRow = _myrows;
          _myrows = 0;
          if (arraylet) {
            // NOTE(review): this reads _nrows as cumulative row counts (rows up to and including
            // each chunk) - presumably converted between the passes; confirm with the caller.
            long origChunkIdx = ValueArray.getChunkIndex(key);
            firstRow = (origChunkIdx == 0) ? 0 : _nrows[(int) origChunkIdx - 1];
            lastRow = _nrows[(int) origChunkIdx];
          }
          int rowsToParse = lastRow - firstRow;
          // create the output streams
          _outputStreams2 = createRecords(firstRow, rowsToParse);
          assert (_outputStreams2.length > 0);
          _ab = _outputStreams2[0].initialize();
          // perform the second parse pass
          CsvParser p2 = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
          p2.parse(key);
          // store the last stream if not stored during the parse
          if (_ab != null) _outputStreams2[_outputIdx].store();
          break;
        default:
          assert (false);
      }

      // report progress: the byte length of the parsed value for the current phase
      ParseStatus.update(_resultKey, DKV.get(key).length(), _phase);
    } catch (Exception e) {
      e.printStackTrace();
      // getMessage() is null for many exceptions (e.g. NullPointerException); a null _error would
      // be indistinguishable from "no error", so fall back to the exception's toString().
      String msg = e.getMessage();
      _error = (msg != null) ? msg : e.toString();
    }
  }