Example 1
 /**
  * Creates the ValueArray header based on the calculated columns.
  *
  * <p>Also stores the header under the result key. This will be the ValueArray header of the
  * parsed dataset.
  */
 private void createValueArrayHeader() {
   assert (_phase == Pass.TWO);
   Column[] cols = new Column[_ncolumns];
   int off = 0;
   for (int i = 0; i < cols.length; ++i) {
     cols[i] = new Column();
     cols[i]._n = _numRows - _invalidValues[i];
     cols[i]._base = _bases[i];
      // the decimal scale, expressed as a power of 10, must fit in a char
      assert (char) pow10i(-_scale[i]) == pow10i(-_scale[i])
          : "scale out of bounds! col = " + i + ", scale = " + _scale[i];
      cols[i]._scale = (char) pow10i(-_scale[i]);
     cols[i]._off = (char) off;
     cols[i]._size = (byte) COL_SIZES[_colTypes[i]];
     cols[i]._domain = _colDomains[i];
     cols[i]._max = _max[i];
     cols[i]._min = _min[i];
     cols[i]._mean = _mean[i];
     cols[i]._sigma = _sigma[i];
     cols[i]._name = _colNames[i];
      // advance by the column's width in bytes (the magnitude of _size)
      off += Math.abs(cols[i]._size);
   }
   // let any pending progress reports finish
   DKV.write_barrier();
   // finally make the value array header
   ValueArray ary = new ValueArray(_resultKey, _numRows, off, cols);
   UKV.put(_resultKey, ary.value());
 }
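The running off accumulator gives each column its byte offset inside a packed row, and its final value is the row width handed to the ValueArray constructor. A minimal, self-contained sketch of that arithmetic, with made-up column sizes (the sign convention is an assumption, not taken from the source):

    // Hypothetical sizes: each column's width is the magnitude of its size
    byte[] sizes = {1, -4, 2, -8};
    int off = 0;
    for (int i = 0; i < sizes.length; ++i) {
      System.out.println("col " + i + " starts at byte " + off);
      off += Math.abs(sizes[i]); // same accumulation as in the loop above
    }
    System.out.println("row width = " + off + " bytes"); // prints 15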
Example 2
 protected void testScalarExpression(String expr, double result) {
   Key key = executeExpression(expr);
   ValueArray va = ValueArray.value(key);
   assertEquals(1, va.numRows());
   assertEquals(1, va.numCols());
   assertEquals(result, va.datad(0, 0), 0.0);
   UKV.remove(key);
 }
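A call site might look like the following; the expression syntax itself is an assumption here, not taken from the source:

   // Hypothetical usage: evaluate a scalar expression and expect a single value
   testScalarExpression("2 + 3", 5.0);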
Example 3
 protected void testKeyValues(
     Key k, double n1, double n2, double n3, double nx3, double nx2, double nx1) {
   ValueArray v = ValueArray.value(k);
   assertEquals(n1, v.datad(0, 0), 0.0);
   assertEquals(n2, v.datad(1, 0), 0.0);
   assertEquals(n3, v.datad(2, 0), 0.0);
   assertEquals(nx3, v.datad(v.numRows() - 3, 0), 0.0);
   assertEquals(nx2, v.datad(v.numRows() - 2, 0), 0.0);
   assertEquals(nx1, v.datad(v.numRows() - 1, 0), 0.0);
 }
Example 4
 /**
  * Executes phase one of the parser.
  *
  * <p>The first phase detects the encoding and basic statistics of the parsed dataset.
  *
  * <p>For CSV parsers it detects the parser setup and then launches the distributed computation
  * on a per-chunk basis.
  *
  * <p>For XLS and XLSX parsers, which do not work in a distributed way, it parses the whole
  * dataset.
  *
  * @throws Exception if the underlying parser fails
  */
 public void passOne(CsvParser.Setup setup) throws Exception {
   switch (_parserType) {
     case CSV:
       // precompute the parser setup, column setup and other settings
        byte[] bits = _sourceDataset.getFirstBytes(); // can be limited to, e.g., 256*1024 bytes
       if (setup == null) setup = CsvParser.guessCsvSetup(bits);
       if (setup._data == null) {
         _error = "Unable to determine the separator or number of columns on the dataset";
         return;
       }
       _colNames = setup._data[0];
       setColumnNames(_colNames);
       _skipFirstLine = setup._header;
       // set the separator
       this._sep = setup._separator;
        // if parsing a value array, initialize the per-chunk row-count array
       if (_sourceDataset._isArray != 0) {
         ValueArray ary = ValueArray.value(_sourceDataset);
         _nrows = new int[(int) ary.chunks()];
       }
       // launch the distributed parser on its chunks.
       this.invoke(_sourceDataset._key);
       break;
     case XLS:
        // XLS parsing is not distributed; just obtain the value stream and
        // run the parser
       CustomParser p = new XlsParser(this);
       p.parse(_sourceDataset._key);
       --_myrows; // do not count the header
       break;
      case XLSX:
        // XLSX parsing is not distributed either; just obtain the value
        // stream and run the parser
       CustomParser px = new XlsxParser(this);
       px.parse(_sourceDataset._key);
       break;
     default:
       throw new Error("NOT IMPLEMENTED");
   }
    // convert per-chunk row counts into cumulative counts and compute the total number of rows
   if (_nrows != null) {
     _numRows = 0;
     for (int i = 0; i < _nrows.length; ++i) {
       _numRows += _nrows[i];
       _nrows[i] = _numRows;
     }
   } else {
     _numRows = _myrows;
   }
    // finalize the means: divide the accumulated sums by the number of valid rows per column
    for (int i = 0; i < _ncolumns; ++i) _mean[i] = _mean[i] / (_numRows - _invalidValues[i]);
 }
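A driver would run this as the first of the parser's two passes and check _error afterwards. In the sketch below, the enclosing class name and its constructor are assumptions; only passOne(...) and _error come from the snippet, and the caller must handle the checked Exception that passOne declares:

    // Hypothetical driver code for the first parse pass
    ParseDataset parser = new ParseDataset(sourceDataset, resultKey);
    parser.passOne(null); // null setup: guess the CSV setup from the first bytes
    if (parser._error != null)
      throw new RuntimeException(parser._error); // surface the parse failure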
Example 5
  public static String store2Hdfs(Key srcKey) {
    assert srcKey._kb[0] != Key.ARRAYLET_CHUNK;
    assert PersistHdfs.getPathForKey(srcKey) != null; // Validate key name
    Value v = DKV.get(srcKey);
    if (v == null) return "Key " + srcKey + " not found";
    if (v._isArray == 0) { // Simple chunk?
      v.setHdfs(); // Set to HDFS and be done
      return null; // Success
    }

    // For ValueArrays, make the .hex header
    ValueArray ary = ValueArray.value(v);
    String err = PersistHdfs.freeze(srcKey, ary);
    if (err != null) return err;

    // The task that manages which chunk to write next is
    // stored under a known key
    TaskStore2HDFS ts = new TaskStore2HDFS(srcKey);
    Key selfKey = ts.selfKey();
    UKV.put(selfKey, ts);

    // Then start writing chunks in order, beginning with the zero chunk,
    // on the node that owns the next chunk to write
    RPC.call(ts.chunkHome(), ts);

    // Watch the progress key until it gets removed or an error appears
    long idx = 0;
    while (UKV.get(selfKey, ts) != null) {
      if (ts._indexFrom != idx) {
        System.out.print(" " + idx + "/" + ary.chunks());
        idx = ts._indexFrom;
      }
      if (ts._err != null) { // Found an error?
        UKV.remove(selfKey); // Cleanup & report
        return ts._err;
      }
      try {
        Thread.sleep(100); // poll the progress key periodically
      } catch (InterruptedException e) {
        // ignored; just poll again
      }
    }
    System.out.println(" " + ary.chunks() + "/" + ary.chunks());

    // PersistHdfs.refreshHDFSKeys();
    return null;
  }
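Because store2Hdfs returns null on success and an error string on failure, a caller can use it as below; the key name and the enclosing class are assumptions:

    // Hypothetical call site: push a dataset to HDFS and surface any error
    Key src = Key.make("mydata.hex"); // assumed key name
    String err = TaskStore2HDFS.store2Hdfs(src);
    if (err != null) System.err.println("HDFS store failed: " + err);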
Example 6
  @Override
  public void compute() {
    ValueArray ary = ValueArray.value(_arykey);
    Key self = selfKey();

    while (_indexFrom < ary.chunks()) {
      Key ckey = ary.getChunkKey(_indexFrom++);
      if (!ckey.home()) { // Next chunk not At Home?
        RPC.call(chunkHome(), this); // Hand the baton off to the next node/chunk
        return;
      }
      Value val = DKV.get(ckey); // It IS home, so get the data
      _err = PersistHdfs.appendChunk(_arykey, val);
      if (_err != null) return;
      UKV.put(self, this); // Update the progress/self key
    }
    // We did the last chunk.  Removing the selfKey is the signal to the web
    // thread that All Done.
    UKV.remove(self);
  }
Example 7
 public void testDataFrameStructure(Key k, int rows, int cols) {
   ValueArray v = ValueArray.value(k);
   assertEquals(rows, v.numRows());
   assertEquals(cols, v.numCols());
 }