public static String store2Hdfs(Key srcKey) {
    assert srcKey._kb[0] != Key.ARRAYLET_CHUNK;
    assert PersistHdfs.getPathForKey(srcKey) != null; // Validate key name
    Value v = DKV.get(srcKey);
    if (v == null) return "Key " + srcKey + " not found";
    if (v._isArray == 0) { // Simple chunk?
      v.setHdfs(); // Set to HDFS and be done
      return null; // Success
    }

    // For ValueArrays, make the .hex header
    ValueArray ary = ValueArray.value(v);
    String err = PersistHdfs.freeze(srcKey, ary);
    if (err != null) return err;

    // The task managing which chunks to write next,
    // store in a known key
    TaskStore2HDFS ts = new TaskStore2HDFS(srcKey);
    Key selfKey = ts.selfKey();
    UKV.put(selfKey, ts);

    // Then start writing chunks in-order with the zero chunk
    H2ONode chk0_home = ValueArray.getChunkKey(0, srcKey).home_node();
    RPC.call(ts.chunkHome(), ts);

    // Watch the progress key until it gets removed or an error appears
    long idx = 0;
    while (UKV.get(selfKey, ts) != null) {
      if (ts._indexFrom != idx) {
        System.out.print(" " + idx + "/" + ary.chunks());
        idx = ts._indexFrom;
      }
      if (ts._err != null) { // Found an error?
        UKV.remove(selfKey); // Cleanup & report
        return ts._err;
      }
      try {
        Thread.sleep(100);
      } catch (InterruptedException e) {
      }
    }
    System.out.println(" " + ary.chunks() + "/" + ary.chunks());

    // PersistHdfs.refreshHDFSKeys();
    return null;
  }
Exemple #2
0
  // Test kaggle/creditsample-test data
  @org.junit.Test
  public void kaggle_credit() {
    Key okey = loadAndParseFile("credit.hex", "smalldata/kaggle/creditsample-training.csv.gz");
    UKV.remove(Key.make("smalldata/kaggle/creditsample-training.csv.gz_UNZIPPED"));
    UKV.remove(Key.make("smalldata\\kaggle\\creditsample-training.csv.gz_UNZIPPED"));
    ValueArray val = DKV.get(okey).get();

    // Check parsed dataset
    final int n = new int[] {4, 2, 1}[ValueArray.LOG_CHK - 20];
    assertEquals("Number of chunks", n, val.chunks());
    assertEquals("Number of rows", 150000, val.numRows());
    assertEquals("Number of cols", 12, val.numCols());

    // setup default values for DRF
    int ntrees = 3;
    int depth = 30;
    int gini = StatType.GINI.ordinal();
    int seed = 42;
    StatType statType = StatType.values()[gini];
    final int cols[] =
        new int[] {0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 1}; // ignore column 6, classify column 1

    // Start the distributed Random Forest
    final Key modelKey = Key.make("model");
    DRFJob result =
        hex.rf.DRF.execute(
            modelKey,
            cols,
            val,
            ntrees,
            depth,
            1024,
            statType,
            seed,
            true,
            null,
            -1,
            Sampling.Strategy.RANDOM,
            1.0f,
            null,
            0,
            0,
            false);
    // Wait for completion on all nodes
    RFModel model = result.get();

    assertEquals("Number of classes", 2, model.classes());
    assertEquals("Number of trees", ntrees, model.size());

    model.deleteKeys();
    UKV.remove(modelKey);
    UKV.remove(okey);
  }
Exemple #3
0
 /**
  * Executes the phase one of the parser.
  *
  * <p>First phase detects the encoding and basic statistics of the parsed dataset.
  *
  * <p>For CSV parsers it detects the parser setup and then launches the distributed computation on
  * per chunk basis.
  *
  * <p>For XLS and XLSX parsers that do not work in distrubuted way parses the whole datasets.
  *
  * @throws Exception
  */
 public void passOne(CsvParser.Setup setup) throws Exception {
   switch (_parserType) {
     case CSV:
       // precompute the parser setup, column setup and other settings
       byte[] bits = _sourceDataset.getFirstBytes(); // Can limit to eg 256*1024
       if (setup == null) setup = CsvParser.guessCsvSetup(bits);
       if (setup._data == null) {
         _error = "Unable to determine the separator or number of columns on the dataset";
         return;
       }
       _colNames = setup._data[0];
       setColumnNames(_colNames);
       _skipFirstLine = setup._header;
       // set the separator
       this._sep = setup._separator;
       // if parsing value array, initialize the nrows array
       if (_sourceDataset._isArray != 0) {
         ValueArray ary = ValueArray.value(_sourceDataset);
         _nrows = new int[(int) ary.chunks()];
       }
       // launch the distributed parser on its chunks.
       this.invoke(_sourceDataset._key);
       break;
     case XLS:
       // XLS parsing is not distributed, just obtain the value stream and
       // run the parser
       CustomParser p = new XlsParser(this);
       p.parse(_sourceDataset._key);
       --_myrows; // do not count the header
       break;
     case XLSX:
       // XLS parsing is not distributed, just obtain the value stream and
       // run the parser
       CustomParser px = new XlsxParser(this);
       px.parse(_sourceDataset._key);
       break;
     default:
       throw new Error("NOT IMPLEMENTED");
   }
   // calculate proper numbers of rows for the chunks
   if (_nrows != null) {
     _numRows = 0;
     for (int i = 0; i < _nrows.length; ++i) {
       _numRows += _nrows[i];
       _nrows[i] = _numRows;
     }
   } else {
     _numRows = _myrows;
   }
   // normalize mean
   for (int i = 0; i < _ncolumns; ++i) _mean[i] = _mean[i] / (_numRows - _invalidValues[i]);
 }
Exemple #4
0
    public static Job run(final Key dest, final KMeansModel model, final ValueArray ary) {
      final ChunkProgressJob job = new ChunkProgressJob(ary.chunks(), dest);
      new ValueArray(dest, 0).delete_and_lock(job.self());
      final H2OCountedCompleter fjtask =
          new H2OCountedCompleter() {
            @Override
            public void compute2() {
              KMeansApply kms = new KMeansApply();
              kms._job = job;
              kms._arykey = ary._key;
              kms._cols = model.columnMapping(ary.colNames());
              kms._clusters = model._clusters;
              kms._normalized = model._normalized;
              kms.invoke(ary._key);

              Column c = new Column();
              c._name = Constants.RESPONSE;
              c._size = ROW_SIZE;
              c._scale = 1;
              c._min = 0;
              c._max = model._clusters.length;
              c._mean = Double.NaN;
              c._sigma = Double.NaN;
              c._domain = null;
              c._n = ary.numRows();
              ValueArray res = new ValueArray(dest, ary.numRows(), c._size, new Column[] {c});
              res.unlock(job.self());
              job.remove();
              tryComplete();
            }

            @Override
            public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
              job.onException(ex);
              return super.onExceptionalCompletion(ex, caller);
            }
          };
      job.start(fjtask);
      H2O.submitTask(fjtask);
      return job;
    }
  @Override
  public void compute() {
    String path = null; // getPathFromValue(val);
    ValueArray ary = ValueArray.value(_arykey);
    Key self = selfKey();

    while (_indexFrom < ary.chunks()) {
      Key ckey = ary.getChunkKey(_indexFrom++);
      if (!ckey.home()) { // Next chunk not At Home?
        RPC.call(chunkHome(), this); // Hand the baton off to the next node/chunk
        return;
      }
      Value val = DKV.get(ckey); // It IS home, so get the data
      _err = PersistHdfs.appendChunk(_arykey, val);
      if (_err != null) return;
      UKV.put(self, this); // Update the progress/self key
    }
    // We did the last chunk.  Removing the selfKey is the signal to the web
    // thread that All Done.
    UKV.remove(self);
  }