コード例 #1
ファイル: PersistHdfs.java プロジェクト: jayfans3/h2o
 public Value lazyArrayChunk(final Key key) {
   final Key arykey = ValueArray.getArrayKey(key); // From the base file key
   final long off = (_iceRoot != null) ? 0 : ValueArray.getChunkOffset(key); // The offset
   final Path p =
       (_iceRoot != null)
           ? new Path(_iceRoot, getIceName(key, (byte) 'V'))
           : new Path(arykey.toString());
   final Size sz = new Size();
       new Callable() {
         public Object call() throws Exception {
           FileSystem fs = FileSystem.get(p.toUri(), CONF);
           long rem = fs.getFileStatus(p).getLen() - off;
           sz._value = (rem > ValueArray.CHUNK_SZ * 2) ? (int) ValueArray.CHUNK_SZ : (int) rem;
           return null;
   Value val = new Value(key, sz._value, Value.HDFS);
   val.setdsk(); // But its already on disk.
   return val;
コード例 #2
ファイル: DParseTask.java プロジェクト: patricktoohey/h2o
  * Creates the value header based on the calculated columns.
  * <p>Also stores the header to its appropriate key. This will be the VA header of the parsed
  * dataset.
 // Builds one Column descriptor per parsed column from the per-column statistics held in this
 // task's fields, wraps them in a ValueArray and stores that ValueArray's value under _resultKey.
 // Asserts that the task is in phase Pass.TWO when called.
 private void createValueArrayHeader() {
   assert (_phase == Pass.TWO);
   Column[] cols = new Column[_ncolumns];
   // Running sum of the absolute column sizes; assigned to each column's _off before it is advanced.
   int off = 0;
   for (int i = 0; i < cols.length; ++i) {
     cols[i] = new Column();
     // Number of rows minus the invalid values counted for this column.
     cols[i]._n = _numRows - _invalidValues[i];
     cols[i]._base = _bases[i];
     // The scale is stored as a char, so check that the narrowing cast below loses nothing.
     assert (char) pow10i(-_scale[i]) == pow10i(-_scale[i])
         : "scale out of bounds!, col = " + i + ", scale = " + _scale[i];
     cols[i]._scale = (char) pow10i(-_scale[i]);
     cols[i]._off = (char) off;
     cols[i]._size = (byte) COL_SIZES[_colTypes[i]];
     cols[i]._domain = _colDomains[i];
     cols[i]._max = _max[i];
     cols[i]._min = _min[i];
     cols[i]._mean = _mean[i];
     cols[i]._sigma = _sigma[i];
     cols[i]._name = _colNames[i];
     // Math.abs implies _size may be negative; presumably the sign encodes the column type and the
     // magnitude is the width -- TODO confirm against Column.
     off += Math.abs(cols[i]._size);
   // let any pending progress reports finish
   // finally make the value array header
   // After the loop, off is the sum of all absolute column sizes.
   ValueArray ary = new ValueArray(_resultKey, _numRows, off, cols);
   UKV.put(_resultKey, ary.value());
コード例 #3
ファイル: PersistNFS.java プロジェクト: pragnesh/h2o
 // Read up to 'len' bytes of Value. Value should already be persisted to
 // disk.  A racing delete can trigger a failure where we get a null return,
 // but no crash (although one could argue that a racing load&delete is a bug
 // no matter what).
 public byte[] load(Value v) {
   long skip = 0;
   Key k = v._key;
   // Convert an arraylet chunk into a long-offset from the base file.
   if (k._kb[0] == Key.ARRAYLET_CHUNK) {
     skip = ValueArray.getChunkOffset(k); // The offset
     k = ValueArray.getArrayKey(k); // From the base file key
   if (k._kb[0] == Key.DVEC) {
     skip = water.fvec.NFSFileVec.chunkOffset(k); // The offset
   try {
     FileInputStream s = null;
     try {
       s = new FileInputStream(getFileForKey(k));
       FileChannel fc = s.getChannel();
       AutoBuffer ab = new AutoBuffer(fc, true, Value.NFS);
       byte[] b = ab.getA1(v._max);
       assert v.isPersisted();
       return b;
     } finally {
       if (s != null) s.close();
   } catch (IOException e) { // Broken disk / short-file???
     return null;
コード例 #4
ファイル: RBigDataTest.java プロジェクト: nadya1/h2o
 // Executes expr and checks that the resulting ValueArray has exactly one row and one column and
 // that datad(0, 0) equals result with a tolerance of 0.0.
 // NOTE(review): if this is JUnit's assertEquals(expected, actual), the first two calls pass the
 // actual value first; that would only affect the failure message -- confirm.
 protected void testScalarExpression(String expr, double result) {
   Key key = executeExpression(expr);
   ValueArray va = ValueArray.value(key);
   assertEquals(va.numRows(), 1);
   assertEquals(va.numCols(), 1);
   assertEquals(result, va.datad(0, 0), 0.0);
コード例 #5
ファイル: RandomForestTest.java プロジェクト: NidhiMehta/h2o
  // Test kaggle/creditsample-test data
  public void kaggle_credit() {
    Key okey = loadAndParseFile("credit.hex", "smalldata/kaggle/creditsample-training.csv.gz");
    ValueArray val = DKV.get(okey).get();

    // Check parsed dataset
    final int n = new int[] {4, 2, 1}[ValueArray.LOG_CHK - 20];
    assertEquals("Number of chunks", n, val.chunks());
    assertEquals("Number of rows", 150000, val.numRows());
    assertEquals("Number of cols", 12, val.numCols());

    // setup default values for DRF
    int ntrees = 3;
    int depth = 30;
    int gini = StatType.GINI.ordinal();
    int seed = 42;
    StatType statType = StatType.values()[gini];
    final int cols[] =
        new int[] {0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 1}; // ignore column 6, classify column 1

    // Start the distributed Random Forest
    final Key modelKey = Key.make("model");
    DRFJob result =
    // Wait for completion on all nodes
    RFModel model = result.get();

    assertEquals("Number of classes", 2, model.classes());
    assertEquals("Number of trees", ntrees, model.size());

コード例 #6
ファイル: OldModel.java プロジェクト: hihihippp/h2o
 // Scores a single row of data: copies the columns listed in _xCols into the _row buffer and
 // returns M.score0(_row).
 //   data - array the row is read from
 //   ab   - buffer passed through to data's accessors (presumably the chunk holding the row --
 //          TODO confirm)
 //   row  - row index passed through to data's accessors
 public double score(ValueArray data, AutoBuffer ab, int row) {
   int j = 0;
   // Per column: NA becomes NaN; with no categorical map for the column the value is read as a
   // double; otherwise the integer value is remapped through translateCat.
   for (int i = 0; i < _xCols.length; ++i)
     _row[j++] =
         data.isNA(ab, row, _xCols[i])
             ? Double.NaN
             : (_catMap == null || _catMap[i] == null)
                 ? data.datad(ab, row, _xCols[i])
                 : translateCat(i, (int) data.data(ab, row, _xCols[i]));
   return M.score0(_row);
コード例 #7
ファイル: DParseTask.java プロジェクト: patricktoohey/h2o
  * Executes the phase one of the parser.
  * <p>First phase detects the encoding and basic statistics of the parsed dataset.
  * <p>For CSV parsers it detects the parser setup and then launches the distributed computation on
  * per chunk basis.
  * <p>For XLS and XLSX parsers, which do not work in a distributed way, it parses the whole dataset.
  * @throws Exception
 public void passOne(CsvParser.Setup setup) throws Exception {
   switch (_parserType) {
     case CSV:
       // precompute the parser setup, column setup and other settings
       byte[] bits = _sourceDataset.getFirstBytes(); // Can limit to eg 256*1024
       if (setup == null) setup = CsvParser.guessCsvSetup(bits);
       if (setup._data == null) {
         _error = "Unable to determine the separator or number of columns on the dataset";
       _colNames = setup._data[0];
       _skipFirstLine = setup._header;
       // set the separator
       this._sep = setup._separator;
       // if parsing value array, initialize the nrows array
       if (_sourceDataset._isArray != 0) {
         ValueArray ary = ValueArray.value(_sourceDataset);
         _nrows = new int[(int) ary.chunks()];
       // launch the distributed parser on its chunks.
     case XLS:
       // XLS parsing is not distributed, just obtain the value stream and
       // run the parser
       CustomParser p = new XlsParser(this);
       --_myrows; // do not count the header
     case XLSX:
       // XLS parsing is not distributed, just obtain the value stream and
       // run the parser
       CustomParser px = new XlsxParser(this);
       throw new Error("NOT IMPLEMENTED");
   // calculate proper numbers of rows for the chunks
   if (_nrows != null) {
     _numRows = 0;
     for (int i = 0; i < _nrows.length; ++i) {
       _numRows += _nrows[i];
       _nrows[i] = _numRows;
   } else {
     _numRows = _myrows;
   // normalize mean
   for (int i = 0; i < _ncolumns; ++i) _mean[i] = _mean[i] / (_numRows - _invalidValues[i]);
コード例 #8
ファイル: PersistNFS.java プロジェクト: 480Oswego2013/h2o
  // Builds a Value for one chunk of a file-backed array using only the file's length; no file
  // contents are read here. Returns null when the chunk at a non-zero offset would hold fewer than
  // CHUNK_SZ bytes.
  static Value lazyArrayChunk(Key key) {
    Key arykey = ValueArray.getArrayKey(key); // From the base file key
    long off = ValueArray.getChunkOffset(key); // The offset
    long size = getFileForKey(arykey).length();
    long rem = size - off; // Bytes from this chunk's offset to the end of the file

    // the last chunk can be fat, so it got packed into the earlier chunk
    if (rem < ValueArray.CHUNK_SZ && off > 0) return null;
    // A chunk is exactly CHUNK_SZ bytes unless fewer than two chunks' worth remain, in which case it
    // takes everything that is left.
    int sz = (rem >= ValueArray.CHUNK_SZ * 2) ? (int) ValueArray.CHUNK_SZ : (int) rem;
    Value val = new Value(key, sz, Value.NFS);
    val.setdsk(); // But it's already on disk.
    return val;
コード例 #9
ファイル: UKV.java プロジェクト: patricktoohey/h2o
 // Recursively remove, gathering all the pending remote key-deletes
 //   key - key to delete
 //   fs  - passed to every nested remove and to DKV.remove
 // NOTE(review): the meaning of the second argument to DKV.get (32) is not visible here -- confirm.
 private static void remove(Key key, Futures fs) {
   Value val = DKV.get(key, 32); // Get the existing Value, if any
   if (val == null) return; // Trivial delete
   if (val._isArray != 0) { // See if this is an Array
     ValueArray ary = ValueArray.value(val);
     for (long i = 0; i < ary.chunks(); i++) // Delete all the chunks
     remove(ary.getChunkKey(i), fs);
   // For a key-of-keys, every key returned by val.flatten() is removed as well.
   if (key._kb[0] == Key.KEY_OF_KEYS) // Key-of-keys?
   for (Key k : val.flatten()) // Then recursively delete
     remove(k, fs);
   // Finally delete the key itself.
   DKV.remove(key, fs);
コード例 #10
ファイル: RFView.java プロジェクト: jayfans3/h2o
 public static Response redirect(JsonObject fromPageResponse, Key rfModelKey) {
   RFModel rfModel = DKV.get(rfModelKey).get();
   ValueArray data = DKV.get(rfModel._dataKey).get();
   return redirect(
       data.numCols() - 1,
コード例 #11
ファイル: RandomForestTest.java プロジェクト: NidhiMehta/h2o
  /*@org.junit.Test*/ public void covtype() {
    // Key okey = loadAndParseFile("covtype.hex", "smalldata/covtype/covtype.20k.data");
    // Key okey = loadAndParseFile("covtype.hex", "../datasets/UCI/UCI-large/covtype/covtype.data");
    // Key okey = loadAndParseFile("covtype.hex", "/home/0xdiag/datasets/standard/covtype.data");
    Key okey = loadAndParseFile("mnist.hex", "smalldata/mnist/mnist8m.10k.csv.gz");
    // Key okey = loadAndParseFile("mnist.hex", "/home/0xdiag/datasets/mnist/mnist8m.csv");
    ValueArray val = UKV.get(okey);

    // setup default values for DRF
    int ntrees = 8;
    int depth = 999;
    int gini = StatType.ENTROPY.ordinal();
    int seed = 42;
    StatType statType = StatType.values()[gini];
    final int cols[] = new int[val.numCols()];
    for (int i = 1; i < cols.length; i++) cols[i] = i - 1;
    cols[cols.length - 1] = 0; // Class is in column 0 for mnist

    // Start the distributed Random Forest
    final Key modelKey = Key.make("model");
    DRFJob result =
    // Wait for completion on all nodes
    RFModel model = result.get();

    assertEquals("Number of classes", 10, model.classes());
    assertEquals("Number of trees", ntrees, model.size());

コード例 #12
ファイル: UKV.java プロジェクト: patricktoohey/h2o
 // Stores val under key via DKV.put, then cleans up whatever DKV.put returned (res): the chunks of
 // an array, the keys listed by a key-of-keys, and finally res's memory.
 //   key - key to store under
 //   val - value to store
 //   fs  - passed to DKV.put and to every removal issued here
 public static void put(Key key, Value val, Futures fs) {
   Value res = DKV.put(key, val, fs);
   // If the old Value was a large array, we need to delete the leftover
   // chunks - they are unrelated to the new Value which might be either
   // bigger or smaller than the old Value.
   if (res != null && res._isArray != 0) {
     ValueArray ary = ValueArray.value(res);
     for (long i = 0; i < ary.chunks(); i++) // Delete all the chunks
     DKV.remove(ary.getChunkKey(i), fs);
   // NOTE(review): as shown, res is dereferenced below without the null check used above and
   // below -- confirm whether a guard was lost or res cannot be null for a key-of-keys.
   if (key._kb[0] == Key.KEY_OF_KEYS) // Key-of-keys?
   for (Key k : res.flatten()) // Then recursively delete
     remove(k, fs);
   if (res != null) res.freeMem();
コード例 #13
ファイル: PersistHdfs.java プロジェクト: pragnesh/h2o
 public byte[] load(final Value v) {
   final byte[] b = MemoryManager.malloc1(v._max);
   long skip = 0;
   Key k = v._key;
   final Path p;
   if (_iceRoot != null) {
     p = new Path(_iceRoot, getIceName(v));
   } else {
     // Convert an arraylet chunk into a long-offset from the base file.
     if (k._kb[0] == Key.ARRAYLET_CHUNK) {
       skip = ValueArray.getChunkOffset(k); // The offset
       k = ValueArray.getArrayKey(k); // From the base file key
       if (k.toString().endsWith(Extensions.HEX)) { // Hex file?
         int value_len = DKV.get(k).memOrLoad().length; // How long is the ValueArray header?
         skip += value_len;
     p = new Path(k.toString());
   final long skip_ = skip;
       new Callable() {
         public Object call() throws Exception {
           FileSystem fs = FileSystem.get(p.toUri(), CONF);
           FSDataInputStream s = null;
           try {
             s = fs.open(p);
             // NOTE:
             // The following line degrades performance of HDFS load from S3 API:
             // s.readFully(skip,b,0,b.length);
             // Google API's simple seek has better performance
             // Load of 300MB file via Google API ~ 14sec, via s.readFully ~ 5min (under the same
             // condition)
             ByteStreams.skipFully(s, skip_);
             ByteStreams.readFully(s, b);
             assert v.isPersisted();
           } finally {
           return null;
   return b;
コード例 #14
ファイル: DParseTask.java プロジェクト: patricktoohey/h2o
  * Stores the stream to its chunk using the atomic union. After the data from the stream is
  * stored, its memory is freed up.
 public void store() {
   assert _ab.eof();
   Key k = ValueArray.getChunkKey(_chunkIndex, _resultKey);
   AtomicUnion u = new AtomicUnion(_ab.bufClose(), _chunkOffset);
   _ab = null; // free mem
コード例 #15
ファイル: TaskStore2HDFS.java プロジェクト: pwaila/h2o
  public static String store2Hdfs(Key srcKey) {
    assert srcKey._kb[0] != Key.ARRAYLET_CHUNK;
    assert PersistHdfs.getPathForKey(srcKey) != null; // Validate key name
    Value v = DKV.get(srcKey);
    if (v == null) return "Key " + srcKey + " not found";
    if (!v.isArray()) { // Simple chunk?
      v.setHdfs(); // Set to HDFS and be done
      return null; // Success

    // For ValueArrays, make the .hex header
    ValueArray ary = v.get();
    String err = PersistHdfs.freeze(srcKey, ary);
    if (err != null) return err;

    // The task managing which chunks to write next,
    // store in a known key
    TaskStore2HDFS ts = new TaskStore2HDFS(srcKey);
    Key selfKey = ts.selfKey();
    UKV.put(selfKey, ts);

    // Then start writing chunks in-order with the zero chunk
    H2ONode chk0_home = ValueArray.getChunkKey(0, srcKey).home_node();
    RPC.call(ts.chunkHome(), ts);

    // Watch the progress key until it gets removed or an error appears
    long idx = 0;
    while ((ts = UKV.get(selfKey, TaskStore2HDFS.class)) != null) {
      if (ts._indexFrom != idx) {
        System.out.print(" " + idx + "/" + ary.chunks());
        idx = ts._indexFrom;
      if (ts._err != null) { // Found an error?
        UKV.remove(selfKey); // Cleanup & report
        return ts._err;
      try {
      } catch (InterruptedException e) {
    System.out.println(" " + ary.chunks() + "/" + ary.chunks());

    // PersistHdfs.refreshHDFSKeys();
    return null;
コード例 #16
ファイル: RBigDataTest.java プロジェクト: nadya1/h2o
 // Checks six values of the ValueArray stored under k, all read with a second datad argument of 0
 // and compared with a tolerance of 0.0: rows 0, 1, 2 against n1, n2, n3, and the last three rows
 // against nx3, nx2, nx1 (nx1 being the very last row).
 // NOTE(review): if this is JUnit's assertEquals(expected, actual, delta), the actual value is
 // passed first here; that would only affect the failure message -- confirm.
 protected void testKeyValues(
     Key k, double n1, double n2, double n3, double nx3, double nx2, double nx1) {
   ValueArray v = ValueArray.value(k);
   assertEquals(v.datad(0, 0), n1, 0.0);
   assertEquals(v.datad(1, 0), n2, 0.0);
   assertEquals(v.datad(2, 0), n3, 0.0);
   assertEquals(v.datad(v.numRows() - 3, 0), nx3, 0.0);
   assertEquals(v.datad(v.numRows() - 2, 0), nx2, 0.0);
   assertEquals(v.datad(v.numRows() - 1, 0), nx1, 0.0);
コード例 #17
ファイル: KMeansModel.java プロジェクト: raghavendrabhat/h2o
 // Creates a KMeansScore configured from the given model and array and returns it.
 //   model - supplies the column mapping, the clusters and the normalized flag
 //   ary   - supplies the array key and the column names used for the mapping
 // NOTE(review): nothing in the visible code runs the task before it is returned -- confirm
 // whether an invocation belongs here.
 public static KMeansScore score(KMeansModel model, ValueArray ary) {
   KMeansScore kms = new KMeansScore();
   kms._arykey = ary._key;
   kms._cols = model.columnMapping(ary.colNames());
   kms._clusters = model._clusters;
   kms._normalized = model._normalized;
   return kms;
コード例 #18
ファイル: UKV.java プロジェクト: patricktoohey/h2o
 // User-Weak-Get a Key from the distributed cloud.
 // Right now, just gets chunk#0 from a ValueArray, or a normal Value otherwise.
 // Returns whatever DKV.get(key) returned (possibly null) when that Value is null or not an array.
 public static Value get(Key key) {
   Value val = DKV.get(key);
   if (val != null && val._isArray != 0) {
     // Array: look up the Value stored under chunk 0's key instead.
     Key k2 = ValueArray.getChunkKey(0, key);
     Value vchunk0 = DKV.get(k2);
     assert vchunk0 != null : "missed looking for key " + k2 + " from " + key;
     return vchunk0; // Else just get the prefix asked for
   return val;
コード例 #19
ファイル: KMeansModel.java プロジェクト: raghavendrabhat/h2o
 public void map(Key key) {
   _rows = new long[_clusters.length];
   _dist = new double[_clusters.length];
   assert key.home();
   ValueArray va = DKV.get(_arykey).get();
   AutoBuffer bits = va.getChunk(key);
   int rows = va.rpc(ValueArray.getChunkIndex(key));
   double[] values = new double[_cols.length - 1];
   ClusterDist cd = new ClusterDist();
   for (int row = 0; row < rows; row++) {
     KMeans.datad(va, bits, row, _cols, _normalized, values);
     KMeans.closest(_clusters, values, cd);
     _dist[cd._cluster] += cd._dist;
   _arykey = null;
   _cols = null;
   _clusters = null;
コード例 #20
ファイル: PersistHdfs.java プロジェクト: jayfans3/h2o
 public byte[] load(final Value v) {
   final byte[] b = MemoryManager.malloc1(v._max);
   long skip = 0;
   Key k = v._key;
   if (k._kb[0] == Key.ARRAYLET_CHUNK) {
     skip = ValueArray.getChunkOffset(k); // The offset
     k = ValueArray.getArrayKey(k); // From the base file key
   } else if (k._kb[0] == Key.DVEC) {
     skip = water.fvec.NFSFileVec.chunkOffset(k); // The offset
   final Path p =
       _iceRoot == null ? new Path(getPathForKey(k)) : new Path(_iceRoot, getIceName(v));
   final long skip_ = skip;
       new Callable() {
         public Object call() throws Exception {
           FileSystem fs = FileSystem.get(p.toUri(), CONF);
           FSDataInputStream s = null;
           try {
             s = fs.open(p);
             // NOTE:
             // The following line degrades performance of HDFS load from S3 API:
             // s.readFully(skip,b,0,b.length);
             // Google API's simple seek has better performance
             // Load of 300MB file via Google API ~ 14sec, via s.readFully ~ 5min (under the same
             // condition)
             ByteStreams.skipFully(s, skip_);
             ByteStreams.readFully(s, b);
             assert v.isPersisted();
           } finally {
           return null;
   return b;
コード例 #21
ファイル: TaskStore2HDFS.java プロジェクト: patricktoohey/h2o
  public void compute() {
    String path = null; // getPathFromValue(val);
    ValueArray ary = ValueArray.value(_arykey);
    Key self = selfKey();

    while (_indexFrom < ary.chunks()) {
      Key ckey = ary.getChunkKey(_indexFrom++);
      if (!ckey.home()) { // Next chunk not At Home?
        RPC.call(chunkHome(), this); // Hand the baton off to the next node/chunk
      Value val = DKV.get(ckey); // It IS home, so get the data
      _err = PersistHdfs.appendChunk(_arykey, val);
      if (_err != null) return;
      UKV.put(self, this); // Update the progress/self key
    // We did the last chunk.  Removing the selfKey is the signal to the web
    // thread that All Done.
コード例 #22
ファイル: OldModel.java プロジェクト: hihihippp/h2o
 // Bridge from new Model scoring to old Model scoring
 // Wraps this OldModel in an anonymous Model whose score0 delegates to OldModel.this.score0, then
 // scores the given frame with that wrapper and returns the result.
 public Frame score(Frame data) {
   final double threshold = getThreshold();
   String[][] ds = _va.domains();
   // If the last column has no domain but a threshold is set, give it the two-level domain
   // {"F", "T"}. This writes into the array returned by _va.domains().
   if (ds[ds.length - 1] == null && !Double.isNaN(threshold)) {
     // This is a binomial classifier
     ds[ds.length - 1] = new String[] {"F", "T"};
   Model m =
       new Model(null, null, _va.colNames(), ds) {
         // Fills preds from the single old-model score s: one slot gets s itself; two slots get
         // 1 - s and s. Any other length trips the assert.
         protected float[] score0(double data[ /*ncols*/], float preds[ /*nclasses*/]) {
           float s = (float) OldModel.this.score0(data);
           if (preds.length == 1) preds[0] = s;
           else {
             assert preds.length == 2;
             preds[0] = 1 - s;
             preds[1] = s;
           return preds;
   return m.score(data);
コード例 #23
ファイル: KMeansModel.java プロジェクト: raghavendrabhat/h2o
 private void updateClusters(
     int[] clusters, int count, long chunk, long numrows, int rpc, long updatedRow) {
   final int offset = (int) (updatedRow - (rpc * chunk));
   final Key chunkKey = ValueArray.getChunkKey(chunk, _job.dest());
   final int[] message;
   if (count == clusters.length) message = clusters;
   else {
     message = new int[count];
     System.arraycopy(clusters, 0, message, 0, message.length);
   final int rows = ValueArray.rpc(chunk, rpc, numrows);
   new Atomic() {
     public Value atomic(Value val) {
       assert val == null || val._key.equals(chunkKey);
       AutoBuffer b = new AutoBuffer(rows * ROW_SIZE);
       if (val != null) b._bb.put(val.memOrLoad());
       for (int i = 0; i < message.length; i++) b.put4((offset + i) * 4, message[i]);
       return new Value(chunkKey, b.buf());
コード例 #24
ファイル: OldModel.java プロジェクト: hihihippp/h2o
  * Adapt model for the given dataset. Default behavior is to map columns and categoricals to their
  * original indexes. Categorical values we have not seen when building the model are translated as
  * NaN.
  * <p>Override this to get custom adapt behavior (eg. handle unseen cats differently).
  * @param ary - test dataset
  * @return OldModel - model adapted to be applied on the given data
 // Builds a ModelDataAdaptor that lets this model be applied to ary.
 // Throws IllegalArgumentException when isCompatible rejects the column mapping.
 public OldModel adapt(ValueArray ary) {
   // Stays true only while no enum column needs its domain remapped.
   boolean id = true;
   final int[] colMap = columnMapping(ary.colNames());
   if (!isCompatible(colMap))
     throw new IllegalArgumentException("This model uses different columns than those provided");
   int[][] catMap = new int[colMap.length][];
   // The last colMap entry is skipped here; it is handed to ModelDataAdaptor separately below.
   for (int i = 0; i < colMap.length - 1; ++i) {
     Column c = ary._cols[colMap[i]];
     // Enum column whose domain differs from the model's: record, for each value of the dataset's
     // domain, what find returns for it in the model's domain.
     if (c.isEnum() && !Arrays.deepEquals(_va._cols[i]._domain, c._domain)) {
       id = false;
       catMap[i] = new int[c._domain.length];
       for (int j = 0; j < c._domain.length; ++j)
         catMap[i][j] = find(c._domain[j], _va._cols[i]._domain);
   // No domain remapping and identityMap(colMap) holds: drop the categorical map entirely.
   if (id && identityMap(colMap)) catMap = null;
   return new ModelDataAdaptor(
       this, colMap[colMap.length - 1], Arrays.copyOf(colMap, colMap.length - 1), catMap);
コード例 #25
ファイル: KMeansModel.java プロジェクト: raghavendrabhat/h2o
    public static Job run(final Key dest, final KMeansModel model, final ValueArray ary) {
      final ChunkProgressJob job = new ChunkProgressJob(ary.chunks(), dest);
      new ValueArray(dest, 0).delete_and_lock(job.self());
      final H2OCountedCompleter fjtask =
          new H2OCountedCompleter() {
            public void compute2() {
              KMeansApply kms = new KMeansApply();
              kms._job = job;
              kms._arykey = ary._key;
              kms._cols = model.columnMapping(ary.colNames());
              kms._clusters = model._clusters;
              kms._normalized = model._normalized;

              Column c = new Column();
              c._name = Constants.RESPONSE;
              c._size = ROW_SIZE;
              c._scale = 1;
              c._min = 0;
              c._max = model._clusters.length;
              c._mean = Double.NaN;
              c._sigma = Double.NaN;
              c._domain = null;
              c._n = ary.numRows();
              ValueArray res = new ValueArray(dest, ary.numRows(), c._size, new Column[] {c});

            public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
              return super.onExceptionalCompletion(ex, caller);
      return job;
コード例 #26
ファイル: TaskStore2HDFS.java プロジェクト: patricktoohey/h2o
 // Returns the home node of the key for chunk number _indexFrom of the array _arykey.
 private H2ONode chunkHome() {
   return ValueArray.getChunkKey(_indexFrom, _arykey).home_node();
コード例 #27
ファイル: KMeansModel.java プロジェクト: raghavendrabhat/h2o
  * Creates a new ValueArray with classes. Unfortunately, the new ValueArray is not aligned with the
  * source one, so results have to be sent to each chunk owner using Atomic.
 // Processes one chunk (key) of the source array: finds the closest cluster for every row and
 // hands the cluster indices to updateClusters, batched per destination chunk. Does no row work
 // unless Job.isRunning(_job.self()) is true. Asserts that key is homed on this node.
 public void map(Key key) {
   assert key.home();
   if (Job.isRunning(_job.self())) {
     ValueArray va = DKV.get(_arykey).get();
     AutoBuffer bits = va.getChunk(key);
     long startRow = va.startRow(ValueArray.getChunkIndex(key));
     int rows = va.rpc(ValueArray.getChunkIndex(key));
     // How many ROW_SIZE-sized rows fit in CHUNK_SZ.
     int rpc = (int) (ValueArray.CHUNK_SZ / ROW_SIZE);
     // Chunk number computed for the first row using ROW_SIZE (the row size used for the output).
     long chunk = ValueArray.chknum(startRow, va.numRows(), ROW_SIZE);
     // Chunk number and first row of the batch currently buffered in clusters.
     long updatedChk = chunk;
     long updatedRow = startRow;
     double[] values = new double[_cols.length - 1];
     ClusterDist cd = new ClusterDist();
     int[] clusters = new int[rows];
     int count = 0; // Number of buffered entries in clusters
     for (int row = 0; row < rows; row++) {
       KMeans.datad(va, bits, row, _cols, _normalized, values);
       KMeans.closest(_clusters, values, cd);
       chunk = ValueArray.chknum(startRow + row, va.numRows(), ROW_SIZE);
       // Crossed into a different destination chunk: flush the batch and start a new one.
       if (chunk != updatedChk) {
         updateClusters(clusters, count, updatedChk, va.numRows(), rpc, updatedRow);
         updatedChk = chunk;
         updatedRow = startRow + row;
         count = 0;
       clusters[count++] = cd._cluster;
     // Flush whatever is still buffered for the last destination chunk.
     if (count > 0) updateClusters(clusters, count, chunk, va.numRows(), rpc, updatedRow);
   // Clear the task's input fields; presumably so they are not sent back with the result --
   // TODO confirm.
   _job = null;
   _arykey = null;
   _cols = null;
   _clusters = null;
コード例 #28
ファイル: GLMRunner.java プロジェクト: NidhiMehta/h2o
  * Simple GLM wrapper to enable launching GLM from command line.
  * <p>Example input: java -jar target/h2o.jar -name=test -runMethod water.util.GLMRunner
  * -file=smalldata/logreg/prostate.csv -y=CAPSULE -family=binomial
  * @param args
  * @throws InterruptedException
 public static void main(String[] args) throws InterruptedException {
   try {
     GLMArgs ARGS = new GLMArgs();
     new Arguments(args).extract(ARGS);
     System.out.println("==================<GLMRunner START>===================");
     ValueArray ary = Utils.loadAndParseKey(ARGS.file);
     int ycol;
     try {
       ycol = Integer.parseInt(ARGS.y);
     } catch (NumberFormatException e) {
       ycol = ary.getColumnIds(new String[] {ARGS.y})[0];
     int ncols = ary.numCols();
     if (ycol < 0 || ycol >= ary.numCols()) {
       System.err.println("invalid y column: " + ycol);
     int[] xcols;
     if (ARGS.xs.equalsIgnoreCase("all")) {
       xcols = new int[ncols - 1];
       for (int i = 0; i < ycol; ++i) xcols[i] = i;
       for (int i = ycol; i < ncols - 1; ++i) xcols[i] = i + 1;
     } else {
       System.out.println("xs = " + ARGS.xs);
       String[] names = ARGS.xs.split(",");
       xcols = new int[names.length];
       try {
         for (int i = 0; i < names.length; ++i) xcols[i] = Integer.valueOf(names[i]);
       } catch (NumberFormatException e) {
         xcols = ary.getColumnIds(ARGS.xs.split(","));
     for (int x : xcols)
       if (x < 0) {
         System.err.println("Invalid predictor specification " + ARGS.xs);
     GLMJob j =
             DGLM.getData(ary, xcols, ycol, null, true),
             new ADMMSolver(ARGS.lambda, ARGS._alpha),
             new GLMParams(Family.valueOf(ARGS.family)),
     System.out.print("[GLM] computing model...");
     int progress = 0;
     while (!j.isDone()) {
       int p = (int) (100 * j.progress());
       int dots = p - progress;
       progress = p;
       for (int i = 0; i < dots; ++i) System.out.print('.');
     Log.debug(Sys.GENLM, "DONE.");
     GLMModel m = j.get();
     String[] colnames = ary.colNames();
     System.out.println("Intercept" + " = " + m._beta[ncols - 1]);
     for (int i = 0; i < xcols.length; ++i) {
       System.out.println(colnames[i] + " = " + m._beta[i]);
   } catch (Throwable t) {
   } finally { // we're done. shutdown the cloud
     Log.debug(Sys.GENLM, "==================<GLMRunner DONE>===================");
     UDPRebooted.suicide(UDPRebooted.T.shutdown, H2O.SELF);
コード例 #29
ファイル: DParseTask.java プロジェクト: patricktoohey/h2o
   * Map function for distributed parsing of the CSV files.
   * <p>In first phase it calculates the min, max, means, encodings and other statistics about the
   * dataset, determines the number of columns.
   * <p>The second pass then encodes the parsed dataset to the result key, splitting it into equal
   * sized chunks.
  public void map(Key key) {
    try {
      Key aryKey = null;
      boolean arraylet = key._kb[0] == Key.ARRAYLET_CHUNK;
      boolean skipFirstLine = _skipFirstLine;
      if (arraylet) {
        aryKey = ValueArray.getArrayKey(key);
        _chunkId = ValueArray.getChunkIndex(key);
        skipFirstLine = skipFirstLine || (ValueArray.getChunkIndex(key) != 0);
      switch (_phase) {
        case ONE:
          assert (_ncolumns != 0);
          // initialize the column statistics
          // perform the parse
          CsvParser p = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
          if (arraylet) {
            long idx = ValueArray.getChunkIndex(key);
            int idx2 = (int) idx;
            assert idx2 == idx;
            assert (_nrows[idx2] == 0)
                : idx
                    + ": "
                    + Arrays.toString(_nrows)
                    + " ("
                    + _nrows[idx2]
                    + " -- "
                    + _myrows
                    + ")";
            _nrows[idx2] = _myrows;
        case TWO:
          assert (_ncolumns != 0);
          // initialize statistics - invalid rows, sigma and row size
          // calculate the first row and the number of rows to parse
          int firstRow = 0;
          int lastRow = _myrows;
          _myrows = 0;
          if (arraylet) {
            long origChunkIdx = ValueArray.getChunkIndex(key);
            firstRow = (origChunkIdx == 0) ? 0 : _nrows[(int) origChunkIdx - 1];
            lastRow = _nrows[(int) origChunkIdx];
          int rowsToParse = lastRow - firstRow;
          // create the output streams
          _outputStreams2 = createRecords(firstRow, rowsToParse);
          assert (_outputStreams2.length > 0);
          _ab = _outputStreams2[0].initialize();
          // perform the second parse pass
          CsvParser p2 = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
          // store the last stream if not stored during the parse
          if (_ab != null) _outputStreams2[_outputIdx].store();
          assert (false);

      ParseStatus.update(_resultKey, DKV.get(key).length(), _phase);
    } catch (Exception e) {
      _error = e.getMessage();
コード例 #30
ファイル: RBigDataTest.java プロジェクト: nadya1/h2o
 // Checks that the ValueArray stored under k has the given number of rows and columns.
 // NOTE(review): if this is JUnit's assertEquals(expected, actual), the actual value is passed
 // first here; that would only affect the failure message -- confirm.
 public void testDataFrameStructure(Key k, int rows, int cols) {
   ValueArray v = ValueArray.value(k);
   assertEquals(v.numRows(), rows);
   assertEquals(v.numCols(), cols);