Example #1
 /**
  * Creates the value header based on the calculated columns.
  *
  * <p>Also stores the header under its appropriate key. This will be the VA header of the parsed
  * dataset.
  */
 private void createValueArrayHeader() {
   assert (_phase == Pass.TWO);
   Column[] cols = new Column[_ncolumns];
   int off = 0;
   for (int i = 0; i < cols.length; ++i) {
     cols[i] = new Column();
     cols[i]._n = _numRows - _invalidValues[i];
     cols[i]._base = _bases[i];
     assert (char) pow10i(-_scale[i]) == pow10i(-_scale[i])
         : "scale out of bounds!, col = " + i + ", scale = " + _scale[i];
     cols[i]._scale = (char) pow10i(-_scale[i]);
     cols[i]._off = (char) off;
     cols[i]._size = (byte) COL_SIZES[_colTypes[i]];
     cols[i]._domain = _colDomains[i];
     cols[i]._max = _max[i];
     cols[i]._min = _min[i];
     cols[i]._mean = _mean[i];
     cols[i]._sigma = _sigma[i];
     cols[i]._name = _colNames[i];
     off += Math.abs(cols[i]._size);
   }
   // let any pending progress reports finish
   DKV.write_barrier();
   // finally make the value array header
   ValueArray ary = new ValueArray(_resultKey, _numRows, off, cols);
   UKV.put(_resultKey, ary.value());
 }
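
A minimal, illustrative sketch (not part of the source) of how a header written this way is read back elsewhere in these examples (compare Example #4 and Example #23); ValueArray.value, numRows, numCols and datad are the calls shown there, and _resultKey is the key used above:

   ValueArray ary = ValueArray.value(_resultKey); // fetch the VA header stored by UKV.put above
   long rows = ary.numRows(); // row count accumulated during the parse
   int cols = ary.numCols(); // one Column per parsed column
   double cell = ary.datad(0, 0); // decoded value of row 0, column 0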
Example #2
 @Override
 public Value lazyArrayChunk(final Key key) {
   final Key arykey = ValueArray.getArrayKey(key); // From the base file key
   final long off = (_iceRoot != null) ? 0 : ValueArray.getChunkOffset(key); // The offset
   final Path p =
       (_iceRoot != null)
           ? new Path(_iceRoot, getIceName(key, (byte) 'V'))
           : new Path(arykey.toString());
   final Size sz = new Size();
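    // Mutable holder so the anonymous Callable below can report the chunk size it computes.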
   run(
       new Callable() {
         @Override
         public Object call() throws Exception {
           FileSystem fs = FileSystem.get(p.toUri(), CONF);
           long rem = fs.getFileStatus(p).getLen() - off;
           sz._value = (rem > ValueArray.CHUNK_SZ * 2) ? (int) ValueArray.CHUNK_SZ : (int) rem;
           return null;
         }
       },
       true,
       0);
   Value val = new Value(key, sz._value, Value.HDFS);
   val.setdsk(); // But it's already on disk.
   return val;
 }
Example #3
 // Read up to 'v._max' bytes of the Value. Value should already be persisted to
 // disk.  A racing delete can trigger a failure where we get a null return,
 // but no crash (although one could argue that a racing load&delete is a bug
 // no matter what).
 @Override
 public byte[] load(Value v) {
   long skip = 0;
   Key k = v._key;
   // Convert an arraylet chunk into a long-offset from the base file.
   if (k._kb[0] == Key.ARRAYLET_CHUNK) {
     skip = ValueArray.getChunkOffset(k); // The offset
     k = ValueArray.getArrayKey(k); // From the base file key
   }
   if (k._kb[0] == Key.DVEC) {
     skip = water.fvec.NFSFileVec.chunkOffset(k); // The offset
   }
   try {
     FileInputStream s = null;
     try {
       s = new FileInputStream(getFileForKey(k));
       FileChannel fc = s.getChannel();
       fc.position(skip);
       AutoBuffer ab = new AutoBuffer(fc, true, Value.NFS);
       byte[] b = ab.getA1(v._max);
       ab.close();
       assert v.isPersisted();
       return b;
     } finally {
       if (s != null) s.close();
     }
   } catch (IOException e) { // Broken disk / short-file???
     H2O.ignore(e);
     return null;
   }
 }
Example #4
 protected void testScalarExpression(String expr, double result) {
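    // Evaluate the expression, check that the result is a 1x1 ValueArray whose only cell equals
    // 'result', then free the key.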
   Key key = executeExpression(expr);
   ValueArray va = ValueArray.value(key);
   assertEquals(va.numRows(), 1);
   assertEquals(va.numCols(), 1);
   assertEquals(result, va.datad(0, 0), 0.0);
   UKV.remove(key);
 }
Example #5
  // Test kaggle/creditsample-test data
  @org.junit.Test
  public void kaggle_credit() {
    Key okey = loadAndParseFile("credit.hex", "smalldata/kaggle/creditsample-training.csv.gz");
    UKV.remove(Key.make("smalldata/kaggle/creditsample-training.csv.gz_UNZIPPED"));
    UKV.remove(Key.make("smalldata\\kaggle\\creditsample-training.csv.gz_UNZIPPED"));
    ValueArray val = DKV.get(okey).get();

    // Check parsed dataset
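     // The expected chunk count depends on the configured chunk size (indexed by
     // ValueArray.LOG_CHK): LOG_CHK of 20, 21 or 22 gives 4, 2 or 1 chunks for this file.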
    final int n = new int[] {4, 2, 1}[ValueArray.LOG_CHK - 20];
    assertEquals("Number of chunks", n, val.chunks());
    assertEquals("Number of rows", 150000, val.numRows());
    assertEquals("Number of cols", 12, val.numCols());

    // setup default values for DRF
    int ntrees = 3;
    int depth = 30;
    int gini = StatType.GINI.ordinal();
    int seed = 42;
    StatType statType = StatType.values()[gini];
    final int cols[] =
        new int[] {0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 1}; // ignore column 6, classify column 1

    // Start the distributed Random Forest
    final Key modelKey = Key.make("model");
    DRFJob result =
        hex.rf.DRF.execute(
            modelKey,
            cols,
            val,
            ntrees,
            depth,
            1024,
            statType,
            seed,
            true,
            null,
            -1,
            Sampling.Strategy.RANDOM,
            1.0f,
            null,
            0,
            0,
            false);
    // Wait for completion on all nodes
    RFModel model = result.get();

    assertEquals("Number of classes", 2, model.classes());
    assertEquals("Number of trees", ntrees, model.size());

    model.deleteKeys();
    UKV.remove(modelKey);
    UKV.remove(okey);
  }
Example #6
 /**
  * Executes phase one of the parser.
  *
  * <p>First phase detects the encoding and basic statistics of the parsed dataset.
  *
  * <p>For CSV parsers it detects the parser setup and then launches the distributed computation on
  * a per-chunk basis.
  *
  * <p>For XLS and XLSX parsers, which do not work in a distributed way, it parses the whole dataset.
  *
  * @throws Exception
  */
 public void passOne(CsvParser.Setup setup) throws Exception {
   switch (_parserType) {
     case CSV:
       // precompute the parser setup, column setup and other settings
       byte[] bits = _sourceDataset.getFirstBytes(); // Can limit to eg 256*1024
       if (setup == null) setup = CsvParser.guessCsvSetup(bits);
       if (setup._data == null) {
         _error = "Unable to determine the separator or number of columns on the dataset";
         return;
       }
       _colNames = setup._data[0];
       setColumnNames(_colNames);
       _skipFirstLine = setup._header;
       // set the separator
       this._sep = setup._separator;
       // if parsing value array, initialize the nrows array
       if (_sourceDataset._isArray != 0) {
         ValueArray ary = ValueArray.value(_sourceDataset);
         _nrows = new int[(int) ary.chunks()];
       }
       // launch the distributed parser on its chunks.
       this.invoke(_sourceDataset._key);
       break;
     case XLS:
       // XLS parsing is not distributed, just obtain the value stream and
       // run the parser
       CustomParser p = new XlsParser(this);
       p.parse(_sourceDataset._key);
       --_myrows; // do not count the header
       break;
     case XLSX:
        // XLSX parsing is not distributed, just obtain the value stream and
        // run the parser
       CustomParser px = new XlsxParser(this);
       px.parse(_sourceDataset._key);
       break;
     default:
       throw new Error("NOT IMPLEMENTED");
   }
   // calculate proper numbers of rows for the chunks
   if (_nrows != null) {
     _numRows = 0;
     for (int i = 0; i < _nrows.length; ++i) {
       _numRows += _nrows[i];
       _nrows[i] = _numRows;
     }
   } else {
     _numRows = _myrows;
   }
   // normalize mean
   for (int i = 0; i < _ncolumns; ++i) _mean[i] = _mean[i] / (_numRows - _invalidValues[i]);
 }
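
The loop over _nrows above turns per-chunk row counts into a running total; phase two (Example #25) later derives each chunk's row range from adjacent entries. A tiny standalone illustration with made-up counts:

   int[] nrows = {100, 120, 80}; // rows parsed per chunk in pass one (hypothetical numbers)
   int numRows = 0;
   for (int i = 0; i < nrows.length; ++i) {
     numRows += nrows[i];
     nrows[i] = numRows; // nrows becomes {100, 220, 300}
   }
   // chunk 1 then covers rows [nrows[0], nrows[1]) == [100, 220)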
Example #7
  @Override
  public Value lazyArrayChunk(Key key) {
    Key arykey = ValueArray.getArrayKey(key); // From the base file key
    long off = ValueArray.getChunkOffset(key); // The offset
    long size = getFileForKey(arykey).length();
    long rem = size - off;

    // The last chunk can be fat: a short trailing remainder is packed into the preceding chunk,
    // so there is no separate chunk at this offset.
    if (rem < ValueArray.CHUNK_SZ && off > 0) return null;
    int sz = (rem >= ValueArray.CHUNK_SZ * 2) ? (int) ValueArray.CHUNK_SZ : (int) rem;
    Value val = new Value(key, sz, Value.NFS);
    val.setdsk(); // But it's already on disk.
    return val;
  }
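
The size computation above (and the similar one in Example #2) follows a fat-last-chunk rule: a chunk holds CHUNK_SZ bytes unless it is the last one, which absorbs any trailing remainder shorter than CHUNK_SZ and can therefore grow to just under 2 * CHUNK_SZ bytes. A small illustration with hypothetical numbers (a 1 MB chunk size and a 2.5 MB file are assumptions for the example):

   long CHUNK_SZ = 1 << 20; // assume 1 MB chunks for this example
   long len = CHUNK_SZ * 5 / 2; // a hypothetical 2.5 MB file
   // offset 0:            rem = 2.5 MB >= 2 * CHUNK_SZ -> size = CHUNK_SZ (1 MB)
   // offset CHUNK_SZ:     rem = 1.5 MB <  2 * CHUNK_SZ -> size = rem (fat last chunk, 1.5 MB)
   // offset 2 * CHUNK_SZ: rem = 0.5 MB <  CHUNK_SZ and off > 0 -> no chunk here (null)
   long off = CHUNK_SZ;
   long rem = len - off;
   int sz = (rem >= CHUNK_SZ * 2) ? (int) CHUNK_SZ : (int) rem; // 1.5 MB for the last chunk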
Example #8
 public static Response redirect(JsonObject fromPageResponse, Key rfModelKey) {
   RFModel rfModel = DKV.get(rfModelKey).get();
   ValueArray data = DKV.get(rfModel._dataKey).get();
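    // Redirect to the RF view for this model, passing the model's data key, its total tree count,
    // and the index of the data's last column.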
   return redirect(
       fromPageResponse,
       null,
       rfModelKey,
       rfModel._dataKey,
       rfModel._totalTrees,
       data.numCols() - 1,
       null,
       true,
       false);
 }
Example #9
  /*@org.junit.Test*/ public void covtype() {
    // Key okey = loadAndParseFile("covtype.hex", "smalldata/covtype/covtype.20k.data");
    // Key okey = loadAndParseFile("covtype.hex", "../datasets/UCI/UCI-large/covtype/covtype.data");
    // Key okey = loadAndParseFile("covtype.hex", "/home/0xdiag/datasets/standard/covtype.data");
    Key okey = loadAndParseFile("mnist.hex", "smalldata/mnist/mnist8m.10k.csv.gz");
    // Key okey = loadAndParseFile("mnist.hex", "/home/0xdiag/datasets/mnist/mnist8m.csv");
    ValueArray val = UKV.get(okey);

    // setup default values for DRF
    int ntrees = 8;
    int depth = 999;
    int gini = StatType.ENTROPY.ordinal();
    int seed = 42;
    StatType statType = StatType.values()[gini];
    final int cols[] = new int[val.numCols()];
    for (int i = 1; i < cols.length; i++) cols[i] = i - 1;
    cols[cols.length - 1] = 0; // Class is in column 0 for mnist

    // Start the distributed Random Forest
    final Key modelKey = Key.make("model");
    DRFJob result =
        hex.rf.DRF.execute(
            modelKey,
            cols,
            val,
            ntrees,
            depth,
            1024,
            statType,
            seed,
            true,
            null,
            -1,
            Sampling.Strategy.RANDOM,
            1.0f,
            null,
            0,
            0,
            false);
    // Wait for completion on all nodes
    RFModel model = result.get();

    assertEquals("Number of classes", 10, model.classes());
    assertEquals("Number of trees", ntrees, model.size());

    model.deleteKeys();
    UKV.remove(modelKey);
    UKV.remove(okey);
  }
Example #10
 @Override
 public byte[] load(final Value v) {
   final byte[] b = MemoryManager.malloc1(v._max);
   long skip = 0;
   Key k = v._key;
   final Path p;
   if (_iceRoot != null) {
     p = new Path(_iceRoot, getIceName(v));
   } else {
     // Convert an arraylet chunk into a long-offset from the base file.
     if (k._kb[0] == Key.ARRAYLET_CHUNK) {
       skip = ValueArray.getChunkOffset(k); // The offset
       k = ValueArray.getArrayKey(k); // From the base file key
       if (k.toString().endsWith(Extensions.HEX)) { // Hex file?
         int value_len = DKV.get(k).memOrLoad().length; // How long is the ValueArray header?
         skip += value_len;
       }
     }
     p = new Path(k.toString());
   }
   final long skip_ = skip;
   run(
       new Callable() {
         @Override
         public Object call() throws Exception {
           FileSystem fs = FileSystem.get(p.toUri(), CONF);
           FSDataInputStream s = null;
           try {
             s = fs.open(p);
             // NOTE:
             // The following line degrades performance of HDFS load from S3 API:
             // s.readFully(skip,b,0,b.length);
             // Google API's simple seek has better performance
             // Load of 300MB file via Google API ~ 14sec, via s.readFully ~ 5min (under the same
             // condition)
             ByteStreams.skipFully(s, skip_);
             ByteStreams.readFully(s, b);
             assert v.isPersisted();
           } finally {
             Utils.close(s);
           }
           return null;
         }
       },
       true,
       v._max);
   return b;
 }
Example #11
 /**
  * Stores the stream to its chunk using the atomic union. After the data from the stream is
  * stored, its memory is freed up.
  */
 public void store() {
   assert _ab.eof();
   Key k = ValueArray.getChunkKey(_chunkIndex, _resultKey);
   AtomicUnion u = new AtomicUnion(_ab.bufClose(), _chunkOffset);
   alsoBlockFor(u.fork(k));
   _ab = null; // free mem
 }
Example #12
 @Override
 protected Response serve() {
   InputStream s = null;
   String urlStr = _url.value();
   try {
     // if( urlStr.startsWith("file://") ) {
     // urlStr = urlStr.substring("file://".length());
     if (urlStr.startsWith("file:///")) {
       urlStr = urlStr.substring("file:///".length());
       File f = new File(urlStr);
       // urlStr = "file://"+f.getCanonicalPath();
       urlStr = "file:///" + f.getCanonicalPath();
     }
     URL url = new URL(urlStr);
     Key k = _key.value();
     if (k == null) k = Key.make(urlStr);
     s = url.openStream();
     if (s == null) return Response.error("Unable to open stream to URL " + url.toString());
     ValueArray.readPut(k, s);
     JsonObject json = new JsonObject();
     json.addProperty(KEY, k.toString());
     json.addProperty(URL, urlStr);
     Response r = Response.done(json);
     r.setBuilder(KEY, new KeyElementBuilder());
     return r;
   } catch (IllegalArgumentException e) {
     return Response.error("Not a valid key: " + urlStr);
   } catch (IOException e) {
     return Response.error(e);
   } finally {
     Closeables.closeQuietly(s);
   }
 }
Example #13
  public static String store2Hdfs(Key srcKey) {
    assert srcKey._kb[0] != Key.ARRAYLET_CHUNK;
    assert PersistHdfs.getPathForKey(srcKey) != null; // Validate key name
    Value v = DKV.get(srcKey);
    if (v == null) return "Key " + srcKey + " not found";
    if (!v.isArray()) { // Simple chunk?
      v.setHdfs(); // Set to HDFS and be done
      return null; // Success
    }

    // For ValueArrays, make the .hex header
    ValueArray ary = v.get();
    String err = PersistHdfs.freeze(srcKey, ary);
    if (err != null) return err;

    // The task that manages which chunks to write next,
    // stored under a known key
    TaskStore2HDFS ts = new TaskStore2HDFS(srcKey);
    Key selfKey = ts.selfKey();
    UKV.put(selfKey, ts);

    // Then start writing chunks in-order with the zero chunk
    H2ONode chk0_home = ValueArray.getChunkKey(0, srcKey).home_node();
    RPC.call(ts.chunkHome(), ts);

    // Watch the progress key until it gets removed or an error appears
    long idx = 0;
    while ((ts = UKV.get(selfKey, TaskStore2HDFS.class)) != null) {
      if (ts._indexFrom != idx) {
        System.out.print(" " + idx + "/" + ary.chunks());
        idx = ts._indexFrom;
      }
      if (ts._err != null) { // Found an error?
        UKV.remove(selfKey); // Cleanup & report
        return ts._err;
      }
      try {
        Thread.sleep(100);
      } catch (InterruptedException e) {
      }
    }
    System.out.println(" " + ary.chunks() + "/" + ary.chunks());

    // PersistHdfs.refreshHDFSKeys();
    return null;
  }
Example #14
 protected void testKeyValues(
     Key k, double n1, double n2, double n3, double nx3, double nx2, double nx1) {
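    // Spot-check column 0: the first three rows against n1..n3 and the last three rows against
    // nx3..nx1.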
   ValueArray v = ValueArray.value(k);
   assertEquals(v.datad(0, 0), n1, 0.0);
   assertEquals(v.datad(1, 0), n2, 0.0);
   assertEquals(v.datad(2, 0), n3, 0.0);
   assertEquals(v.datad(v.numRows() - 3, 0), nx3, 0.0);
   assertEquals(v.datad(v.numRows() - 2, 0), nx2, 0.0);
   assertEquals(v.datad(v.numRows() - 1, 0), nx1, 0.0);
 }
Example #15
 public static KMeansScore score(KMeansModel model, ValueArray ary) {
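    // Configure the scoring task from the model (cluster centers, column mapping, normalization
    // flag) and run it over the array's key.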
   KMeansScore kms = new KMeansScore();
   kms._arykey = ary._key;
   kms._cols = model.columnMapping(ary.colNames());
   kms._clusters = model._clusters;
   kms._normalized = model._normalized;
   kms.invoke(ary._key);
   return kms;
 }
Example #16
 @Override
 public void map(Key key) {
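    // Per-chunk scoring pass: for every row in this chunk, find the closest cluster, then count
    // rows and accumulate distances per cluster.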
   _rows = new long[_clusters.length];
   _dist = new double[_clusters.length];
   assert key.home();
   ValueArray va = DKV.get(_arykey).get();
   AutoBuffer bits = va.getChunk(key);
   int rows = va.rpc(ValueArray.getChunkIndex(key));
   double[] values = new double[_cols.length - 1];
   ClusterDist cd = new ClusterDist();
   for (int row = 0; row < rows; row++) {
     KMeans.datad(va, bits, row, _cols, _normalized, values);
     KMeans.closest(_clusters, values, cd);
     _rows[cd._cluster]++;
     _dist[cd._cluster] += cd._dist;
   }
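    // Clear the inputs; only the per-cluster counts and distances are needed in the result.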
   _arykey = null;
   _cols = null;
   _clusters = null;
 }
Example #17
 @Override
 public byte[] load(final Value v) {
   final byte[] b = MemoryManager.malloc1(v._max);
   long skip = 0;
   Key k = v._key;
   if (k._kb[0] == Key.ARRAYLET_CHUNK) {
     skip = ValueArray.getChunkOffset(k); // The offset
     k = ValueArray.getArrayKey(k); // From the base file key
   } else if (k._kb[0] == Key.DVEC) {
     skip = water.fvec.NFSFileVec.chunkOffset(k); // The offset
   }
   final Path p =
       _iceRoot == null ? new Path(getPathForKey(k)) : new Path(_iceRoot, getIceName(v));
   final long skip_ = skip;
   run(
       new Callable() {
         @Override
         public Object call() throws Exception {
           FileSystem fs = FileSystem.get(p.toUri(), CONF);
           FSDataInputStream s = null;
           try {
             s = fs.open(p);
             // NOTE:
             // The following line degrades performance of HDFS load from S3 API:
             // s.readFully(skip,b,0,b.length);
             // Google API's simple seek has better performance
             // Load of 300MB file via Google API ~ 14sec, via s.readFully ~ 5min (under the same
             // condition)
             ByteStreams.skipFully(s, skip_);
             ByteStreams.readFully(s, b);
             assert v.isPersisted();
           } finally {
             Utils.close(s);
           }
           return null;
         }
       },
       true,
       v._max);
   return b;
 }
Example #18
  @Override
  public void compute() {
    String path = null; // getPathFromValue(val);
    ValueArray ary = ValueArray.value(_arykey);
    Key self = selfKey();

    while (_indexFrom < ary.chunks()) {
      Key ckey = ary.getChunkKey(_indexFrom++);
      if (!ckey.home()) { // Next chunk not At Home?
        RPC.call(chunkHome(), this); // Hand the baton off to the next node/chunk
        return;
      }
      Value val = DKV.get(ckey); // It IS home, so get the data
      _err = PersistHdfs.appendChunk(_arykey, val);
      if (_err != null) return;
      UKV.put(self, this); // Update the progress/self key
    }
    // We did the last chunk.  Removing the selfKey is the signal to the web
    // thread that All Done.
    UKV.remove(self);
  }
Example #19
 private void updateClusters(
     int[] clusters, int count, long chunk, long numrows, int rpc, long updatedRow) {
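    // Write 'count' cluster assignments (one 4-byte int per row) into destination chunk 'chunk',
    // starting at row 'updatedRow'; 'rpc' is the number of rows per chunk.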
   final int offset = (int) (updatedRow - (rpc * chunk));
   final Key chunkKey = ValueArray.getChunkKey(chunk, _job.dest());
   final int[] message;
   if (count == clusters.length) message = clusters;
   else {
     message = new int[count];
     System.arraycopy(clusters, 0, message, 0, message.length);
   }
   final int rows = ValueArray.rpc(chunk, rpc, numrows);
   new Atomic() {
     @Override
     public Value atomic(Value val) {
       assert val == null || val._key.equals(chunkKey);
       AutoBuffer b = new AutoBuffer(rows * ROW_SIZE);
       if (val != null) b._bb.put(val.memOrLoad());
       for (int i = 0; i < message.length; i++) b.put4((offset + i) * 4, message[i]);
       b.position(b.limit());
       return new Value(chunkKey, b.buf());
     }
   }.invoke(chunkKey);
 }
Example #20
    public static Job run(final Key dest, final KMeansModel model, final ValueArray ary) {
      final ChunkProgressJob job = new ChunkProgressJob(ary.chunks(), dest);
      new ValueArray(dest, 0).delete_and_lock(job.self());
      final H2OCountedCompleter fjtask =
          new H2OCountedCompleter() {
            @Override
            public void compute2() {
              KMeansApply kms = new KMeansApply();
              kms._job = job;
              kms._arykey = ary._key;
              kms._cols = model.columnMapping(ary.colNames());
              kms._clusters = model._clusters;
              kms._normalized = model._normalized;
              kms.invoke(ary._key);

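              // Publish the result header: a single response column holding the assigned cluster
              // index for every row of the source array.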
              Column c = new Column();
              c._name = Constants.RESPONSE;
              c._size = ROW_SIZE;
              c._scale = 1;
              c._min = 0;
              c._max = model._clusters.length;
              c._mean = Double.NaN;
              c._sigma = Double.NaN;
              c._domain = null;
              c._n = ary.numRows();
              ValueArray res = new ValueArray(dest, ary.numRows(), c._size, new Column[] {c});
              res.unlock(job.self());
              job.remove();
              tryComplete();
            }

            @Override
            public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
              job.onException(ex);
              return super.onExceptionalCompletion(ex, caller);
            }
          };
      job.start(fjtask);
      H2O.submitTask(fjtask);
      return job;
    }
Example #21
 private H2ONode chunkHome() {
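   // Chunks are written by their home nodes (see Example #18), so find the node that owns the
   // next chunk and hand the task to it.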
   return ValueArray.getChunkKey(_indexFrom, _arykey).home_node();
 }
Example #22
 /**
  * Creates a new ValueArray with classes. Unfortunately the new ValueArray is not aligned with the
  * source one, so the results have to be sent to each chunk owner using Atomic.
  */
 @Override
 public void map(Key key) {
   assert key.home();
   if (Job.isRunning(_job.self())) {
     ValueArray va = DKV.get(_arykey).get();
     AutoBuffer bits = va.getChunk(key);
     long startRow = va.startRow(ValueArray.getChunkIndex(key));
     int rows = va.rpc(ValueArray.getChunkIndex(key));
     int rpc = (int) (ValueArray.CHUNK_SZ / ROW_SIZE);
     long chunk = ValueArray.chknum(startRow, va.numRows(), ROW_SIZE);
     long updatedChk = chunk;
     long updatedRow = startRow;
     double[] values = new double[_cols.length - 1];
     ClusterDist cd = new ClusterDist();
     int[] clusters = new int[rows];
     int count = 0;
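      // Buffer the assignments for the current destination chunk; flush them with updateClusters
      // whenever the row index crosses into the next chunk.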
     for (int row = 0; row < rows; row++) {
       KMeans.datad(va, bits, row, _cols, _normalized, values);
       KMeans.closest(_clusters, values, cd);
       chunk = ValueArray.chknum(startRow + row, va.numRows(), ROW_SIZE);
       if (chunk != updatedChk) {
         updateClusters(clusters, count, updatedChk, va.numRows(), rpc, updatedRow);
         updatedChk = chunk;
         updatedRow = startRow + row;
         count = 0;
       }
       clusters[count++] = cd._cluster;
     }
     if (count > 0) updateClusters(clusters, count, chunk, va.numRows(), rpc, updatedRow);
     _job.updateProgress(1);
   }
   _job = null;
   _arykey = null;
   _cols = null;
   _clusters = null;
 }
Example #23
 public void testDataFrameStructure(Key k, int rows, int cols) {
   ValueArray v = ValueArray.value(k);
   assertEquals(v.numRows(), rows);
   assertEquals(v.numCols(), cols);
 }
Example #24
 /**
  * Simple GLM wrapper to enable launching GLM from command line.
  *
  * <p>Example input: java -jar target/h2o.jar -name=test -runMethod water.util.GLMRunner
  * -file=smalldata/logreg/prostate.csv -y=CAPSULE -family=binomial
  *
  * @param args
  * @throws InterruptedException
  */
 public static void main(String[] args) throws InterruptedException {
   try {
     GLMArgs ARGS = new GLMArgs();
     new Arguments(args).extract(ARGS);
     System.out.println("==================<GLMRunner START>===================");
     ValueArray ary = Utils.loadAndParseKey(ARGS.file);
     int ycol;
     try {
       ycol = Integer.parseInt(ARGS.y);
     } catch (NumberFormatException e) {
       ycol = ary.getColumnIds(new String[] {ARGS.y})[0];
     }
     int ncols = ary.numCols();
     if (ycol < 0 || ycol >= ary.numCols()) {
       System.err.println("invalid y column: " + ycol);
       H2O.exit(-1);
     }
     int[] xcols;
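      // Predictor columns: either every column except y, or the user-supplied list (parsed as
      // indices first, falling back to column names).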
     if (ARGS.xs.equalsIgnoreCase("all")) {
       xcols = new int[ncols - 1];
       for (int i = 0; i < ycol; ++i) xcols[i] = i;
       for (int i = ycol; i < ncols - 1; ++i) xcols[i] = i + 1;
     } else {
       System.out.println("xs = " + ARGS.xs);
       String[] names = ARGS.xs.split(",");
       xcols = new int[names.length];
       try {
         for (int i = 0; i < names.length; ++i) xcols[i] = Integer.valueOf(names[i]);
       } catch (NumberFormatException e) {
         xcols = ary.getColumnIds(ARGS.xs.split(","));
       }
     }
     for (int x : xcols)
       if (x < 0) {
         System.err.println("Invalid predictor specification " + ARGS.xs);
         H2O.exit(-1);
       }
     GLMJob j =
         DGLM.startGLMJob(
             DGLM.getData(ary, xcols, ycol, null, true),
             new ADMMSolver(ARGS.lambda, ARGS._alpha),
             new GLMParams(Family.valueOf(ARGS.family)),
             null,
             ARGS.xval,
             true);
     System.out.print("[GLM] computing model...");
     int progress = 0;
     while (!j.isDone()) {
       int p = (int) (100 * j.progress());
       int dots = p - progress;
       progress = p;
       for (int i = 0; i < dots; ++i) System.out.print('.');
       Thread.sleep(250);
     }
     Log.debug(Sys.GENLM, "DONE.");
     GLMModel m = j.get();
     String[] colnames = ary.colNames();
     System.out.println("Intercept" + " = " + m._beta[ncols - 1]);
     for (int i = 0; i < xcols.length; ++i) {
       System.out.println(colnames[i] + " = " + m._beta[i]);
     }
   } catch (Throwable t) {
     Log.err(t);
   } finally { // we're done. shutdown the cloud
     Log.debug(Sys.GENLM, "==================<GLMRunner DONE>===================");
     UDPRebooted.suicide(UDPRebooted.T.shutdown, H2O.SELF);
   }
 }
Example #25
  /**
   * Map function for distributed parsing of the CSV files.
   *
   * <p>In the first phase it calculates the min, max, means, encodings and other statistics about
   * the dataset, and determines the number of columns.
   *
   * <p>The second pass then encodes the parsed dataset to the result key, splitting it into equal
   * sized chunks.
   */
  @Override
  public void map(Key key) {
    try {
      Key aryKey = null;
      boolean arraylet = key._kb[0] == Key.ARRAYLET_CHUNK;
      boolean skipFirstLine = _skipFirstLine;
      if (arraylet) {
        aryKey = ValueArray.getArrayKey(key);
        _chunkId = ValueArray.getChunkIndex(key);
        skipFirstLine = skipFirstLine || (ValueArray.getChunkIndex(key) != 0);
      }
      switch (_phase) {
        case ONE:
          assert (_ncolumns != 0);
          // initialize the column statistics
          phaseOneInitialize();
          // perform the parse
          CsvParser p = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
          p.parse(key);
          if (arraylet) {
            long idx = ValueArray.getChunkIndex(key);
            int idx2 = (int) idx;
            assert idx2 == idx;
            assert (_nrows[idx2] == 0)
                : idx
                    + ": "
                    + Arrays.toString(_nrows)
                    + " ("
                    + _nrows[idx2]
                    + " -- "
                    + _myrows
                    + ")";
            _nrows[idx2] = _myrows;
          }
          break;
        case TWO:
          assert (_ncolumns != 0);
          // initialize statistics - invalid rows, sigma and row size
          phaseTwoInitialize();
          // calculate the first row and the number of rows to parse
          int firstRow = 0;
          int lastRow = _myrows;
          _myrows = 0;
          if (arraylet) {
            long origChunkIdx = ValueArray.getChunkIndex(key);
            firstRow = (origChunkIdx == 0) ? 0 : _nrows[(int) origChunkIdx - 1];
            lastRow = _nrows[(int) origChunkIdx];
          }
          int rowsToParse = lastRow - firstRow;
          // create the output streams
          _outputStreams2 = createRecords(firstRow, rowsToParse);
          assert (_outputStreams2.length > 0);
          _ab = _outputStreams2[0].initialize();
          // perform the second parse pass
          CsvParser p2 = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
          p2.parse(key);
          // store the last stream if not stored during the parse
          if (_ab != null) _outputStreams2[_outputIdx].store();
          break;
        default:
          assert (false);
      }

      ParseStatus.update(_resultKey, DKV.get(key).length(), _phase);
    } catch (Exception e) {
      e.printStackTrace();
      _error = e.getMessage();
    }
  }
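
In the arraylet branch of phase TWO above, each chunk's row range comes straight from the cumulative _nrows array built in Example #6. A small standalone illustration (not part of the source) with the same hypothetical counts:

   int[] nrows = {100, 220, 300}; // cumulative per-chunk row counts after passOne (hypothetical)
   long origChunkIdx = 1;
   int firstRow = (origChunkIdx == 0) ? 0 : nrows[(int) origChunkIdx - 1]; // 100
   int lastRow = nrows[(int) origChunkIdx]; // 220
   int rowsToParse = lastRow - firstRow; // 120 rows to parse for chunk 1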