Example 1
 @Override
 protected void execImpl() {
   Vec va = null, vp = null, avp = null;
   try {
     if (classification) {
        // Create new vectors - cheap, since they are only adaptation vectors
       va = vactual.toEnum(); // always returns TransfVec
       actual_domain = va._domain;
       vp = vpredict.toEnum(); // always returns TransfVec
       predicted_domain = vp._domain;
       if (!Arrays.equals(actual_domain, predicted_domain)) {
         domain = Utils.domainUnion(actual_domain, predicted_domain);
         int[][] vamap = Model.getDomainMapping(domain, actual_domain, true);
         va = TransfVec.compose((TransfVec) va, vamap, domain, false); // delete original va
         int[][] vpmap = Model.getDomainMapping(domain, predicted_domain, true);
         vp = TransfVec.compose((TransfVec) vp, vpmap, domain, false); // delete original vp
       } else domain = actual_domain;
        // The vectors are from different groups => align them, and properly delete
        // the alignment vector after computation
       if (!va.group().equals(vp.group())) {
         avp = vp;
         vp = va.align(vp);
       }
       cm = new CM(domain.length).doAll(va, vp)._cm;
     } else {
       mse = new CM(1).doAll(vactual, vpredict).mse();
     }
     return;
   } finally { // Delete adaptation vectors
     if (va != null) UKV.remove(va._key);
     if (vp != null) UKV.remove(vp._key);
     if (avp != null) UKV.remove(avp._key);
   }
 }
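A short sketch of the domain-union step used above, taken out of the job wrapper. The factor levels are hypothetical, and only Utils.domainUnion and Model.getDomainMapping from this listing are assumed.

 // Hedged sketch; 'cat'/'dog'/'mouse' are made-up factor levels.
 String[] actual    = {"cat", "dog"};
 String[] predicted = {"dog", "mouse"};
 String[] union = Utils.domainUnion(actual, predicted);      // combined levels
 int[][] amap = Model.getDomainMapping(union, actual, true); // actual -> union mapping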
Example 2
  // Adapt a trained model to a test dataset with different enums
  /*@Test*/ public void testModelAdapt() {
    File file1 = TestUtil.find_test_file("./smalldata/kaggle/KDDTrain.arff.gz");
    Key fkey1 = NFSFileVec.make(file1);
    Key dest1 = Key.make("KDDTrain.hex");
    File file2 = TestUtil.find_test_file("./smalldata/kaggle/KDDTest.arff.gz");
    Key fkey2 = NFSFileVec.make(file2);
    Key dest2 = Key.make("KDDTest.hex");
    GBM gbm = null;
    Frame fr = null;
    try {
      gbm = new GBM();
      gbm.source = fr = ParseDataset2.parse(dest1, new Key[] {fkey1});
      UKV.remove(fkey1);
      gbm.response = gbm.source.remove(41); // Response is col 41
      gbm.ntrees = 2;
      gbm.max_depth = 8;
      gbm.learn_rate = 0.2f;
      gbm.min_rows = 10;
      gbm.nbins = 50;
      gbm.invoke();

      // The test data set has a few more enums than the training set
      Frame ftest = ParseDataset2.parse(dest2, new Key[] {fkey2});
      UKV.remove(fkey2);
      Frame preds = gbm.score(ftest);
      ftest.remove(); // Delete the test frame's vecs once scored
      preds.remove(); // Delete the prediction vecs

    } finally {
      UKV.remove(dest1); // Remove original hex frame key
      UKV.remove(dest2); // Remove test hex frame key
      if (gbm != null) {
        UKV.remove(gbm.dest()); // Remove the model
        UKV.remove(gbm.response._key);
        gbm.remove(); // Remove GBM Job
        if (fr != null) fr.remove();
      }
    }
  }
Example 3
 /** Returns a list of all jobs in the system.
  * @return list of all jobs, including running, done, cancelled, and crashed jobs.
  */
 public static Job[] all() {
   List list = UKV.get(LIST);
   Job[] jobs = new Job[list==null?0:list._jobs.length];
   int j=0;
   for( int i=0; i<jobs.length; i++ ) {
     Job job = UKV.get(list._jobs[i]);
     if( job != null ) jobs[j++] = job;
   }
   if( j<jobs.length ) jobs = Arrays.copyOf(jobs,j);
   return jobs;
 }
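A minimal usage sketch for all(), assuming only what these listings show (Job.all() and Job.dest()): take a snapshot of the registry and print each job's destination key.

 // Hedged usage sketch: print the destination key of every registered job.
 for (Job job : Job.all()) {
   System.out.println("job -> " + job.dest());
 }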
Example 4
 @Test
 public void testFullVectAssignment() {
   Key k = loadAndParseKey("cars.hex", "smalldata/cars.csv");
   Key k2 = executeExpression("cars.hex");
   testDataFrameStructure(k2, 406, 8);
   UKV.remove(k2);
   k2 = executeExpression("a5 = cars.hex[2]");
   testVectorExpression("a5", 8, 8, 8, 4, 6, 6);
   UKV.remove(k2);
   UKV.remove(k);
   UKV.remove(Key.make("a5"));
 }
Example 5
  // Test kaggle/creditsample-test data
  @org.junit.Test
  public void kaggle_credit() {
    Key okey = loadAndParseFile("credit.hex", "smalldata/kaggle/creditsample-training.csv.gz");
    UKV.remove(Key.make("smalldata/kaggle/creditsample-training.csv.gz_UNZIPPED"));
    UKV.remove(Key.make("smalldata\\kaggle\\creditsample-training.csv.gz_UNZIPPED"));
    ValueArray val = DKV.get(okey).get();

    // Check parsed dataset
    final int n = new int[] {4, 2, 1}[ValueArray.LOG_CHK - 20]; // expected chunk count depends on the build's 2^LOG_CHK chunk size
    assertEquals("Number of chunks", n, val.chunks());
    assertEquals("Number of rows", 150000, val.numRows());
    assertEquals("Number of cols", 12, val.numCols());

    // setup default values for DRF
    int ntrees = 3;
    int depth = 30;
    int seed = 42;
    StatType statType = StatType.GINI;
    final int cols[] =
        new int[] {0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 1}; // ignore column 6, classify column 1

    // Start the distributed Random Forest
    final Key modelKey = Key.make("model");
    DRFJob result =
        hex.rf.DRF.execute(
            modelKey,
            cols,
            val,
            ntrees,
            depth,
            1024,
            statType,
            seed,
            true,
            null,
            -1,
            Sampling.Strategy.RANDOM,
            1.0f,
            null,
            0,
            0,
            false);
    // Wait for completion on all nodes
    RFModel model = result.get();

    assertEquals("Number of classes", 2, model.classes());
    assertEquals("Number of trees", ntrees, model.size());

    model.deleteKeys();
    UKV.remove(modelKey);
    UKV.remove(okey);
  }
Example 6
 @Test
 public void testColumnSelectors() {
   Key k = loadAndParseKey("cars.hex", "smalldata/cars.csv");
   Key k2 = executeExpression("cars.hex[2]");
   testDataFrameStructure(k2, 406, 1);
   testKeyValues(k2, 8, 8, 8, 4, 6, 6);
   UKV.remove(k2);
   k2 = executeExpression("cars.hex$year");
   testDataFrameStructure(k2, 406, 1);
   testKeyValues(k2, 73, 70, 72, 76, 78, 81);
   UKV.remove(k2);
   UKV.remove(k);
 }
Example 7
  /*@org.junit.Test*/ public void covtype() {
    // Key okey = loadAndParseFile("covtype.hex", "smalldata/covtype/covtype.20k.data");
    // Key okey = loadAndParseFile("covtype.hex", "../datasets/UCI/UCI-large/covtype/covtype.data");
    // Key okey = loadAndParseFile("covtype.hex", "/home/0xdiag/datasets/standard/covtype.data");
    Key okey = loadAndParseFile("mnist.hex", "smalldata/mnist/mnist8m.10k.csv.gz");
    // Key okey = loadAndParseFile("mnist.hex", "/home/0xdiag/datasets/mnist/mnist8m.csv");
    ValueArray val = UKV.get(okey);

    // setup default values for DRF
    int ntrees = 8;
    int depth = 999;
    int seed = 42;
    StatType statType = StatType.ENTROPY;
    final int cols[] = new int[val.numCols()];
    for (int i = 0; i < cols.length - 1; i++) cols[i] = i + 1;
    cols[cols.length - 1] = 0; // Class is in column 0 for mnist, so it goes last

    // Start the distributed Random Forest
    final Key modelKey = Key.make("model");
    DRFJob result =
        hex.rf.DRF.execute(
            modelKey,
            cols,
            val,
            ntrees,
            depth,
            1024,
            statType,
            seed,
            true,
            null,
            -1,
            Sampling.Strategy.RANDOM,
            1.0f,
            null,
            0,
            0,
            false);
    // Wait for completion on all nodes
    RFModel model = result.get();

    assertEquals("Number of classes", 10, model.classes());
    assertEquals("Number of trees", ntrees, model.size());

    model.deleteKeys();
    UKV.remove(modelKey);
    UKV.remove(okey);
  }
Example 8
 /**
  * Creates the value header based on the calculated columns.
  *
  * <p>Also stores the header to its appropriate key. This will be the VA header of the parsed
  * dataset.
  */
 private void createValueArrayHeader() {
   assert (_phase == Pass.TWO);
   Column[] cols = new Column[_ncolumns];
   int off = 0;
   for (int i = 0; i < cols.length; ++i) {
     cols[i] = new Column();
     cols[i]._n = _numRows - _invalidValues[i];
     cols[i]._base = _bases[i];
      assert (char) pow10i(-_scale[i]) == pow10i(-_scale[i])
          : "scale out of bounds! col = " + i + ", scale = " + _scale[i];
     cols[i]._scale = (char) pow10i(-_scale[i]);
     cols[i]._off = (char) off;
     cols[i]._size = (byte) COL_SIZES[_colTypes[i]];
     cols[i]._domain = _colDomains[i];
     cols[i]._max = _max[i];
     cols[i]._min = _min[i];
     cols[i]._mean = _mean[i];
     cols[i]._sigma = _sigma[i];
     cols[i]._name = _colNames[i];
     off += Math.abs(cols[i]._size);
   }
   // let any pending progress reports finish
   DKV.write_barrier();
   // finally make the value array header
   ValueArray ary = new ValueArray(_resultKey, _numRows, off, cols);
   UKV.put(_resultKey, ary.value());
 }
Example 9
 @Test
 public void testDifferentSizeOps() {
   Key cars = loadAndParseKey("cars.hex", "smalldata/cars.csv");
   Key poker = loadAndParseKey("p.hex", "smalldata/poker/poker-hand-testing.data");
   testVectorExpression("cars.hex$year + p.hex[1]", 74, 82, 81, 84, 86, 81);
   testVectorExpression("cars.hex$year - p.hex[1]", 72, 58, 63, 62, 64, 71);
   testVectorExpression("cars.hex$year * p.hex[1]", 73, 840, 648, 803, 825, 380);
   // testVectorExpression("cars.hex$year / p.hex[1]", 73, 70/12, 8, 76/11, 78/11, 15.2); // hard
   // to get the numbers right + not needed no new coverage
   testVectorExpression("p.hex[1] + cars.hex$year", 74, 82, 81, 84, 86, 81);
   testVectorExpression("p.hex[1] - cars.hex$year", -72, -58, -63, -62, -64, -71);
   testVectorExpression("p.hex[1] * cars.hex$year", 73, 840, 648, 803, 825, 380);
   // testVectorExpression("p.hex[1] / cars.hex$year", 1/73, 12/70, 0.125, 11/76, 11/78, 5/81);
   UKV.remove(poker);
   UKV.remove(cars);
 }
Example 10
 // Write-lock & delete 'k'.  Will fail if 'k' is locked by anybody other than 'job_key'
 public static void delete(Key k, Key job_key) {
   if (k == null) return;
   Value val = DKV.get(k);
   if (val == null) return; // Or just nothing there to delete
   if (!val.isLockable()) UKV.remove(k); // Simple things being deleted
   else ((Lockable) val.get()).delete(job_key, 0.0f); // Lockable being deleted
 }
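A usage sketch, assuming the static delete above lives on Lockable (as the cast in its body suggests); both key names are hypothetical.

 // Hedged usage sketch; the key names are made up for illustration.
 Key dataKey = Key.make("train.hex");
 Key jobKey = Key.make("myJob");
 Lockable.delete(dataKey, jobKey); // quietly returns if nothing is stored under dataKey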
Example 11
  public void set(Argument arg, String input, Object value) {
    if (arg._field.getType() != Key.class && value instanceof Key) value = UKV.get((Key) value);

    try {
      if (arg._field.getType() == Key.class && value instanceof ValueArray)
        value = ((ValueArray) value)._key;
      //
      else if (arg._field.getType() == int.class && value instanceof Long)
        value = ((Long) value).intValue();
      //
      else if (arg._field.getType() == float.class && value instanceof Double)
        value = ((Double) value).floatValue();
      //
      else if (arg._field.getType() == Frame.class && value instanceof ValueArray)
        value = ((ValueArray) value).asFrame(input);
      //
      else if (value instanceof NumberSequence) {
        double[] ds = ((NumberSequence) value)._arr;
        if (arg._field.getType() == int[].class) {
          int[] is = new int[ds.length];
          for (int i = 0; i < is.length; i++) is[i] = (int) ds[i];
          value = is;
        } else value = ds;
      }
      arg._field.set(this, value);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
Example 12
 /** Actually remove/delete all Vecs from memory, not just from the Frame. */
 public void remove(Futures fs) {
   if (vecs().length > 0) {
     for (Vec v : _vecs) UKV.remove(v._key, fs);
   }
   _names = new String[0];
   _vecs = new Vec[0];
   _keys = new Key[0];
 }
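A usage sketch for the bulk delete above; it assumes H2O's Futures exposes blockForPending() for waiting on the queued removals.

 // Hedged usage sketch: delete a Frame's Vecs, then wait for the deletes to land.
 Futures fs = new Futures();
 fr.remove(fs); // 'fr' is any Frame, as in the GBM tests
 fs.blockForPending(); // assumption: blocks until all pending removals finish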
Example 13
 protected void testScalarExpression(String expr, double result) {
   Key key = executeExpression(expr);
   ValueArray va = ValueArray.value(key);
    assertEquals(1, va.numRows());
    assertEquals(1, va.numCols());
   assertEquals(result, va.datad(0, 0), 0.0);
   UKV.remove(key);
 }
Example 14
 public static ValueArray loadAndParseKey(Key okey, String path) {
   FileIntegrityChecker c = FileIntegrityChecker.check(new File(path),false);
   Key k = c.syncDirectory(null,null,null,null);
   ParseDataset.forkParseDataset(okey, new Key[] { k }, null).get();
   UKV.remove(k);
   ValueArray res = DKV.get(okey).get();
   return res;
 }
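A usage sketch for the helper above; the destination key and path mirror the other tests in this listing.

 // Hedged usage sketch of loadAndParseKey as defined above.
 Key okey = Key.make("cars.hex");
 ValueArray va = loadAndParseKey(okey, "smalldata/cars.csv");
 System.out.println("rows=" + va.numRows() + ", cols=" + va.numCols());
 UKV.remove(okey); // clean up the parsed result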
Example 15
 @Override
 public boolean toHTML(StringBuilder sb) {
   Job jjob = Job.findJob(job_key);
   DRFModel m = UKV.get(jjob.dest());
   if (m != null) m.generateHTML("DRF Model", sb);
   else DocGen.HTML.paragraph(sb, "Pending...");
   return true;
 }
Example 16
  public static String store2Hdfs(Key srcKey) {
    assert srcKey._kb[0] != Key.ARRAYLET_CHUNK;
    assert PersistHdfs.getPathForKey(srcKey) != null; // Validate key name
    Value v = DKV.get(srcKey);
    if (v == null) return "Key " + srcKey + " not found";
    if (v._isArray == 0) { // Simple chunk?
      v.setHdfs(); // Set to HDFS and be done
      return null; // Success
    }

    // For ValueArrays, make the .hex header
    ValueArray ary = ValueArray.value(v);
    String err = PersistHdfs.freeze(srcKey, ary);
    if (err != null) return err;

    // The task managing which chunks to write next,
    // store in a known key
    TaskStore2HDFS ts = new TaskStore2HDFS(srcKey);
    Key selfKey = ts.selfKey();
    UKV.put(selfKey, ts);

    // Then start writing chunks in-order, beginning with the zero chunk,
    // on the node where that chunk lives
    RPC.call(ts.chunkHome(), ts);

    // Watch the progress key until it gets removed or an error appears
    long idx = 0;
    while (UKV.get(selfKey, ts) != null) {
      if (ts._indexFrom != idx) {
        System.out.print(" " + idx + "/" + ary.chunks());
        idx = ts._indexFrom;
      }
      if (ts._err != null) { // Found an error?
        UKV.remove(selfKey); // Cleanup & report
        return ts._err;
      }
      try {
        Thread.sleep(100);
      } catch (InterruptedException e) {
        // Ignore interrupts and keep polling the progress key
      }
    }
    System.out.println(" " + ary.chunks() + "/" + ary.chunks());

    // PersistHdfs.refreshHDFSKeys();
    return null;
  }
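A usage sketch for the exporter above; 'okey' stands for a parsed dataset key, as produced by the parse calls in the tests in this listing.

 // Hedged usage sketch; okey names a parsed dataset, e.g. from loadAndParseFile.
 String err = store2Hdfs(okey);
 if (err == null) System.out.println("Stored " + okey + " to HDFS");
 else System.err.println("HDFS store failed: " + err);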
Example 17
 private DRFModel runDRF(Frame data, PrepData dprep) {
   DRF drf = new DRF();
   drf.source = data;
   drf.response = dprep.prep(data);
   drf.ntrees = 1;
   drf.invoke();
   return UKV.get(drf.dest());
 }
Example 18
 @Override
 protected Response serve() {
   Response response = super.serve();
   if (destination_key != null) {
     GridSearch grid = UKV.get(destination_key);
     if (grid != null) jobs = grid.jobs;
   }
   return response;
 }
Example 19
 @Test
 public void testLargeDataOps() {
   Key poker = loadAndParseKey("p.hex", "smalldata/poker/poker-hand-testing.data");
   testVectorExpression("p.hex[1] + p.hex[2]", 2, 15, 13, 15, 12, 7);
   testVectorExpression("p.hex[1] - p.hex[2]", 0, 9, 5, 7, 10, 3);
   testVectorExpression("p.hex[1] * p.hex[2]", 1, 36, 36, 44, 11, 10);
   testVectorExpression("p.hex[1] / p.hex[2]", 1.0, 4.0, 2.25, 2.75, 11.0, 2.5);
   UKV.remove(poker);
 }
Example 20
 @Test
 public void testVectorOperators() {
   Key k = loadAndParseKey("cars.hex", "smalldata/cars.csv");
   testVectorExpression("cars.hex[2] + cars.hex$year", 81, 78, 80, 80, 84, 87);
   testVectorExpression("cars.hex[2] - cars.hex$year", -65, -62, -64, -72, -72, -75);
   testVectorExpression("cars.hex[2] * cars.hex$year", 584, 560, 576, 304, 468, 486);
   testVectorExpression("cars.hex$year / cars.hex[2]", 9.125, 8.75, 9.0, 19.0, 13.0, 13.5);
   UKV.remove(k);
 }
Example 21
 static final int findResponseIdx(RFModel model) {
   String nresponse = model.responseName();
   ValueArray ary = UKV.get(model._dataKey);
   int idx = 0;
    for (ValueArray.Column col : ary._cols)
      if (nresponse.equals(col._name)) return idx;
      else idx++;
   return -1;
 }
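A usage sketch, assuming an RFModel such as the one trained in Example 5.

 // Hedged usage sketch; 'model' is an RFModel, e.g. from Example 5.
 int ridx = findResponseIdx(model);
 if (ridx < 0) System.out.println("response " + model.responseName() + " not found");
 else System.out.println("response column index: " + ridx);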
Example 22
 /** Actually remove/delete all Vecs from memory, not just from the Frame. */
 public void remove(Futures fs) {
   if (_vecs.length > 0) {
     VectorGroup vg = _vecs[0].group();
     for (Vec v : _vecs) UKV.remove(v._key, fs);
     DKV.remove(vg._key);
   }
   _names = new String[0];
   _vecs = new Vec[0];
 }
Example 23
  // ==========================================================================
  public void basicGBM(String fname, String hexname, PrepData prep) {
    File file = TestUtil.find_test_file(fname);
    if (file == null) return; // Silently abort test if the file is missing
    Key fkey = NFSFileVec.make(file);
    Key dest = Key.make(hexname);
    GBM gbm = null;
    Frame fr = null;
    try {
      gbm = new GBM();
      gbm.source = fr = ParseDataset2.parse(dest, new Key[] {fkey});
      UKV.remove(fkey);
      int idx = prep.prep(fr);
      if (idx < 0) {
        gbm.classification = false;
        idx = ~idx;
      }
      String rname = fr._names[idx];
      gbm.response = fr.vecs()[idx];
      fr.remove(idx); // Move response to the end
      fr.add(rname, gbm.response);
      gbm.ntrees = 4;
      gbm.max_depth = 4;
      gbm.min_rows = 1;
      gbm.nbins = 50;
      gbm.cols = new int[fr.numCols()];
      for (int i = 0; i < gbm.cols.length; i++) gbm.cols[i] = i;
      gbm.learn_rate = .2f;
      gbm.invoke();

      fr = gbm.score(gbm.source);

      GBM.GBMModel gbmmodel = UKV.get(gbm.dest());
      // System.out.println(gbmmodel.toJava());

    } finally {
      UKV.remove(dest); // Remove original hex frame key
      if (gbm != null) {
        UKV.remove(gbm.dest()); // Remove the model
        UKV.remove(gbm.response._key);
        gbm.remove(); // Remove GBM Job
        if (fr != null) fr.remove();
      }
    }
  }
Example 24
 public void onException(Throwable ex) {
   UKV.remove(dest());
   Value v = DKV.get(progressKey());
   if( v != null ) {
     ChunkProgress p = v.get();
     p = p.error(ex.getMessage());
     DKV.put(progressKey(), p);
   }
   cancel(ex);
 }
Example 25
File: Job.java Project: pwaila/h2o
 // Block until the Job finishes.
 // NOT F/J FRIENDLY, EATS THE THREAD until job completes.  Only use for web threads.
 public <T> T get() {
   // TODO through notifications?
   while (DKV.get(_self) != null) {
     try {
       Thread.sleep(10);
     } catch (InterruptedException e) {
       throw new RuntimeException(e);
     }
   }
   return (T) UKV.get(_dest);
 }
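A usage sketch of the blocking wait above, assuming a DRF configured as in Example 17 and started asynchronously (the start call is elided, since it is not shown in these listings).

 // Hedged usage sketch: block the calling (web) thread until the job completes.
 DRF drf = new DRF();
 // ... configure source/response/ntrees as in Example 17 and start the job ...
 DRFModel model = drf.get(); // spins until the job's self key leaves the store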
Example 26
 public Key importFile(int i, Futures fs) {
   if (_ok[i] < H2O.CLOUD.size()) return null;
   File f = new File(_files[i]);
   Key k;
   if (_newApi) {
     k = PersistNFS.decodeFile(f);
     NFSFileVec nfs = DKV.get(NFSFileVec.make(f, fs)).get();
     UKV.put(k, new Frame(new String[] {"0"}, new Vec[] {nfs}), fs);
   } else {
     k = PersistNFS.decodeFile(f);
     long size = f.length();
     Value val =
         (size < 2 * ValueArray.CHUNK_SZ)
             ? new Value(k, (int) size, Value.NFS)
             : new Value(k, new ValueArray(k, size), Value.NFS);
     val.setdsk();
     UKV.put(k, val, fs);
   }
   return k;
 }
Example 27
  @Override
  public void compute() {
    ValueArray ary = ValueArray.value(_arykey);
    Key self = selfKey();

    while (_indexFrom < ary.chunks()) {
      Key ckey = ary.getChunkKey(_indexFrom++);
      if (!ckey.home()) { // Next chunk not At Home?
        RPC.call(chunkHome(), this); // Hand the baton off to the next node/chunk
        return;
      }
      Value val = DKV.get(ckey); // It IS home, so get the data
      _err = PersistHdfs.appendChunk(_arykey, val);
      if (_err != null) return;
      UKV.put(self, this); // Update the progress/self key
    }
    // We did the last chunk.  Removing the selfKey is the signal to the web
    // thread that All Done.
    UKV.remove(self);
  }
Example 28
 protected String[] getVectorDomain(final Vec v) {
   assert v==null || v.isInt() || v.isEnum() : "Cannot get vector domain!";
   if (v==null) return null;
   String[] r = null;
   if (v.isEnum()) {
     r = v.domain();
   } else {
     Vec tmp = v.toEnum();
     r = tmp.domain();
     UKV.remove(tmp._key);
   }
   return r;
 }
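A usage sketch, assuming a Frame 'fr' whose first column is an integer or enum Vec (per the assert above); Arrays is java.util.Arrays.

 // Hedged usage sketch; the column must satisfy the assert above.
 String[] domain = getVectorDomain(fr.vecs()[0]);
 System.out.println(domain == null ? "no domain" : Arrays.toString(domain));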
Example 29
 @Test
 public void testBigLargeExpression() {
   Key poker = loadAndParseKey("p.hex", "smalldata/poker/poker-hand-testing.data");
   testVectorExpression(
       "p.hex[1] / p.hex[2] + p.hex[3] * p.hex[1] - p.hex[5] + (2* p.hex[1] - (p.hex[2]+3))",
       8,
       35,
       63.25,
       85.75,
       116.0,
       43.5);
   UKV.remove(poker);
 }
Example 30
  @Override
  protected void execImpl() {
    Vec va = null, vp = null;
    try {
      va = vactual.toEnum(); // always returns TransfVec
      vp = vpredict;
      // The vectors are from different groups => align them, and delete the
      // aligned copy after computation
      if (!va.group().equals(vp.group())) {
        vp = va.align(vp);
      }
      // Validate user-given thresholds, or compute data-driven ones
      if (thresholds != null) {
        sort(thresholds);
        if (Utils.minValue(thresholds) < 0)
          throw new IllegalArgumentException("Minimum threshold cannot be negative.");
        if (Utils.maxValue(thresholds) > 1)
          throw new IllegalArgumentException("Maximum threshold cannot be greater than 1.");
      } else {
        HashSet<Float> hs = new HashSet<Float>();
        final int bins = (int) Math.min(vpredict.length(), 200L);
        final long stride = Math.max(vpredict.length() / bins, 1);
        // Data-driven thresholds; TODO: use percentiles (from Summary2?)
        for (int i = 0; i < bins; ++i)
          hs.add(new Float(vpredict.at(i * stride)));
        for (int i = 0; i < 51; ++i)
          hs.add(new Float(i / 50.)); // always add 0.02-spaced thresholds from 0 to 1

        // Create a sorted array of unique thresholds
        thresholds = new float[hs.size()];
        int i = 0;
        for (Float h : hs) thresholds[i++] = h;
        sort(thresholds);
      }
      // compute CMs
      aucdata =
          new AUCData()
              .compute(
                  new AUCTask(thresholds, va.mean()).doAll(va, vp).getCMs(),
                  thresholds,
                  va._domain,
                  threshold_criterion);
    } finally { // Delete adaptation vectors
      if (va != null) UKV.remove(va._key);
      if (vp != null && vp != vpredict) UKV.remove(vp._key); // remove the aligned copy only
    }
  }