Example #1
  private static void addFolder2(
      FileSystem fs, Path p, ArrayList<String> keys, ArrayList<String> failed) {
    try {
      if (fs == null) return;

      Futures futures = new Futures();
      for (FileStatus file : fs.listStatus(p)) {
        Path pfs = file.getPath();
        if (file.isDir()) {
          addFolder2(fs, pfs, keys, failed);
        } else {
          if (pfs.getName().endsWith(Extensions.JSON)) {
            throw H2O.unimpl(); // JSON import not implemented
          } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
            throw H2O.unimpl(); // Hex import not implemented
          } else {
            // Create a file-backed Vec for this HDFS file and record its key
            Key k = HdfsFileVec.make(file, futures);
            keys.add(k.toString());
            Log.info("PersistHdfs: DKV.put(" + k + ")");
          }
        }
      }
    } catch (Exception e) {
      Log.err(e);
      failed.add(p.toString());
    }
  }
Example #2
 private static void ignoreAndWait(final Exception e, boolean printException) {
   H2O.ignore(e, "Hit HDFS reset problem, retrying...", printException);
   try {
     Thread.sleep(500);
   } catch (InterruptedException ie) {
     Thread.currentThread().interrupt(); // restore the interrupt flag instead of swallowing it
   }
 }
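
For context, a helper like this typically sits inside a retry loop around a flaky HDFS call. A minimal sketch of such a caller, assuming it lives in the same class as ignoreAndWait; the readFirstByteWithRetry wrapper and the single-byte read are hypothetical, not from the source:

  import java.io.IOException;
  import org.apache.hadoop.fs.FSDataInputStream;
  import org.apache.hadoop.fs.FileSystem;
  import org.apache.hadoop.fs.Path;

  // Hypothetical caller: retry an HDFS read until it succeeds, letting
  // ignoreAndWait log each failure and back off 500 ms between attempts.
  private static int readFirstByteWithRetry(FileSystem fs, Path p) {
    while (true) {
      try (FSDataInputStream in = fs.open(p)) {
        return in.read(); // first byte, or -1 on EOF
      } catch (IOException e) {
        ignoreAndWait(e, false); // log + sleep, then loop and retry
      }
    }
  }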
Example #3
 /**
  * On-the-fly version of varimp. After a new tree is generated, its tree votes are collected on
  * shuffled OOB rows and the variable importance is recomputed.
  *
  * <p>The <a
  * href="http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp">page</a> says:
  * <cite> "In every tree grown in the forest, put down the oob cases and count the number of votes
  * cast for the correct class. Now randomly permute the values of variable m in the oob cases and
  * put these cases down the tree. Subtract the number of votes for the correct class in the
  * variable-m-permuted oob data from the number of votes for the correct class in the untouched
  * oob data. The average of this number over all trees in the forest is the raw importance score
  * for variable m." </cite>
  */
 @Override
 protected VarImp doVarImpCalc(
     final DRFModel model, DTree[] ktrees, final int tid, final Frame fTrain, boolean scale) {
   // Check if we have already serialized 'ktrees'-trees in the model
   assert model.ntrees() - 1 == tid
       : "Cannot compute DRF varimp since 'ktrees' are not serialized in the model! tid=" + tid;
   assert _treeMeasuresOnOOB.npredictors() - 1 == tid
       : "Tree votes over OOB rows for this tree (var ktrees) were not found!";
   // Compute tree votes over shuffled data
   final CompressedTree[ /*nclass*/] theTree =
       model.ctree(tid); // get the last tree FIXME we should pass only keys
   final int nclasses = model.nclasses();
   Futures fs = new Futures();
   for (int var = 0; var < _ncols; var++) {
     final int variable = var;
     H2OCountedCompleter task4var =
         classification
             ? new H2OCountedCompleter() {
               @Override
               public void compute2() {
                 // Compute this tree votes over all data over given variable
                 TreeVotes cd =
                     TreeMeasuresCollector.collectVotes(
                         theTree, nclasses, fTrain, _ncols, sample_rate, variable);
                 assert cd.npredictors() == 1;
                 asVotes(_treeMeasuresOnSOOB[variable]).append(cd);
                 tryComplete();
               }
             }
             : /* regression */ new H2OCountedCompleter() {
               @Override
               public void compute2() {
                 // Compute this tree votes over all data over given variable
                 TreeSSE cd =
                     TreeMeasuresCollector.collectSSE(
                         theTree, nclasses, fTrain, _ncols, sample_rate, variable);
                 assert cd.npredictors() == 1;
                 asSSE(_treeMeasuresOnSOOB[variable]).append(cd);
                 tryComplete();
               }
             };
     H2O.submitTask(task4var); // Fork computation
     fs.add(task4var);
   }
   fs.blockForPending(); // Wait for results
   // Compute varimp for individual features (_ncols)
   final float[] varimp = new float[_ncols]; // output variable importance
   final float[] varimpSD = new float[_ncols]; // output variable importance sd
   for (int var = 0; var < _ncols; var++) {
     double[ /*2*/] imp =
         classification
             ? asVotes(_treeMeasuresOnSOOB[var]).imp(asVotes(_treeMeasuresOnOOB))
             : asSSE(_treeMeasuresOnSOOB[var]).imp(asSSE(_treeMeasuresOnOOB));
     varimp[var] = (float) imp[0];
     varimpSD[var] = (float) imp[1];
   }
   return new VarImp.VarImpMDA(varimp, varimpSD, model.ntrees());
 }
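
As a standalone illustration of the quoted formula (a hedged sketch, not H2O's implementation), the raw importance score and its standard deviation can be computed from per-tree vote counts. Here votesCorrectOOB and votesCorrectPermuted are hypothetical inputs holding, per tree, the correct-class votes on the untouched and the variable-m-permuted OOB rows:

  // Breiman's raw permutation importance for one variable m: the mean,
  // over all trees, of (correct votes on untouched OOB rows) minus
  // (correct votes after permuting variable m), plus its std. deviation.
  static float[] rawImportance(int[] votesCorrectOOB, int[] votesCorrectPermuted) {
    final int ntrees = votesCorrectOOB.length;
    double sum = 0, sumSq = 0;
    for (int t = 0; t < ntrees; t++) {
      double diff = votesCorrectOOB[t] - votesCorrectPermuted[t];
      sum += diff;
      sumSq += diff * diff;
    }
    double mean = sum / ntrees;
    double sd = ntrees > 1
        ? Math.sqrt(Math.max(0, sumSq - ntrees * mean * mean) / (ntrees - 1))
        : 0;
    return new float[] {(float) mean, (float) sd};
  }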
Example #4
  // --------------------------------------------------------------------------
  // Build an entire layer of all K trees
  protected DHistogram[][][] buildLayer(
      final Frame fr,
      final int nbins,
      int nbins_cats,
      final DTree[] ktrees,
      final int[] leafs,
      final DHistogram[][][] hcs,
      boolean subset,
      boolean build_tree_one_node) {
    // Build K trees, one per class.

    // Build up the next-generation tree splits from the current histograms.
    // Nearly all leaves will split one more level.  This loop nest is
    //           O( #active_splits * #bins * #ncols )
    // but is NOT over all the data.
    ScoreBuildOneTree[] sb1ts = new ScoreBuildOneTree[_nclass];
    Vec[] vecs = fr.vecs();
    for (int k = 0; k < _nclass; k++) {
      final DTree tree = ktrees[k]; // Tree for class K
      if (tree == null) continue;
      // Build a frame with just a single tree (& work & nid) columns, so the
      // nested MRTask ScoreBuildHistogram in ScoreBuildOneTree does not try
    // to close other trees' Vecs when run in parallel.
      Frame fr2 = new Frame(Arrays.copyOf(fr._names, _ncols + 1), Arrays.copyOf(vecs, _ncols + 1));
      fr2.add(fr._names[idx_tree(k)], vecs[idx_tree(k)]);
      fr2.add(fr._names[idx_work(k)], vecs[idx_work(k)]);
      fr2.add(fr._names[idx_nids(k)], vecs[idx_nids(k)]);
      if (idx_weight() >= 0) fr2.add(fr._names[idx_weight()], vecs[idx_weight()]);
      // Start building one of the K trees in parallel
      H2O.submitTask(
          sb1ts[k] =
              new ScoreBuildOneTree(
                  this,
                  k,
                  nbins,
                  nbins_cats,
                  tree,
                  leafs,
                  hcs,
                  fr2,
                  subset,
                  build_tree_one_node,
                  _improvPerVar,
                  _model._parms._distribution));
    }
    // Block for all K trees to complete.
    boolean did_split = false;
    for (int k = 0; k < _nclass; k++) {
      final DTree tree = ktrees[k]; // Tree for class K
      if (tree == null) continue;
      sb1ts[k].join();
      if (sb1ts[k]._did_split) did_split = true;
    }
    // The layer is done.
    return did_split ? hcs : null;
  }
Example #5
  // KMeans++ re-clustering
  private static double[][] recluster(
      double[][] points, Random rand, int N, Initialization init, String[][] isCats) {
    double[][] res = new double[N][];
    res[0] = points[0];
    int count = 1;
    ClusterDist cd = new ClusterDist();
    switch (init) {
      case Random:
        break;
      case PlusPlus:
        { // k-means++
          while (count < res.length) {
            double sum = 0;
            for (double[] point1 : points) sum += minSqr(res, point1, isCats, cd, count);

            for (double[] point : points) {
              if (minSqr(res, point, isCats, cd, count) >= rand.nextDouble() * sum) {
                res[count++] = point;
                break;
              }
            }
          }
          break;
        }
      case Furthest:
        { // Take the cluster center furthest from the already chosen ones
          while (count < res.length) {
            double max = 0;
            int index = 0;
            for (int i = 0; i < points.length; i++) {
              double sqr = minSqr(res, points[i], isCats, cd, count);
              if (sqr > max) {
                max = sqr;
                index = i;
              }
            }
            res[count++] = points[index];
          }
          break;
        }
      default:
        throw H2O.fail();
    }
    return res;
  }
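
The PlusPlus branch implements D^2 sampling: the next center is drawn with probability proportional to its squared distance from the nearest already-chosen center. A self-contained sketch of the same idea over 1-D points (minSqr, ClusterDist, and the categorical handling are dropped; an illustration, not the H2O code):

  import java.util.Random;

  // k-means++ / D^2 sampling over scalar points: pick the next center
  // with probability proportional to the squared distance from the
  // nearest already-chosen center.
  static double pickNextCenter(double[] points, double[] centers, int count, Random rand) {
    double[] d2 = new double[points.length];
    double sum = 0;
    for (int i = 0; i < points.length; i++) {
      double min = Double.MAX_VALUE;
      for (int c = 0; c < count; c++) {
        double d = points[i] - centers[c];
        min = Math.min(min, d * d);
      }
      d2[i] = min;
      sum += min;
    }
    double r = rand.nextDouble() * sum; // uniform draw over the total mass
    for (int i = 0; i < points.length; i++) {
      r -= d2[i];
      if (r <= 0) return points[i];
    }
    return points[points.length - 1]; // guard against rounding error
  }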
Example #6
    public static Job run(final Key dest, final KMeansModel model, final ValueArray ary) {
      final ChunkProgressJob job = new ChunkProgressJob(ary.chunks(), dest);
      new ValueArray(dest, 0).delete_and_lock(job.self());
      final H2OCountedCompleter fjtask =
          new H2OCountedCompleter() {
            @Override
            public void compute2() {
              KMeansApply kms = new KMeansApply();
              kms._job = job;
              kms._arykey = ary._key;
              kms._cols = model.columnMapping(ary.colNames());
              kms._clusters = model._clusters;
              kms._normalized = model._normalized;
              kms.invoke(ary._key);

              Column c = new Column();
              c._name = Constants.RESPONSE;
              c._size = ROW_SIZE;
              c._scale = 1;
              c._min = 0;
              c._max = model._clusters.length;
              c._mean = Double.NaN;
              c._sigma = Double.NaN;
              c._domain = null;
              c._n = ary.numRows();
              ValueArray res = new ValueArray(dest, ary.numRows(), c._size, new Column[] {c});
              res.unlock(job.self());
              job.remove();
              tryComplete();
            }

            @Override
            public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
              job.onException(ex);
              return super.onExceptionalCompletion(ex, caller);
            }
          };
      job.start(fjtask);
      H2O.submitTask(fjtask);
      return job;
    }
Example #7
 private void cancel(final String msg, JobState resultingState) {
   if (resultingState == JobState.CANCELLED) {
     Log.info("Job " + self() + "(" + description + ") was cancelled.");
   } else {
     Log.err("Job " + self() + "(" + description + ") failed.");
     Log.err(msg);
   }
   exception = msg;
   state = resultingState;
   // Replace the finished job with a job handle
   replaceByJobHandle();
   DKV.write_barrier();
   final Job job = this;
   H2O.submitTask(
       new H2OCountedCompleter() {
         @Override
         public void compute2() {
           job.onCancelled();
         }
       });
 }
Example #8
 /**
  * Forks computation of this job.
  *
  * <p>The call does not block.
  *
  * @return always returns this job.
  */
 public Job fork() {
   init();
   H2OCountedCompleter task =
       new H2OCountedCompleter() {
         @Override
         public void compute2() {
           try {
             try {
               // Exec always waits till the end of computation
               Job.this.exec();
               Job.this.remove();
             } catch (Throwable t) {
               if (!(t instanceof ExpectedExceptionForDebug)) Log.err(t);
               Job.this.cancel(t);
             }
           } finally {
             tryComplete();
           }
         }
       };
   start(task);
   H2O.submitTask(task);
   return this;
 }
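
Examples 3, 6, 7, and 8 all use the same fork/join pattern: subclass H2OCountedCompleter, do the work in compute2(), call tryComplete(), hand the task to H2O.submitTask(), and optionally block on a Futures. A minimal sketch of that pattern; the runOnH2O wrapper is hypothetical, while the Futures usage mirrors Example 3:

  // Fork a unit of work onto H2O's fork/join pool and block until it
  // finishes, mirroring the submitTask/Futures usage in Example 3.
  static void runOnH2O(final Runnable work) {
    water.Futures fs = new water.Futures();
    water.H2O.H2OCountedCompleter task =
        new water.H2O.H2OCountedCompleter() {
          @Override
          public void compute2() {
            work.run();    // the actual computation
            tryComplete(); // signal completion up the completer tree
          }
        };
    water.H2O.submitTask(task); // fork: returns immediately
    fs.add(task);
    fs.blockForPending(); // join: wait for the task to complete
  }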
Example #9
 /** Single row scoring, on a compatible ValueArray (when pushed through the mapping) */
 @Override
 protected double score0(ValueArray data, int row) {
   throw H2O.unimpl();
 }
Example #10
 /** Bulk scoring API, on a compatible ValueArray (when pushed through the mapping) */
 @Override
 protected double score0(ValueArray data, AutoBuffer ab, int row_in_chunk) {
   throw H2O.unimpl();
 }
Example #11
 @Override
 protected double[] score0(double[] data, double[] preds) {
   throw H2O.unimpl();
 }
Example #12
 protected Response serve_debug() {
   throw H2O.unimpl();
 }
Example #13
 public ModelBuilderSchema schema() {
   throw H2O.unimpl(); // H2O.unimpl() only builds the exception; it must be thrown
   //  return new CoxPHV2();
 }
Example #14
 /**
  * Cross-validate this Job (to be overridden by each concrete Job; the override should also call
  * genericCrossValidation).
  *
  * @param splits Frames containing the train/test splits
  * @param cv_preds Stores the predictions of each cross-validation run
  * @param offsets Array storing the starting row offsets of each cross-validation run
  * @param i Which fold of cross-validation to perform
  */
 public void crossValidate(Frame[] splits, Frame[] cv_preds, long[] offsets, int i) {
   throw H2O.unimpl();
 }