Example #1
0
  /**
   * Builds a confusion matrix from v1 and v2 and checks it against the expected one. Deletes v1
   * and v2 afterwards, since processing (enum conversion and/or train/test adaptation) may have
   * modified them.
   */
  private void simpleCMTest(
      Frame v1,
      Frame v2,
      String[] actualDomain,
      String[] predictedDomain,
      String[] expectedDomain,
      double[][] expectedCM,
      boolean debug,
      boolean toEnum) {
    Scope.enter();
    try {
      ConfusionMatrix cm = buildCM(v1.vecs()[0].toEnum(), v2.vecs()[0].toEnum());

      // -- DEBUG --
      if (debug) {
        System.err.println("actual            : " + Arrays.toString(actualDomain));
        System.err.println("predicted         : " + Arrays.toString(predictedDomain));
        System.err.println("CM domain         : " + Arrays.toString(cm._domain));
        System.err.println("expected CM domain: " + Arrays.toString(expectedDomain) + "\n");
        for (int i = 0; i < cm._cm.length; i++) System.err.println(Arrays.toString(cm._cm[i]));
        System.err.println("");
        System.err.println(cm.toASCII());
      }
      // -- -- --
      assertCMEqual(expectedDomain, expectedCM, cm);
    } finally {
      if (v1 != null) v1.delete();
      if (v2 != null) v2.delete();
      Scope.exit();
    }
  }
Example #2
0
  DataAdapter(
      Frame fr,
      SpeeDRFModel model,
      int[] modelDataMap,
      int rows,
      long unique,
      long seed,
      int binLimit,
      double[] classWt) {
    //    assert model._dataKey == fr._key;
    _seed = seed + (unique << 16); // This is important to preserve sampling selection!!!
    /* binLimit: maximum arity for a column (not a hard limit) */
    _numRows = rows;
    _numClasses = model.regression ? 1 : model.classes();
    _regression = model.regression;

    _c = new Col[model.fr.numCols()];
    for (int i = 0; i < _c.length; i++) {
      assert fr._names[modelDataMap[i]].equals(model.fr._names[i]);
      Vec v = fr.vecs()[i];
      if (isByteCol(v, rows, i == _c.length - 1, _regression)) // we do not bin for small values
        _c[i] = new Col(fr._names[i], rows, i == _c.length - 1);
      else
        _c[i] =
            new Col(fr._names[i], rows, i == _c.length - 1, binLimit, !(v.isEnum() || v.isInt()));
    }
    boolean trivial = true;
    if (classWt != null) for (double f : classWt) if (f != 1.0) trivial = false;
    _classWt = trivial ? null : classWt;
  }
Example #3
0
 @Override
 public void compute2() {
   _in.read_lock(_jobKey);
   // Simply create a bogus new vector (without even putting it into the KV store) with the
   // appropriate number of rows per chunk, then use it as a source for multiple makeZero calls
   // to create empty vecs, and finally run a RebalanceTask on each of them.
   // RebalanceTask fetches the appropriate src chunks and copies the data from them.
   int rpc = (int) (_in.numRows() / _nchunks);
   int rem = (int) (_in.numRows() % _nchunks);
   long[] espc = new long[_nchunks + 1];
   Arrays.fill(espc, rpc);
   for (int i = 0; i < rem; ++i) ++espc[i];
   long sum = 0;
   for (int i = 0; i < espc.length; ++i) {
     long s = espc[i];
     espc[i] = sum;
     sum += s;
   }
   assert espc[espc.length - 1] == _in.numRows()
       : "unexpected number of rows, expected " + _in.numRows() + ", got " + espc[espc.length - 1];
   final Vec[] srcVecs = _in.vecs();
   _out =
       new Frame(
           _okey,
           _in.names(),
           new Vec(Vec.newKey(), espc).makeZeros(srcVecs.length, _in.domains()));
   _out.delete_and_lock(_jobKey);
   new RebalanceTask(this, srcVecs).asyncExec(_out);
 }
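The espc ("element start per chunk") bookkeeping above, where per-chunk row counts are turned into cumulative offsets, is the part most worth spelling out. Below is a small self-contained sketch of the same prefix-sum construction on plain arrays; it is a hedged illustration with hypothetical values, independent of the H2O classes used above.

  // Sketch: build an espc array from a row count and a target chunk count, mirroring compute2().
  static long[] buildEspc(long numRows, int nchunks) {
    long rpc = numRows / nchunks;          // base rows per chunk
    int rem = (int) (numRows % nchunks);   // the first 'rem' chunks take one extra row
    long[] espc = new long[nchunks + 1];
    java.util.Arrays.fill(espc, rpc);
    for (int i = 0; i < rem; ++i) ++espc[i];
    long sum = 0;                          // turn per-chunk counts into cumulative row offsets
    for (int i = 0; i < espc.length; ++i) {
      long s = espc[i];
      espc[i] = sum;
      sum += s;
    }
    assert espc[espc.length - 1] == numRows;
    return espc; // e.g. buildEspc(10, 3) -> {0, 4, 7, 10}
  }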
Example #4
0
  @Override
  protected Frame predictScoreImpl(Frame orig, Frame adaptedFr, String destination_key) {
    Frame adaptFrm = new Frame(adaptedFr);
    for (int i = 0; i < _parms._k; i++)
      adaptFrm.add("PC" + String.valueOf(i + 1), adaptFrm.anyVec().makeZero());

    new MRTask() {
      @Override
      public void map(Chunk chks[]) {
        double tmp[] = new double[_output._names.length];
        double preds[] = new double[_parms._k];
        for (int row = 0; row < chks[0]._len; row++) {
          double p[] = score0(chks, row, tmp, preds);
          for (int c = 0; c < preds.length; c++) chks[_output._names.length + c].set(row, p[c]);
        }
      }
    }.doAll(adaptFrm);

    // Return the projection into principal component space
    int x = _output._names.length, y = adaptFrm.numCols();
    Frame f =
        adaptFrm.extractFrame(
            x, y); // this will call vec_impl() and we cannot call the delete() below just yet

    f =
        new Frame(
            (null == destination_key ? Key.make() : Key.make(destination_key)),
            f.names(),
            f.vecs());
    DKV.put(f);
    makeMetricBuilder(null).makeModelMetrics(this, orig);
    return f;
  }
Example #5
0
 @Override
 public void map(Chunk[] ix, NewChunk[] ncs) {
   final Vec[] vecs = new Vec[_cols.length];
   final Vec anyv = _base.anyVec();
   final long nrow = anyv.length();
   long r = ix[0].at80(0);
   int last_ci = anyv.elem2ChunkIdx(r < nrow ? r : 0); // memoize the last chunk index
   long last_c0 = anyv._espc[last_ci]; // ...         last chunk start
   long last_c1 = anyv._espc[last_ci + 1]; // ...         last chunk end
   Chunk[] last_cs = new Chunk[vecs.length]; // ...         last chunks
   for (int c = 0; c < _cols.length; c++) {
     vecs[c] = _base.vecs()[_cols[c]];
     last_cs[c] = vecs[c].elem2BV(last_ci);
   }
   for (int i = 0; i < ix[0]._len; i++) {
     // select one row
     r = ix[0].at80(i) - 1; // next row to select
     if (r < 0) continue;
     if (r >= nrow) {
       for (int c = 0; c < vecs.length; c++) ncs[c].addNum(Double.NaN);
     } else {
       if (r < last_c0 || r >= last_c1) {
         last_ci = anyv.elem2ChunkIdx(r);
         last_c0 = anyv._espc[last_ci];
         last_c1 = anyv._espc[last_ci + 1];
         for (int c = 0; c < vecs.length; c++) last_cs[c] = vecs[c].elem2BV(last_ci);
       }
       for (int c = 0; c < vecs.length; c++) ncs[c].addNum(last_cs[c].at(r));
     }
   }
 }
Example #6
0
 static Frame exec_str(String str, String id) {
   Val val = Exec.exec(str);
   switch (val.type()) {
     case Val.FRM:
       Frame fr = val.getFrame();
       Key k = Key.make(id);
       // Smart delete any prior top-level result
       Iced i = DKV.getGet(k);
       if (i instanceof Lockable) ((Lockable) i).delete();
       else if (i instanceof Keyed) ((Keyed) i).remove();
        else if (i != null)
          throw new IllegalArgumentException("Attempting to overwrite an unexpected key");
       DKV.put(fr = new Frame(k, fr._names, fr.vecs()));
       System.out.println(fr);
       checkSaneFrame();
       return fr;
     case Val.NUM:
       System.out.println("num= " + val.getNum());
       assert id == null;
       checkSaneFrame();
       return null;
     case Val.STR:
       System.out.println("str= " + val.getStr());
       assert id == null;
       checkSaneFrame();
       return null;
     default:
       throw water.H2O.fail();
   }
 }
Example #7
0
 DRFTree(Frame fr, int ncols, char nbins, char nclass, int min_rows, int mtrys, long seed) {
   super(fr._names, ncols, nbins, nclass, min_rows, seed);
   _mtrys = mtrys;
   _rand = createRNG(seed);
   _seeds = new long[fr.vecs()[0].nChunks()];
   for (int i = 0; i < _seeds.length; i++) _seeds[i] = _rand.nextLong();
 }
Example #8
0
  @Override
  Val apply(Env env, Env.StackHelp stk, AST asts[]) {
    Val v = stk.track(asts[1].exec(env));
    if (v instanceof ValRow) {
      ValRow vv = (ValRow) v;
      return vv.slice(asts[2].columns(vv._names));
    }
    Frame fr = v.getFrame();
    int[] cols = asts[2].columns(fr.names());

    Frame fr2 = new Frame();
    if (cols.length == 0) { // Empty inclusion list?
    } else if (cols[0] >= 0) { // Positive (inclusion) list
      if (cols[cols.length - 1] >= fr.numCols())
        throw new IllegalArgumentException(
            "Column must be an integer from 0 to " + (fr.numCols() - 1));
      for (int col : cols) fr2.add(fr.names()[col], fr.vecs()[col]);
    } else { // Negative (exclusion) list
      fr2 = new Frame(fr); // All of them at first
      Arrays.sort(cols); // This loop depends on the values in sorted order
      for (int col : cols)
        if (0 <= -col - 1 && -col - 1 < fr.numCols()) fr2.remove(-col - 1); // Remove named column
    }

    return new ValFrame(fr2);
  }
Example #9
0
    @Override
    protected void init() {
      if (validation != null && n_folds != 0)
        throw new UnsupportedOperationException(
            "Cannot specify a validation dataset and non-zero number of cross-validation folds.");
      if (n_folds < 0)
        throw new UnsupportedOperationException(
            "The number of cross-validation folds must be >= 0.");
      super.init();
      xval_models = new Key[n_folds];
      for (int i = 0; i < xval_models.length; ++i)
        xval_models[i] = Key.make(dest().toString() + "_xval" + i);

      int rIndex = 0;
      for (int i = 0; i < source.vecs().length; i++)
        if (source.vecs()[i] == response) {
          rIndex = i;
          break;
        }
      _responseName = source._names != null && rIndex >= 0 ? source._names[rIndex] : "response";

      _train = selectVecs(source);
      _names = new String[cols.length];
      for (int i = 0; i < cols.length; i++) _names[i] = source._names[cols[i]];

      // Compute source response domain
      if (classification) _sourceResponseDomain = getVectorDomain(response);
      // Is validation specified?
      if (validation != null) {
        // Extract a validation response
        int idx = validation.find(source.names()[rIndex]);
        if (idx == -1)
          throw new IllegalArgumentException(
              "Validation set does not have a response column called " + _responseName);
        _validResponse = validation.vecs()[idx];
        // Compute output confusion matrix domain for classification:
        // - if validation dataset is specified then CM domain is union of train and validation
        // response domains
        //   else it is only domain of response column.
        if (classification) {
          _validResponseDomain = getVectorDomain(_validResponse);
          if (_validResponseDomain != null) {
            _cmDomain = Utils.domainUnion(_sourceResponseDomain, _validResponseDomain);
            if (!Arrays.deepEquals(_sourceResponseDomain, _validResponseDomain)) {
              _fromModel2CM =
                  Model.getDomainMapping(
                      _cmDomain,
                      _sourceResponseDomain,
                      false); // transformation from model produced response ~> cmDomain
              _fromValid2CM =
                  Model.getDomainMapping(
                      _cmDomain,
                      _validResponseDomain,
                      false); // transformation from validation response domain ~> cmDomain
            }
          } else _cmDomain = _sourceResponseDomain;
        } /* end of if classification */
      } else if (classification) _cmDomain = _sourceResponseDomain;
    }
Example #10
0
  @Override
  public ValFrame apply(Env env, Env.StackHelp stk, AstRoot asts[]) {
    Frame f = stk.track(asts[1].exec(env)).getFrame();
    AstRoot axisAR = asts[2];
    for (Vec v : f.vecs()) {
      if (v.isCategorical() || v.isString() || v.isUUID())
        throw new IllegalArgumentException(
            "Cumulative functions not applicable to enum, string, or UUID values");
    }
    double axis = axisAR.exec(env).getNum();
    if (axis != 1.0 && axis != 0.0) throw new IllegalArgumentException("Axis must be 0 or 1");
    if (f.numCols() == 1) {
      if (axis == 0.0) {
        AstCumu.CumuTask t = new AstCumu.CumuTask(f.anyVec().nChunks(), init());
        t.doAll(new byte[] {Vec.T_NUM}, f.anyVec());
        final double[] chkCumu = t._chkCumu;
        Vec cumuVec = t.outputFrame().anyVec();
        new MRTask() {
          @Override
          public void map(Chunk c) {
            if (c.cidx() != 0) {
              double d = chkCumu[c.cidx() - 1];
              for (int i = 0; i < c._len; ++i) c.set(i, op(c.atd(i), d));
            }
          }
        }.doAll(cumuVec);
        return new ValFrame(new Frame(cumuVec));
      } else {
        return new ValFrame(new Frame(f));
      }
    } else {

      if (axis == 0.0) { // down the column implementation

        AstCumu.CumuTaskWholeFrame t =
            new AstCumu.CumuTaskWholeFrame(f.anyVec().nChunks(), init(), f.numCols());
        Frame fr2 = t.doAll(f.numCols(), Vec.T_NUM, f).outputFrame(null, f.names(), null);
        final double[][] chkCumu = t._chkCumu;
        new MRTask() {
          @Override
          public void map(Chunk cs[]) {
            if (cs[0].cidx() != 0) {
              for (int i = 0; i < cs.length; i++) {
                double d = chkCumu[i][cs[i].cidx() - 1];
                for (int j = 0; j < cs[i]._len; ++j) cs[i].set(j, op(cs[i].atd(j), d));
              }
            }
          }
        }.doAll(fr2);
        return new ValFrame(new Frame(fr2));

      } else {
        AstCumu.CumuTaskAxis1 t = new AstCumu.CumuTaskAxis1(init());
        Frame fr2 = t.doAll(f.numCols(), Vec.T_NUM, f).outputFrame(null, f.names(), null);
        return new ValFrame(new Frame(fr2));
      }
    }
  }
Example #11
0
 protected final Frame selectFrame(Frame frame) {
   Vec[] vecs = new Vec[cols.length];
   String[] names = new String[cols.length];
   for( int i = 0; i < cols.length; i++ ) {
     vecs[i] = frame.vecs()[cols[i]];
     names[i] = frame.names()[cols[i]];
   }
   return new Frame(names, vecs);
 }
Example #12
0
 /**
  * Globally redistributes a Frame (re-balances its chunks), optionally shuffling rows, via an
  * all-to-one + one-to-all process.
  *
  * @param fr Input frame
  * @param splits Target number of chunks to split the data into
  * @param seed RNG seed
  * @param shuffle whether to shuffle the data globally
  * @return Rebalanced (and possibly shuffled) frame; the input frame unchanged if no rebalancing
  *     was needed
  */
 public static Frame shuffleAndBalance(
     final Frame fr, int splits, long seed, final boolean local, final boolean shuffle) {
   if ((fr.vecs()[0].nChunks() < splits || shuffle) && fr.numRows() > splits) {
     Vec[] vecs = fr.vecs().clone();
     Log.info("Load balancing dataset, splitting it into up to " + splits + " chunks.");
     long[] idx = null;
     if (shuffle) {
       idx = new long[splits];
       for (int r = 0; r < idx.length; ++r) idx[r] = r;
       Utils.shuffleArray(idx, seed);
     }
     Key keys[] = new Vec.VectorGroup().addVecs(vecs.length);
     final long rows_per_new_chunk = (long) (Math.ceil((double) fr.numRows() / splits));
     // loop over cols (same indexing for each column)
     Futures fs = new Futures();
     for (int col = 0; col < vecs.length; col++) {
       AppendableVec vec = new AppendableVec(keys[col]);
       // create outgoing chunks for this col
       NewChunk[] outCkg = new NewChunk[splits];
       for (int i = 0; i < splits; ++i) outCkg[i] = new NewChunk(vec, i);
       // loop over all incoming chunks
       for (int ckg = 0; ckg < vecs[col].nChunks(); ckg++) {
         final Chunk inCkg = vecs[col].chunkForChunkIdx(ckg);
         // loop over local rows of incoming chunks (fast path)
         for (int row = 0; row < inCkg._len; ++row) {
           int outCkgIdx =
               (int) ((inCkg._start + row) / rows_per_new_chunk); // destination chunk idx
           if (shuffle)
             outCkgIdx = (int) (idx[outCkgIdx]); // shuffle: choose a different output chunk
           assert (outCkgIdx >= 0 && outCkgIdx < splits);
           outCkg[outCkgIdx].addNum(inCkg.at0(row));
         }
       }
       for (int i = 0; i < outCkg.length; ++i) outCkg[i].close(i, fs);
       Vec t = vec.close(fs);
       t._domain = vecs[col]._domain;
       vecs[col] = t;
     }
     fs.blockForPending();
     Log.info("Load balancing done.");
     return new Frame(fr.names(), vecs);
   }
   return fr;
 }
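The heart of shuffleAndBalance is the destination-chunk arithmetic: an absolute row index maps to floor(globalRow / rows_per_new_chunk), optionally permuted when shuffling is requested. A minimal sketch of just that mapping, with hypothetical arguments and no H2O dependencies, follows.

  // Sketch: compute the output chunk index for one row, as in the inner loop above.
  static int destinationChunk(long globalRow, long numRows, int splits, long[] shuffleIdx) {
    long rowsPerNewChunk = (long) Math.ceil((double) numRows / splits);
    int out = (int) (globalRow / rowsPerNewChunk);       // destination chunk before shuffling
    if (shuffleIdx != null) out = (int) shuffleIdx[out]; // shuffle: pick a permuted output chunk
    assert out >= 0 && out < splits;
    return out; // e.g. destinationChunk(7, 10, 3, null) -> 1
  }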
Example #13
0
  @Override
  public void compute2() {
    // Lock all possible data
    dataset.read_lock(jobKey);
    // Create a template vector for each segment
    final Vec[][] templates = makeTemplates(dataset, ratios);
    final int nsplits = templates.length;
    assert nsplits == ratios.length + 1 : "Unexpected number of split templates!";
    // Launch number of distributed FJ for each split part
    final Vec[] datasetVecs = dataset.vecs();
    splits = new Frame[nsplits];
    for (int s = 0; s < nsplits; s++) {
      Frame split = new Frame(destKeys[s], dataset.names(), templates[s]);
      split.delete_and_lock(jobKey);
      splits[s] = split;
    }
    setPendingCount(1);
    H2O.submitTask(
        new H2OCountedCompleter(FrameSplitter.this) {
          @Override
          public void compute2() {
            setPendingCount(nsplits);
            for (int s = 0; s < nsplits; s++) {
              new FrameSplitTask(
                      new H2OCountedCompleter(this) { // Completer for this task
                        @Override
                        public void compute2() {}

                        @Override
                        public boolean onExceptionalCompletion(
                            Throwable ex, CountedCompleter caller) {
                          synchronized (FrameSplitter.this) {
                            // synchronized on the enclosing FrameSplitter: workersExceptions can
                            // be updated concurrently by different workers
                            workersExceptions =
                                workersExceptions != null
                                    ? Arrays.copyOf(workersExceptions, workersExceptions.length + 1)
                                    : new Throwable[1];
                            workersExceptions[workersExceptions.length - 1] = ex;
                          }
                          tryComplete(); // we handled the exception, so proceed with normal
                          // completion
                          return false;
                        }
                      },
                      datasetVecs,
                      ratios,
                      s)
                  .asyncExec(splits[s]);
            }
            tryComplete(); // complete the computation of nsplits-tasks
          }
        });
    tryComplete(); // complete the computation of thrown tasks
  }
Example #14
0
  // --------------------------------------------------------------------------
  // Build an entire layer of all K trees
  protected DHistogram[][][] buildLayer(
      final Frame fr,
      final int nbins,
      int nbins_cats,
      final DTree ktrees[],
      final int leafs[],
      final DHistogram hcs[][][],
      boolean subset,
      boolean build_tree_one_node) {
    // Build K trees, one per class.

    // Build up the next-generation tree splits from the current histograms.
    // Nearly all leaves will split one more level.  This loop nest is
    //           O( #active_splits * #bins * #ncols )
    // but is NOT over all the data.
    ScoreBuildOneTree sb1ts[] = new ScoreBuildOneTree[_nclass];
    Vec vecs[] = fr.vecs();
    for (int k = 0; k < _nclass; k++) {
      final DTree tree = ktrees[k]; // Tree for class K
      if (tree == null) continue;
      // Build a frame with the predictor columns plus only this tree's (tree, work, nid) columns,
      // so the nested MRTask ScoreBuildHistogram in ScoreBuildOneTree does not try
      // to close other trees' Vecs when run in parallel.
      Frame fr2 = new Frame(Arrays.copyOf(fr._names, _ncols + 1), Arrays.copyOf(vecs, _ncols + 1));
      fr2.add(fr._names[idx_tree(k)], vecs[idx_tree(k)]);
      fr2.add(fr._names[idx_work(k)], vecs[idx_work(k)]);
      fr2.add(fr._names[idx_nids(k)], vecs[idx_nids(k)]);
      if (idx_weight() >= 0) fr2.add(fr._names[idx_weight()], vecs[idx_weight()]);
      // Start building one of the K trees in parallel
      H2O.submitTask(
          sb1ts[k] =
              new ScoreBuildOneTree(
                  this,
                  k,
                  nbins,
                  nbins_cats,
                  tree,
                  leafs,
                  hcs,
                  fr2,
                  subset,
                  build_tree_one_node,
                  _improvPerVar,
                  _model._parms._distribution));
    }
    // Block for all K trees to complete.
    boolean did_split = false;
    for (int k = 0; k < _nclass; k++) {
      final DTree tree = ktrees[k]; // Tree for class K
      if (tree == null) continue;
      sb1ts[k].join();
      if (sb1ts[k]._did_split) did_split = true;
    }
    // The layer is done.
    return did_split ? hcs : null;
  }
Example #15
0
 @Override
 ValFrame apply(Env env, Env.StackHelp stk, AST asts[]) {
   Frame fr = stk.track(asts[1].exec(env)).getFrame();
   double frac = asts[2].exec(env).getNum();
   double nrow = fr.numRows() * frac;
   Vec vecs[] = fr.vecs();
   long[] idxs = new long[fr.numCols()];
   int j = 0;
   for (int i = 0; i < idxs.length; i++) if (vecs[i].naCnt() < nrow) idxs[j++] = i;
   Vec vec = Vec.makeVec(Arrays.copyOf(idxs, j), null, Vec.VectorGroup.VG_LEN1.addVec());
   return new ValFrame(new Frame(vec));
 }
Example #16
0
 public final String[] coefNames() {
   int k = 0;
   final int n = fullN();
   String[] res = new String[n];
   final Vec[] vecs = _adaptedFrame.vecs();
   for (int i = 0; i < _cats; ++i)
     for (int j = 1; j < vecs[i]._domain.length; ++j)
       res[k++] = _adaptedFrame._names[i] + "." + vecs[i]._domain[j];
   final int nums = n - k;
   for (int i = 0; i < nums; ++i) res[k + i] = _adaptedFrame._names[_cats + i];
   return res;
 }
Example #17
0
  // Scalar covariance for 1 row
  private ValNum scalar(Frame frx, Frame fry, Mode mode) {
    if (frx.numCols() != fry.numCols())
      throw new IllegalArgumentException(
          "Single rows must have the same number of columns, found "
              + frx.numCols()
              + " and "
              + fry.numCols());
    Vec vecxs[] = frx.vecs();
    Vec vecys[] = fry.vecs();
    double xmean = 0, ymean = 0, ncols = frx.numCols(), NACount = 0, xval, yval, ss = 0;
    for (int r = 0; r < ncols; r++) {
      xval = vecxs[r].at(0);
      yval = vecys[r].at(0);
      if (Double.isNaN(xval) || Double.isNaN(yval)) NACount++;
      else {
        xmean += xval;
        ymean += yval;
      }
    }
    xmean /= (ncols - NACount);
    ymean /= (ncols - NACount);

    if (NACount != 0) {
      if (mode.equals(Mode.AllObs))
        throw new IllegalArgumentException("Mode is 'all.obs' but NAs are present");
      if (mode.equals(Mode.Everything)) return new ValNum(Double.NaN);
    }

    for (int r = 0; r < ncols; r++) {
      xval = vecxs[r].at(0);
      yval = vecys[r].at(0);
      if (!(Double.isNaN(xval) || Double.isNaN(yval)))
        ss += (vecxs[r].at(0) - xmean) * (vecys[r].at(0) - ymean);
    }
    return new ValNum(ss / (ncols - NACount - 1));
  }
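The same pairwise computation is easier to follow on plain arrays. The sketch below is a hedged, self-contained restatement of scalar(...) above (not part of that class): two passes over the values, NA pairs skipped in both, and the sample denominator of (n - NAs - 1).

  // Sketch: covariance of two "single-row" value arrays with complete-observation NA handling.
  // e.g. pairwiseCovariance(new double[] {1, 2, Double.NaN, 4}, new double[] {2, 4, 6, 8})
  // uses only the 3 complete pairs.
  static double pairwiseCovariance(double[] xs, double[] ys) {
    int n = xs.length, naCount = 0;
    double xmean = 0, ymean = 0;
    for (int i = 0; i < n; i++) {
      if (Double.isNaN(xs[i]) || Double.isNaN(ys[i])) naCount++;
      else {
        xmean += xs[i];
        ymean += ys[i];
      }
    }
    xmean /= (n - naCount);
    ymean /= (n - naCount);
    double ss = 0;
    for (int i = 0; i < n; i++)
      if (!(Double.isNaN(xs[i]) || Double.isNaN(ys[i])))
        ss += (xs[i] - xmean) * (ys[i] - ymean);
    return ss / (n - naCount - 1); // sample covariance over the complete pairs
  }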
Example #18
0
  @Test
  public void test() {
    Frame frame = null;
    try {
      Futures fs = new Futures();
      Random random = new Random();
      Vec[] vecs = new Vec[1];
      AppendableVec vec = new AppendableVec(Vec.newKey(), Vec.T_NUM);
      for (int i = 0; i < 2; i++) {
        NewChunk chunk = new NewChunk(vec, i);
        for (int r = 0; r < 1000; r++) chunk.addNum(random.nextInt(1000));
        chunk.close(i, fs);
      }
      vecs[0] = vec.layout_and_close(fs);
      fs.blockForPending();
      frame = new Frame(Key.<Frame>make(), null, vecs);

      // Make sure we test the multi-chunk case
      vecs = frame.vecs();
      assert vecs[0].nChunks() > 1;
      long rows = frame.numRows();
      Vec v = vecs[0];
      double min = Double.POSITIVE_INFINITY, max = Double.NEGATIVE_INFINITY, mean = 0, sigma = 0;
      for (int r = 0; r < rows; r++) {
        double d = v.at(r);
        if (d < min) min = d;
        if (d > max) max = d;
        mean += d;
      }
      mean /= rows;
      for (int r = 0; r < rows; r++) {
        double d = v.at(r);
        sigma += (d - mean) * (d - mean);
      }
      sigma = Math.sqrt(sigma / (rows - 1));

      double epsilon = 1e-9;
      assertEquals(max, v.max(), epsilon);
      assertEquals(min, v.min(), epsilon);
      assertEquals(mean, v.mean(), epsilon);
      assertEquals(sigma, v.sigma(), epsilon);
    } finally {
      if (frame != null) frame.delete();
    }
  }
Example #19
0
  // ==========================================================================
  public void basicGBM(String fname, String hexname, PrepData prep) {
    File file = TestUtil.find_test_file(fname);
    if (file == null) return; // Silently abort test if the file is missing
    Key fkey = NFSFileVec.make(file);
    Key dest = Key.make(hexname);
    GBM gbm = null;
    Frame fr = null;
    try {
      gbm = new GBM();
      gbm.source = fr = ParseDataset2.parse(dest, new Key[] {fkey});
      UKV.remove(fkey);
      int idx = prep.prep(fr);
      if (idx < 0) {
        gbm.classification = false;
        idx = ~idx;
      }
      String rname = fr._names[idx];
      gbm.response = fr.vecs()[idx];
      fr.remove(idx); // Move response to the end
      fr.add(rname, gbm.response);
      gbm.ntrees = 4;
      gbm.max_depth = 4;
      gbm.min_rows = 1;
      gbm.nbins = 50;
      gbm.cols = new int[fr.numCols()];
      for (int i = 0; i < gbm.cols.length; i++) gbm.cols[i] = i;
      gbm.learn_rate = .2f;
      gbm.invoke();

      fr = gbm.score(gbm.source);

      GBM.GBMModel gbmmodel = UKV.get(gbm.dest());
      // System.out.println(gbmmodel.toJava());

    } finally {
      UKV.remove(dest); // Remove original hex frame key
      if (gbm != null) {
        UKV.remove(gbm.dest()); // Remove the model
        UKV.remove(gbm.response._key);
        gbm.remove(); // Remove GBM Job
        if (fr != null) fr.remove();
      }
    }
  }
Example #20
0
 public DataInfo validDinfo(Frame valid) {
   DataInfo res =
       new DataInfo(
           _adaptedFrame,
           null,
           1,
           _useAllFactorLevels,
           TransformType.NONE,
           TransformType.NONE,
           _skipMissing,
           _imputeMissing,
           false,
           _weights,
           _offset,
           _fold);
   res._adaptedFrame = new Frame(_adaptedFrame.names(), valid.vecs(_adaptedFrame.names()));
   res._valid = true;
   return res;
 }
Example #21
0
 public final String[] coefNames() {
   if (_coefNames != null) return _coefNames;
   int k = 0;
   final int n = fullN();
   String[] res = new String[n];
   final Vec[] vecs = _adaptedFrame.vecs();
   for (int i = 0; i < _cats; ++i) {
     for (int j = _useAllFactorLevels ? 0 : 1; j < vecs[i].domain().length; ++j) {
       int jj = getCategoricalId(i, j);
       if (jj < 0) continue;
       res[k++] = _adaptedFrame._names[i] + "." + vecs[i].domain()[j];
     }
     if (_catMissing[i] > 0) res[k++] = _adaptedFrame._names[i] + ".missing(NA)";
   }
   final int nums = n - k;
   System.arraycopy(_adaptedFrame._names, _cats, res, k, nums);
   _coefNames = res;
   return res;
 }
Example #22
0
  public final String[] coefNames() {
    if (_coefNames != null) return _coefNames; // already computed
    int k = 0;
    final int n = fullN(); // total number of columns to compute
    String[] res = new String[n];
    final Vec[] vecs = _adaptedFrame.vecs();

    // first do all of the expanded categorical names
    for (int i = 0; i < _cats; ++i) {
      for (int j = (_useAllFactorLevels || vecs[i] instanceof InteractionWrappedVec) ? 0 : 1;
          j < vecs[i].domain().length;
          ++j) {
        int jj = getCategoricalId(i, j);
        if (jj < 0) continue;
        res[k++] = _adaptedFrame._names[i] + "." + vecs[i].domain()[j];
      }
      if (_catMissing[i] && getCategoricalId(i, _catModes[i]) >= 0)
        res[k++] = _adaptedFrame._names[i] + ".missing(NA)";
    }
    // now loop over the numerical columns, collecting up any expanded InteractionVec names
    if (_interactions == null) {
      final int nums = n - k;
      System.arraycopy(_adaptedFrame._names, _cats, res, k, nums);
    } else {
      for (int i = _cats; i < _nums; ++i) {
        InteractionWrappedVec v;
        if (vecs[i] instanceof InteractionWrappedVec
            && ((v = (InteractionWrappedVec) vecs[i]).domains()
                != null)) { // in this case, get the categoricalOffset
          for (int j = 0; j < v.domains().length; ++j) {
            if (getCategoricalIdFromInteraction(i, j) < 0) continue;
            res[k++] = _adaptedFrame._names[i] + "." + v.domains()[j];
          }
        } else res[k++] = _adaptedFrame._names[i];
      }
    }
    _coefNames = res;
    return res;
  }
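The naming convention built above — "column.level" for each kept factor level, an optional "column.missing(NA)" entry, then the untouched numeric names — can be illustrated without the DataInfo machinery. The sketch below uses hypothetical column names and shows only the expansion order for one categorical column.

  // Sketch: expand one categorical column into coefficient names the way coefNames() does.
  // With useAllFactorLevels == false the first level is dropped (it acts as the reference level).
  // e.g. expandedNames("color", new String[] {"blue", "green", "red"}, false, true)
  //      -> [color.green, color.red, color.missing(NA)]
  static java.util.List<String> expandedNames(
      String col, String[] domain, boolean useAllFactorLevels, boolean hasMissing) {
    java.util.List<String> res = new java.util.ArrayList<>();
    for (int j = useAllFactorLevels ? 0 : 1; j < domain.length; ++j) res.add(col + "." + domain[j]);
    if (hasMissing) res.add(col + ".missing(NA)");
    return res;
  }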
Example #23
0
  public static Frame expandDataset(Frame fr, Key destkey) { // , int[] ignored) {
    ArrayList<Vec> nvecs = new ArrayList<Vec>();
    ArrayList<Vec> evecs = new ArrayList<Vec>();
    ArrayList<String> eNames = new ArrayList<String>();
    ArrayList<String> nNames = new ArrayList<String>();
    int[] offsets = new int[fr.numCols() + 1];
    Vec[] vecs = fr.vecs();
    int c = 0;
    // int ip = 0; //ignored pointer
    for (int i = 0; i < fr.numCols(); i++) {
      if (vecs[i].isEnum()) {
        offsets[evecs.size()] = c;
        evecs.add(vecs[i]);
        String name = fr._names[i];
        c += vecs[i]._domain.length;
        for (String s : vecs[i]._domain) eNames.add(name + "." + s);
      } else {
        // if(i == ignored[ip] && ip < ignored.length - 1) ip++;
        nvecs.add(vecs[i]);
        nNames.add(fr._names[i]);
      }
    }
    offsets[evecs.size()] = c;
    if (evecs.isEmpty()) return fr;
    offsets = Arrays.copyOf(offsets, evecs.size() + 1);

    OneHot ss = new OneHot();
    ss._offsets = offsets;
    int l = offsets[evecs.size()];
    ss.doAll(l, evecs.toArray(new Vec[evecs.size()]));

    Frame fr2 = ss.outputFrame(destkey, eNames.toArray(new String[eNames.size()]), new String[l][]);
    fr2.add(
        new Frame(nNames.toArray(new String[nNames.size()]), nvecs.toArray(new Vec[nvecs.size()])),
        false);
    return fr2;
  }
Example #24
0
 static boolean checkSaneFrame_impl() {
   for (Key k : H2O.localKeySet()) {
     Value val = H2O.raw_get(k);
     if (val.isFrame()) {
       Frame fr = val.get();
       Vec vecs[] = fr.vecs();
       for (int i = 0; i < vecs.length; i++) {
         Vec v = vecs[i];
         if (DKV.get(v._key) == null) {
           System.err.println(
               "Frame "
                   + fr._key
                   + " in the DKV, is missing Vec "
                   + v._key
                   + ", name="
                   + fr._names[i]);
           return false;
         }
       }
     }
   }
   return true;
 }
Example #25
0
    @Override protected void init() {
      super.init();

      int rIndex = 0;
      for( int i = 0; i < source.vecs().length; i++ )
        if( source.vecs()[i] == response )
          rIndex = i;
      _responseName = source._names != null && rIndex >= 0 ? source._names[rIndex] : "response";

      _train = selectVecs(source);
      _names = new String[cols.length];
      for( int i = 0; i < cols.length; i++ )
        _names[i] = source._names[cols[i]];

      // Compute source response domain
      if (classification) _sourceResponseDomain = getVectorDomain(response);
      // Is validation specified?
      if( validation != null ) {
        // Extract a validation response
        int idx = validation.find(source.names()[rIndex]);
        if( idx == -1 ) throw new IllegalArgumentException("Validation set does not have a response column called "+_responseName);
        _validResponse = validation.vecs()[idx];
        // Compute output confusion matrix domain for classification:
        // - if validation dataset is specified then CM domain is union of train and validation response domains
        //   else it is only domain of response column.
        if (classification) {
          _validResponseDomain  = getVectorDomain(_validResponse);
          if (_validResponseDomain!=null) {
            _cmDomain = Utils.domainUnion(_sourceResponseDomain, _validResponseDomain);
            if (!Arrays.deepEquals(_sourceResponseDomain, _validResponseDomain)) {
              _fromModel2CM = Model.getDomainMapping(_cmDomain, _sourceResponseDomain, false); // transformation from model produced response ~> cmDomain
              _fromValid2CM = Model.getDomainMapping(_cmDomain, _validResponseDomain , false); // transformation from validation response domain ~> cmDomain
            }
          } else _cmDomain = _sourceResponseDomain;
        } /* end of if classification */
      } else if (classification) _cmDomain = _sourceResponseDomain;
    }
Example #26
0
 protected final Vec[] selectVecs(Frame frame) {
   Vec[] vecs = new Vec[cols.length];
   for( int i = 0; i < cols.length; i++ )
     vecs[i] = frame.vecs()[cols[i]];
   return vecs;
 }
Example #27
0
 public static boolean checkIdx(Frame source, int[] idx) {
   for (int i : idx) if (i<0 || i>source.vecs().length-1) return false;
   return true;
 }
Example #28
0
 /** Puts the given frame's vectors into the local trash, which can be emptied by calling the {@link #emptyLTrash()} method.
  * @see #emptyLTrash() */
 protected final void ltrash(Frame fr) {  ltrash(fr.vecs()); }
Example #29
0
  public static Frame[] shuffleSplitFrame(
      Frame fr, Key[] keys, final double ratios[], final long seed) {
    // Sanity check the ratios
    assert keys.length == ratios.length;
    double sum = ratios[0];
    for (int i = 1; i < ratios.length; i++) {
      sum += ratios[i];
      ratios[i] = sum;
    }
    assert water.util.MathUtils.equalsWithinOneSmallUlp(sum, 1.0);

    // Do the split, into ratios.length groupings of NewChunks
    final int ncols = fr.numCols();
    MRTask mr =
        new MRTask() {
          @Override
          public void map(Chunk cs[], NewChunk ncs[]) {
            Random rng = new Random(seed * cs[0].cidx());
            int nrows = cs[0]._len;
            for (int i = 0; i < nrows; i++) {
              double r = rng.nextDouble();
              int x = 0; // Pick the NewChunk split
              for (; x < ratios.length - 1; x++) if (r < ratios[x]) break;
              x *= ncols;
              // Helper string holder
              ValueString vstr = new ValueString();
              // Copy row to correct set of NewChunks
              for (int j = 0; j < ncols; j++) {
                byte colType = cs[j].vec().get_type();
                switch (colType) {
                  case Vec.T_BAD:
                    break; /* NOP */
                  case Vec.T_STR:
                    ncs[x + j].addStr(cs[j], i);
                    break;
                  case Vec.T_UUID:
                    ncs[x + j].addUUID(cs[j], i);
                    break;
                  case Vec.T_NUM: /* fallthrough */
                  case Vec.T_ENUM:
                  case Vec.T_TIME:
                    ncs[x + j].addNum(cs[j].atd(i));
                    break;
                  default:
                    if (colType > Vec.T_TIME && colType <= Vec.T_TIMELAST)
                      ncs[x + j].addNum(cs[j].atd(i));
                    else throw new IllegalArgumentException("Unsupported vector type: " + colType);
                    break;
                }
              }
            }
          }
        }.doAll(ncols * ratios.length, fr);

    // Build output frames
    Frame frames[] = new Frame[ratios.length];
    Vec[] vecs = fr.vecs();
    String[] names = fr.names();
    Futures fs = new Futures();
    for (int i = 0; i < ratios.length; i++) {
      Vec[] nvecs = new Vec[ncols];
      for (int c = 0; c < ncols; c++) {
        mr.appendables()[i * ncols + c].setDomain(vecs[c].domain());
        nvecs[c] = mr.appendables()[i * ncols + c].close(fs);
      }
      frames[i] = new Frame(keys[i], fr.names(), nvecs);
      DKV.put(frames[i], fs);
    }
    fs.blockForPending();
    return frames;
  }
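A hedged usage sketch: splitting a frame into an 80/20 train/test pair with the method above. It assumes an initialized H2O runtime, an existing Frame fr, and that shuffleSplitFrame is in scope; the key names and seed are illustrative.

  // Sketch only: the ratios must sum to 1.0 (asserted above); one output Frame per key/ratio pair.
  static Frame[] splitTrainTest(Frame fr) {
    Key[] keys = new Key[] {Key.make("train.hex"), Key.make("test.hex")};
    double[] ratios = new double[] {0.8, 0.2};
    return shuffleSplitFrame(fr, keys, ratios, 1234L); // [0] holds ~80% of the rows, [1] ~20%
  }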
Example #30
0
 public Frame(Frame fr) {
   this(fr._names.clone(), fr.vecs().clone());
   _col0 = null;
 }
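A brief, hedged note on how this copy constructor is typically used: the copy gets its own name/Vec arrays but shares the underlying Vec objects, so columns can be added to or removed from the copy without touching the original frame. The helper name below is illustrative.

 // Sketch (assumes an existing Frame and Vec from an initialized H2O runtime):
 static Frame withExtraColumn(Frame fr, String name, Vec v) {
   Frame copy = new Frame(fr); // shallow copy: clones the name/Vec arrays, shares the Vecs
   copy.add(name, v);          // structural change to the copy does not alter 'fr'
   return copy;
 }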