Ejemplo n.º 1
0
  /**
   * Slices columns out of the first argument, which evaluates to either a row or a frame.
   *
   * <p>The column selection comes from {@code asts[2].columns(names)}. An empty selection yields an
   * empty frame. A selection whose first entry is non-negative is treated as an inclusion list.
   * Otherwise each entry {@code c} excludes column {@code -c - 1}; exclusions that fall outside the
   * frame are silently ignored.
   *
   * @param env execution environment handed to each argument's {@code exec}
   * @param stk stack helper used to track the evaluated first argument
   * @param asts argument list; {@code asts[1]} is the data, {@code asts[2]} the column selection
   * @return the sliced row when the data is a {@code ValRow}, otherwise a {@code ValFrame}
   * @throws IllegalArgumentException if the last entry of an inclusion list is not a valid column
   *     index (valid indices are {@code 0 .. numCols - 1})
   */
  @Override
  Val apply(Env env, Env.StackHelp stk, AST asts[]) {
    Val v = stk.track(asts[1].exec(env));
    if (v instanceof ValRow) {
      ValRow vv = (ValRow) v;
      return vv.slice(asts[2].columns(vv._names));
    }
    Frame fr = v.getFrame();
    int[] cols = asts[2].columns(fr.names());

    Frame fr2 = new Frame();
    if (cols.length == 0) {
      // Empty inclusion list: fall through and return the empty frame
    } else if (cols[0] >= 0) { // Positive (inclusion) list
      // Only the last entry is checked; presumably columns() returns ascending indices -- TODO
      // confirm. The bound is inclusive of numCols because index numCols itself is out of range.
      if (cols[cols.length - 1] >= fr.numCols())
        throw new IllegalArgumentException(
            "Column must be an integer from 0 to " + (fr.numCols() - 1));
      for (int col : cols) fr2.add(fr.names()[col], fr.vecs()[col]);
    } else { // Negative (exclusion) list
      fr2 = new Frame(fr); // All of them at first
      Arrays.sort(cols); // This loop depends on the values in sorted order
      for (int col : cols)
        if (0 <= -col - 1 && -col - 1 < fr.numCols()) fr2.remove(-col - 1); // Remove named column
    }

    return new ValFrame(fr2);
  }
Ejemplo n.º 2
0
  /**
   * Applies this cumulative operator to a frame, either down each column (axis 0) or across each
   * row (axis 1).
   *
   * <p>For axis 0 the work is done in two passes: a first task produces per-chunk results together
   * with a {@code _chkCumu} table, and a second task combines every row of each chunk after the
   * first with the table entry of the preceding chunk via {@code op}. (Presumably the table holds
   * the running value at the end of each chunk -- TODO confirm against the task classes.)
   *
   * @param env execution environment used to evaluate the arguments
   * @param stk stack helper used to track the evaluated frame
   * @param asts argument list; {@code asts[1]} is the frame, {@code asts[2]} the axis
   * @return a frame value holding the cumulative result
   * @throws IllegalArgumentException if any column is categorical, string or UUID, or if the axis
   *     is neither 0 nor 1
   */
  @Override
  public ValFrame apply(Env env, Env.StackHelp stk, AstRoot asts[]) {
    Frame input = stk.track(asts[1].exec(env)).getFrame();
    AstRoot axisArg = asts[2];
    for (Vec column : input.vecs()) {
      boolean unsupported = column.isCategorical() || column.isString() || column.isUUID();
      if (unsupported)
        throw new IllegalArgumentException(
            "Cumulative functions not applicable to enum, string, or UUID values");
    }
    double axis = axisArg.exec(env).getNum();
    if (!(axis == 1.0 || axis == 0.0)) throw new IllegalArgumentException("Axis must be 0 or 1");
    boolean downColumns = axis == 0.0;

    if (input.numCols() == 1) {
      // Row-wise over a single column: the result is just a new Frame built from the input
      if (!downColumns) return new ValFrame(new Frame(input));

      AstCumu.CumuTask firstPass = new AstCumu.CumuTask(input.anyVec().nChunks(), init());
      firstPass.doAll(new byte[] {Vec.T_NUM}, input.anyVec());
      final double[] carryPerChunk = firstPass._chkCumu;
      Vec result = firstPass.outputFrame().anyVec();
      new MRTask() {
        @Override
        public void map(Chunk c) {
          if (c.cidx() == 0) return; // the first chunk has no predecessor to combine with
          double carry = carryPerChunk[c.cidx() - 1];
          for (int row = 0; row < c._len; ++row) c.set(row, op(c.atd(row), carry));
        }
      }.doAll(result);
      return new ValFrame(new Frame(result));
    }

    if (!downColumns) {
      AstCumu.CumuTaskAxis1 rowPass = new AstCumu.CumuTaskAxis1(init());
      Frame acrossRows =
          rowPass.doAll(input.numCols(), Vec.T_NUM, input).outputFrame(null, input.names(), null);
      return new ValFrame(new Frame(acrossRows));
    }

    // Down-the-column implementation for several columns at once
    AstCumu.CumuTaskWholeFrame framePass =
        new AstCumu.CumuTaskWholeFrame(input.anyVec().nChunks(), init(), input.numCols());
    Frame downCols =
        framePass.doAll(input.numCols(), Vec.T_NUM, input).outputFrame(null, input.names(), null);
    final double[][] carryPerColumn = framePass._chkCumu;
    new MRTask() {
      @Override
      public void map(Chunk cs[]) {
        if (cs[0].cidx() == 0) return; // the first chunk has no predecessor to combine with
        for (int col = 0; col < cs.length; col++) {
          double carry = carryPerColumn[col][cs[col].cidx() - 1];
          for (int row = 0; row < cs[col]._len; ++row)
            cs[col].set(row, op(cs[col].atd(row), carry));
        }
      }
    }.doAll(downCols);
    return new ValFrame(new Frame(downCols));
  }
Ejemplo n.º 3
0
 /**
  * Runs {@code sumaxis} with arguments {@code 1 0} over a two-column frame built from {@code vc1}
  * and {@code vc2}, then checks the names, types and values of the result.
  */
 @Test
 public void testColumnwisesumBinaryVec() {
   // Precondition on the fixtures: only the first vector is binary
   assertTrue(vc1.isBinary() && !vc2.isBinary());
   Frame input = register(new Frame(Key.<Frame>make(), ar("C1", "C2"), aro(vc1, vc2)));
   String expression = "(sumaxis " + input._key + " 1 0)";
   Val outcome = Rapids.exec(expression);
   assertTrue(outcome instanceof ValFrame);
   Frame summed = register(outcome.getFrame());
   assertArrayEquals(input.names(), summed.names());
   assertArrayEquals(ar(Vec.T_NUM, Vec.T_NUM), summed.types());
   assertRowFrameEquals(ard(3.0, Double.NaN), summed);
 }
Ejemplo n.º 4
0
  /**
   * Scores a frame by appending {@code _parms._k} zero-filled columns named {@code PC1..PCk},
   * filling them row by row from {@code score0}, and publishing those columns as a new frame.
   *
   * @param orig the frame passed on to the model-metrics builder
   * @param adaptedFr the adapted frame whose columns feed {@code score0}
   * @param destination_key key name for the output frame; a fresh key is made when {@code null}
   * @return the frame of projection columns, already put into the DKV
   */
  @Override
  protected Frame predictScoreImpl(Frame orig, Frame adaptedFr, String destination_key) {
    Frame scoringFrm = new Frame(adaptedFr);
    for (int pc = 0; pc < _parms._k; pc++) {
      String pcName = "PC" + (pc + 1);
      scoringFrm.add(pcName, scoringFrm.anyVec().makeZero());
    }

    new MRTask() {
      @Override
      public void map(Chunk chks[]) {
        double scratch[] = new double[_output._names.length];
        double out[] = new double[_parms._k];
        final int rows = chks[0]._len;
        for (int row = 0; row < rows; row++) {
          double scored[] = score0(chks, row, scratch, out);
          // Output columns sit directly after the model's input columns
          for (int c = 0; c < out.length; c++)
            chks[_output._names.length + c].set(row, scored[c]);
        }
      }
    }.doAll(scoringFrm);

    // Return the projection into principal component space
    int firstPc = _output._names.length;
    int pastLastPc = scoringFrm.numCols();
    // this will call vec_impl() and we cannot call the delete() below just yet
    Frame projection = scoringFrm.extractFrame(firstPc, pastLastPc);

    Frame published =
        new Frame(
            (null == destination_key ? Key.make() : Key.make(destination_key)),
            projection.names(),
            projection.vecs());
    DKV.put(published);
    makeMetricBuilder(null).makeModelMetrics(this, orig);
    return published;
  }
Ejemplo n.º 5
0
  /**
   * Project each archetype into original feature space.
   *
   * <p>Categorical columns are filled from the archetypes' categorical blocks via {@code mimpute};
   * numeric columns are filled via {@code impute}. Every value is written to the column position
   * given by {@code _output._permutation}.
   *
   * @param frame Original training data with m rows and n columns
   * @param destination_key Frame Id for output; a fresh key is made when {@code null}
   * @param reverse_transform when {@code true}, each numeric value is divided by the matching
   *     {@code _normMul} entry and then has the matching {@code _normSub} entry added
   * @return Frame containing k rows and n columns, where each row corresponds to an archetype
   */
  public Frame scoreArchetypes(Frame frame, Key destination_key, boolean reverse_transform) {
    final int ncols = _output._names.length;
    Frame adaptedFr = new Frame(frame);
    adaptTestForTrain(adaptedFr, true, false);
    assert ncols == adaptedFr.numCols();
    String[][] domains = adaptedFr.domains();
    double[][] proj = new double[_parms._k][_output._nnums + _output._ncats];

    // Categorical columns come first in the archetype layout
    for (int col = 0; col < _output._ncats; col++) {
      double[][] catBlock = _output._archetypes_raw.getCatBlock(col);
      for (int arch = 0; arch < _parms._k; arch++) {
        int target = _output._permutation[col];
        proj[arch][target] = _parms.mimpute(catBlock[arch], _output._lossFunc[col]);
      }
    }

    // Numeric columns follow the categorical ones
    final int totalCols = _output._ncats + _output._nnums;
    for (int col = _output._ncats; col < totalCols; col++) {
      int numIdx = col - _output._ncats;
      for (int arch = 0; arch < _parms._k; arch++) {
        int target = _output._permutation[col];
        double raw = _output._archetypes_raw.getNum(numIdx, arch);
        double value = _parms.impute(raw, _output._lossFunc[col]);
        if (reverse_transform) {
          value = value / _output._normMul[numIdx] + _output._normSub[numIdx];
        }
        proj[arch][target] = value;
      }
    }

    // Convert projection of archetypes into a frame with correct domains
    Frame result =
        ArrayUtils.frame(
            (null == destination_key ? Key.make() : destination_key), adaptedFr.names(), proj);
    for (int col = 0; col < ncols; col++) {
      result.vec(col).setDomain(domains[col]);
    }
    return result;
  }
Ejemplo n.º 6
0
 /**
  * Sample rows from a frame. Each row is kept with probability {@code rows / fr.numRows()}; in
  * addition, the last row of a chunk is kept whenever nothing else in that chunk was. If the
  * result is still empty the method logs a warning and calls itself again with {@code seed + 1}.
  *
  * @param fr Input frame
  * @param rows Approximate number of rows to sample (across all chunks)
  * @param seed Seed for RNG
  * @return Sampled frame; {@code null} for a {@code null} input, and {@code fr} itself when the
  *     sampling fraction is at least 1 (which includes {@code rows <= 0})
  */
 public static Frame sampleFrame(Frame fr, final long rows, final long seed) {
   if (fr == null) return null;
   final float fraction;
   if (rows > 0) {
     fraction = (float) rows / fr.numRows();
   } else {
     fraction = 1.f;
   }
   if (fraction >= 1.f) return fr;

   Frame sampled =
       new MRTask2() {
         @Override
         public void map(Chunk[] cs, NewChunk[] ncs) {
           // One deterministic RNG per chunk, derived from the seed and the chunk index
           final Random rng = getDeterRNG(seed + cs[0].cidx());
           int kept = 0;
           for (int row = 0; row < cs[0]._len; row++) {
             boolean isLastRow = row == cs[0]._len - 1;
             boolean take = rng.nextFloat() < fraction || (kept == 0 && isLastRow);
             if (!take) continue;
             kept++;
             for (int col = 0; col < ncs.length; col++) {
               ncs[col].addNum(cs[col].at0(row));
             }
           }
         }
       }.doAll(fr.numCols(), fr).outputFrame(fr.names(), fr.domains());

   if (sampled.numRows() != 0) return sampled;

   Log.warn(
       "You asked for "
           + rows
           + " rows (out of "
           + fr.numRows()
           + "), but you got none (seed="
           + seed
           + ").");
   Log.warn("Let's try again. You've gotta ask yourself a question: \"Do I feel lucky?\"");
   return sampleFrame(fr, rows, seed + 1);
 }
Ejemplo n.º 7
0
 /**
  * Starts the rebalance: read-locks the input, lays out {@code _nchunks} chunks of near-equal
  * size, creates the zero-filled output frame on that layout, locks it, and launches a
  * {@code RebalanceTask} on it asynchronously.
  */
 @Override
 public void compute2() {
   _in.read_lock(_jobKey);
   // simply create a bogus new vector (don't even put it into KV) with appropriate number of lines
   // per chunk and then use it as a source to do multiple makeZero calls
   // to create empty vecs and than call RebalanceTask on each one of them.
   // RebalanceTask will fetch the appropriate src chunks and fetch the data from them.
   int rpc = (int) (_in.numRows() / _nchunks);
   int rem = (int) (_in.numRows() % _nchunks);

   // espc[i] is the starting row of chunk i; the first `rem` chunks get one extra row each
   long[] espc = new long[_nchunks + 1];
   for (int chunk = 0; chunk < _nchunks; chunk++) {
     long rowsInChunk = chunk < rem ? rpc + 1L : rpc;
     espc[chunk + 1] = espc[chunk] + rowsInChunk;
   }
   assert espc[espc.length - 1] == _in.numRows()
       : "unexpected number of rows, expected " + _in.numRows() + ", got " + espc[espc.length - 1];

   final Vec[] srcVecs = _in.vecs();
   Vec layoutVec = new Vec(Vec.newKey(), espc);
   _out = new Frame(_okey, _in.names(), layoutVec.makeZeros(srcVecs.length, _in.domains()));
   _out.delete_and_lock(_jobKey);
   new RebalanceTask(this, srcVecs).asyncExec(_out);
 }
Ejemplo n.º 8
0
 /**
  * Runs {@code sumaxis} with arguments {@code 1 0} over a seven-column frame built from the
  * fixture vectors, then checks the names, types and values of the result.
  */
 @Test
 public void testColumnwiseSumWithNaRm() {
   Frame input =
       register(
           new Frame(
               Key.<Frame>make(),
               ar("I", "D", "DD", "DN", "T", "S", "C"),
               aro(vi1, vd1, vd2, vd3, vt1, vs1, vc2)));
   String expression = "(sumaxis " + input._key + " 1 0)";
   Val outcome = Rapids.exec(expression);
   assertTrue(outcome instanceof ValFrame);
   Frame summed = register(outcome.getFrame());
   assertArrayEquals(input.names(), summed.names());
   // Every column is expected to be numeric except column "T", which is expected to be time
   assertArrayEquals(
       ar(Vec.T_NUM, Vec.T_NUM, Vec.T_NUM, Vec.T_NUM, Vec.T_TIME, Vec.T_NUM, Vec.T_NUM),
       summed.types());
   assertRowFrameEquals(ard(0.0, 20.0, 3.0, 6.0, 50000150.0, Double.NaN, Double.NaN), summed);
 }
Ejemplo n.º 9
0
 /**
  * Builds a frame containing only the columns listed in {@code cols}, in that order.
  *
  * @param frame the frame to pick columns from
  * @return a new frame over the selected names and vectors
  */
 protected final Frame selectFrame(Frame frame) {
   final int selected = cols.length;
   String[] pickedNames = new String[selected];
   Vec[] pickedVecs = new Vec[selected];
   for (int i = 0; i < selected; i++) {
     int source = cols[i];
     pickedVecs[i] = frame.vecs()[source];
     pickedNames[i] = frame.names()[source];
   }
   return new Frame(pickedNames, pickedVecs);
 }
Ejemplo n.º 10
0
  /**
   * Splits {@code dataset} into {@code ratios.length + 1} frames.
   *
   * <p>The method read-locks the dataset, creates and locks one output frame per split from the
   * template vectors, then submits a wrapper task that launches one {@code FrameSplitTask} per
   * split. Exceptions raised by those tasks are collected into {@code workersExceptions} instead
   * of being propagated.
   */
  @Override
  public void compute2() {
    // Lock all possible data
    dataset.read_lock(jobKey);
    // Create a template vector for each segment
    final Vec[][] templates = makeTemplates(dataset, ratios);
    final int nsplits = templates.length;
    assert nsplits == ratios.length + 1 : "Unexpected number of split templates!";
    // Launch number of distributed FJ for each split part
    final Vec[] datasetVecs = dataset.vecs();
    splits = new Frame[nsplits];
    for (int s = 0; s < nsplits; s++) {
      Frame split = new Frame(destKeys[s], dataset.names(), templates[s]);
      split.delete_and_lock(jobKey);
      splits[s] = split;
    }
    // One pending count for the wrapper task submitted below
    setPendingCount(1);
    H2O.submitTask(
        new H2OCountedCompleter(FrameSplitter.this) {
          @Override
          public void compute2() {
            // One pending count per split task launched in the loop
            setPendingCount(nsplits);
            for (int s = 0; s < nsplits; s++) {
              new FrameSplitTask(
                      new H2OCountedCompleter(this) { // Completer for this task
                        @Override
                        public void compute2() {}

                        @Override
                        public boolean onExceptionalCompletion(
                            Throwable ex, CountedCompleter caller) {
                          synchronized (
                              FrameSplitter
                                  .this) { // guard the shared array: this callback can run on
                            // different workers
                            // Grow the array by one slot (or create it) and store the exception
                            workersExceptions =
                                workersExceptions != null
                                    ? Arrays.copyOf(workersExceptions, workersExceptions.length + 1)
                                    : new Throwable[1];
                            workersExceptions[workersExceptions.length - 1] = ex;
                          }
                          tryComplete(); // the exception is handled here, so perform normal
                          // completion
                          return false;
                        }
                      },
                      datasetVecs,
                      ratios,
                      s)
                  .asyncExec(splits[s]);
            }
            tryComplete(); // complete the computation of nsplits-tasks
          }
        });
    tryComplete(); // complete the computation of thrown tasks
  }
Ejemplo n.º 11
0
 /**
  * Creates a {@code DataInfo} for a validation frame, reusing this instance's settings.
  *
  * <p>The new instance is first constructed from this instance's adapted frame; its adapted frame
  * is then replaced by one that pairs the same column names with the vectors that
  * {@code valid.vecs(names)} returns for them.
  *
  * @param valid the validation frame to take vectors from
  * @return a new {@code DataInfo} with {@code _valid} set to {@code true}
  */
 public DataInfo validDinfo(Frame valid) {
   DataInfo validInfo =
       new DataInfo(
           _adaptedFrame,
           null,
           1,
           _useAllFactorLevels,
           TransformType.NONE,
           TransformType.NONE,
           _skipMissing,
           _imputeMissing,
           false,
           _weights,
           _offset,
           _fold);
   String[] trainNames = _adaptedFrame.names();
   Vec[] validVecs = valid.vecs(_adaptedFrame.names());
   validInfo._adaptedFrame = new Frame(trainNames, validVecs);
   validInfo._valid = true;
   return validInfo;
 }
Ejemplo n.º 12
0
 /**
  * Produces a copy of a frame in which the rows of each chunk are permuted within that chunk.
  * Every chunk passes the same {@code seed} to {@code Utils.shuffleArray}.
  *
  * @param outputFrameKey key for the resulting frame
  * @param fr input frame
  * @param seed seed handed to the shuffle
  * @return the shuffled frame, with the names and domains of {@code fr}
  */
 public static Frame shuffleFramePerChunk(Key outputFrameKey, Frame fr, final long seed) {
   return new MRTask2() {
     @Override
     public void map(Chunk[] cs, NewChunk[] ncs) {
       // Start from the identity order of this chunk's local rows, then shuffle it
       final int localRows = cs[0]._len;
       long[] order = new long[localRows];
       for (int pos = 0; pos < localRows; ++pos) order[pos] = pos;
       Utils.shuffleArray(order, seed);
       // Emit the rows in the shuffled order, column by column
       for (int pos = 0; pos < localRows; ++pos) {
         int sourceRow = (int) order[pos];
         for (int col = 0; col < ncs.length; col++) {
           ncs[col].addNum(cs[col].at0(sourceRow));
         }
       }
     }
   }.doAll(fr.numCols(), fr).outputFrame(outputFrameKey, fr.names(), fr.domains());
 }
Ejemplo n.º 13
0
 /**
  * Global redistribution of a Frame (balancing of chunks), done by calling process (all-to-one +
  * one-to-all). Rows are copied into {@code splits} new chunks in contiguous runs; when shuffling,
  * the destination chunk of each run is chosen through a shuffled index table.
  *
  * <p>Nothing is done unless the frame has more rows than {@code splits} and either shuffling was
  * requested or the first vector has fewer chunks than {@code splits}.
  *
  * @param fr Input frame
  * @param splits number of output chunks to create
  * @param seed RNG seed
  * @param local not read by this method
  * @param shuffle whether to shuffle the data globally
  * @return Shuffled frame, or {@code fr} itself when no redistribution was performed
  */
 public static Frame shuffleAndBalance(
     final Frame fr, int splits, long seed, final boolean local, final boolean shuffle) {
   final boolean worthwhile =
       (fr.vecs()[0].nChunks() < splits || shuffle) && fr.numRows() > splits;
   if (!worthwhile) return fr;

   Vec[] columns = fr.vecs().clone();
   Log.info("Load balancing dataset, splitting it into up to " + splits + " chunks.");
   long[] chunkPerm = null;
   if (shuffle) {
     chunkPerm = new long[splits];
     for (int i = 0; i < chunkPerm.length; ++i) chunkPerm[i] = i;
     Utils.shuffleArray(chunkPerm, seed);
   }
   Key newKeys[] = new Vec.VectorGroup().addVecs(columns.length);
   final long rowsPerNewChunk = (long) (Math.ceil((double) fr.numRows() / splits));
   Futures fs = new Futures();
   // Same row-to-chunk mapping is applied to every column
   for (int c = 0; c < columns.length; c++) {
     AppendableVec target = new AppendableVec(newKeys[c]);
     // Outgoing chunks for this column
     NewChunk[] outputs = new NewChunk[splits];
     for (int i = 0; i < splits; ++i) outputs[i] = new NewChunk(target, i);
     // Walk every incoming chunk and route each of its rows
     for (int in = 0; in < columns[c].nChunks(); in++) {
       final Chunk source = columns[c].chunkForChunkIdx(in);
       for (int row = 0; row < source._len; ++row) {
         int dest = (int) ((source._start + row) / rowsPerNewChunk); // destination chunk idx
         if (shuffle) dest = (int) (chunkPerm[dest]); // shuffle: choose a different output chunk
         assert (dest >= 0 && dest < splits);
         outputs[dest].addNum(source.at0(row));
       }
     }
     for (int i = 0; i < outputs.length; ++i) outputs[i].close(i, fs);
     Vec balanced = target.close(fs);
     balanced._domain = columns[c]._domain;
     columns[c] = balanced;
   }
   fs.blockForPending();
   Log.info("Load balancing done.");
   return new Frame(fr.names(), columns);
 }
Ejemplo n.º 14
0
    /**
     * Rearranges {@code _modelBuilderTrain} so that the optional weights column, the optional
     * start column and the stop column are appended (in that order) ahead of the response column,
     * which is temporarily removed and re-added last.
     *
     * @throws RuntimeException if any of the columns to append cannot be found in {@code _train}
     */
    private void applyTrainingFrameSideEffects() {
      // Detach the response (the last column) so the extra columns land in front of it
      int numCols = _modelBuilderTrain.numCols();
      String responseVecName = _modelBuilderTrain.names()[numCols - 1];
      Vec responseVec = _modelBuilderTrain.remove(numCols - 1);

      if (_parms.weights_column != null) {
        appendRawTrainColumn(_parms.weights_column, "weightVec");
      }
      if (_parms.start_column != null) {
        appendRawTrainColumn(_parms.start_column, "startVec");
      }
      // The stop column is handled unconditionally
      appendRawTrainColumn(_parms.stop_column, "stopVec");

      _modelBuilderTrain.add(responseVecName, responseVec);
    }

    /**
     * Looks up {@code vec} in {@code _train} and appends it to {@code _modelBuilderTrain} under
     * the name found at the same index in {@code _parms.train()}.
     *
     * @param vec the vector to append
     * @param label identifier used in the failure message
     * @throws RuntimeException if {@code _train.find(vec)} returns a negative index
     */
    private void appendRawTrainColumn(Vec vec, String label) {
      int idxInRawFrame = _train.find(vec);
      if (idxInRawFrame < 0) {
        throw new RuntimeException("CoxPHDriver failed to find " + label);
      }
      String vecName = _parms.train().names()[idxInRawFrame];
      _modelBuilderTrain.add(vecName, vec);
    }
Ejemplo n.º 15
0
    /**
     * Appends every configured offset column to {@code _modelBuilderTrain} ahead of the response
     * column, which is temporarily removed and re-added last. Does nothing when no offset columns
     * are configured.
     *
     * @throws RuntimeException if an offset column cannot be found in {@code _train}
     */
    private void applyScoringFrameSideEffects() {
      if (_parms.offset_columns == null || _parms.offset_columns.length == 0) {
        return;
      }

      // Detach the response (the last column) so the offsets land in front of it
      int lastCol = _modelBuilderTrain.numCols() - 1;
      String responseVecName = _modelBuilderTrain.names()[lastCol];
      Vec responseVec = _modelBuilderTrain.remove(lastCol);

      for (Vec offsetVec : _parms.offset_columns) {
        int rawIdx = _train.find(offsetVec);
        if (rawIdx < 0) {
          throw new RuntimeException("CoxPHDriver failed to find offsetVec");
        }
        _modelBuilderTrain.add(_parms.train().names()[rawIdx], offsetVec);
      }

      _modelBuilderTrain.add(responseVecName, responseVec);
    }
Ejemplo n.º 16
0
  /**
   * GLRM scoring is data imputation based on feature domains using reconstructed XY (see Udell
   * (2015), Section 5.3).
   *
   * <p>Builds a working frame {@code [A, X, P]} (adapted frame, loading frame, zero-filled imputed
   * columns named {@code reconstr_<name>}), runs {@code GLRMScore} over it, and publishes the
   * imputed columns as a new frame.
   *
   * @param orig frame passed on to the model-metrics builder
   * @param adaptedFr frame adapted to the training layout; must have one column per output name
   * @param destination_key key for the output frame; a fresh key is made when {@code null}
   * @param save_imputed forwarded to {@code GLRMScore}
   * @param reverse_transform forwarded to {@code GLRMScore}
   * @return the frame of imputed columns, already put into the DKV
   */
  private Frame reconstruct(
      Frame orig,
      Frame adaptedFr,
      Key destination_key,
      boolean save_imputed,
      boolean reverse_transform) {
    final int ncols = _output._names.length;
    assert ncols == adaptedFr.numCols();
    String prefix = "reconstr_";

    // Need [A,X,P] where A = adaptedFr, X = loading frame, P = imputed frame
    // Note: A is adapted to original training frame, P has columns shuffled so cats come before
    // nums!
    Frame working = new Frame(adaptedFr);
    Frame loadings = DKV.get(_output._representation_key).get();
    working.add(loadings);
    String[][] domains = adaptedFr.domains();
    for (int col = 0; col < ncols; col++) {
      Vec imputed = working.anyVec().makeZero();
      imputed.setDomain(domains[col]);
      working.add(prefix + _output._names[col], imputed);
    }
    GLRMScore scorer =
        new GLRMScore(ncols, _parms._k, save_imputed, reverse_transform).doAll(working);

    // Return the imputed training frame: the columns that follow A and X
    int firstImputed = ncols + _parms._k;
    int pastLastImputed = working.numCols();
    // this will call vec_impl() and we cannot call the delete() below just yet
    Frame extracted = working.extractFrame(firstImputed, pastLastImputed);

    Frame published =
        new Frame(
            (null == destination_key ? Key.make() : destination_key),
            extracted.names(),
            extracted.vecs());
    DKV.put(published);
    // save error metrics based on imputed data
    scorer._mb.makeModelMetrics(GLRMModel.this, orig, null, null);
    return published;
  }
Ejemplo n.º 17
0
  /**
   * Randomly assigns every row of a frame to one of {@code ratios.length} output frames.
   *
   * <p>For each row a uniform random number is drawn and compared against the cumulative ratios;
   * the row is copied to the first split whose cumulative ratio exceeds the draw, or to the last
   * split otherwise. The caller's {@code ratios} array is left unmodified.
   *
   * @param fr frame to split
   * @param keys one destination key per split; must have the same length as {@code ratios}
   * @param ratios fraction of rows per split; asserted to sum to 1.0
   * @param seed seed for the per-chunk random generators
   * @return the split frames, in the order of {@code ratios}, each already put into the DKV
   * @throws IllegalArgumentException (from the worker) if a column has an unsupported vector type
   */
  public static Frame[] shuffleSplitFrame(
      Frame fr, Key[] keys, final double ratios[], final long seed) {
    // Sanity check the ratios
    assert keys.length == ratios.length;
    // Build the cumulative thresholds in a private copy so the caller's array is not clobbered
    final double[] cumRatios = ratios.clone();
    double sum = cumRatios[0];
    for (int i = 1; i < cumRatios.length; i++) {
      sum += ratios[i];
      cumRatios[i] = sum;
    }
    assert water.util.MathUtils.equalsWithinOneSmallUlp(sum, 1.0);

    // Do the split, into ratios.length groupings of NewChunks
    final int ncols = fr.numCols();
    MRTask mr =
        new MRTask() {
          @Override
          public void map(Chunk cs[], NewChunk ncs[]) {
            // NOTE(review): seed * cidx is 0 for chunk 0 whatever the seed, so the first chunk is
            // always split identically; left unchanged to keep existing splits reproducible.
            Random rng = new Random(seed * cs[0].cidx());
            int nrows = cs[0]._len;
            for (int i = 0; i < nrows; i++) {
              double r = rng.nextDouble();
              int x = 0; // Pick the NewChunk split
              for (; x < cumRatios.length - 1; x++) if (r < cumRatios[x]) break;
              x *= ncols; // offset of the chosen split's first output column
              // Copy row to correct set of NewChunks
              for (int j = 0; j < ncols; j++) {
                byte colType = cs[j].vec().get_type();
                switch (colType) {
                  case Vec.T_BAD:
                    break; /* NOP */
                  case Vec.T_STR:
                    ncs[x + j].addStr(cs[j], i);
                    break;
                  case Vec.T_UUID:
                    ncs[x + j].addUUID(cs[j], i);
                    break;
                  case Vec.T_NUM: /* fallthrough */
                  case Vec.T_ENUM:
                  case Vec.T_TIME:
                    ncs[x + j].addNum(cs[j].atd(i));
                    break;
                  default:
                    // Remaining time sub-types are copied as numbers; anything else is rejected
                    if (colType > Vec.T_TIME && colType <= Vec.T_TIMELAST)
                      ncs[x + j].addNum(cs[j].atd(i));
                    else throw new IllegalArgumentException("Unsupported vector type: " + colType);
                    break;
                }
              }
            }
          }
        }.doAll(ncols * cumRatios.length, fr);

    // Build output frames
    Frame frames[] = new Frame[cumRatios.length];
    Vec[] vecs = fr.vecs();
    String[] names = fr.names();
    Futures fs = new Futures();
    for (int i = 0; i < cumRatios.length; i++) {
      Vec[] nvecs = new Vec[ncols];
      for (int c = 0; c < ncols; c++) {
        // Each output column inherits the domain of its source column
        mr.appendables()[i * ncols + c].setDomain(vecs[c].domain());
        nvecs[c] = mr.appendables()[i * ncols + c].close(fs);
      }
      frames[i] = new Frame(keys[i], names, nvecs);
      DKV.put(frames[i], fs);
    }
    fs.blockForPending();
    return frames;
  }
Ejemplo n.º 18
0
  /**
   * Slices rows out of a frame. The row selector {@code asts[2]} may be a number list (explicit
   * list or range), a single number, or an expression/identifier evaluating to a one-column
   * frame that is handed to {@code deepSlice} as the predicate.
   *
   * @param env execution environment used to evaluate the arguments
   * @param stk stack helper used to track evaluated frames
   * @param asts argument list; {@code asts[1]} is the frame, {@code asts[2]} the row selector
   * @return a frame value holding the selected rows
   * @throws IllegalArgumentException if an explicit inclusion list ends past the row count, if the
   *     predicate frame does not have exactly one column, or if the selector type is unsupported
   */
  @Override
  Val apply(Env env, Env.StackHelp stk, AST asts[]) {
    Frame fr = stk.track(asts[1].exec(env)).getFrame();
    Frame returningFrame;
    long nrows = fr.numRows();
    if (asts[2] instanceof ASTNumList) {
      final ASTNumList nums = (ASTNumList) asts[2];
      // Only explicit lists are expanded; otherwise rows stays null and nums is queried directly
      long[] rows = nums._isList ? nums.expand8Sort() : null;
      if (rows != null) {
        if (rows.length == 0) { // Empty inclusion list?
        } else if (rows[0] >= 0) { // Positive (inclusion) list
          // NOTE(review): the message says the largest valid row is nrows - 1, but this check
          // still admits a last entry equal to nrows -- confirm whether '>=' was intended.
          if (rows[rows.length - 1] > nrows)
            throw new IllegalArgumentException("Row must be an integer from 0 to " + (nrows - 1));
        } else { // Negative (exclusion) list
          // Invert the list to make a positive list, ignoring out-of-bounds values
          BitSet bs = new BitSet((int) nrows);
          for (int i = 0; i < rows.length; i++) {
            int idx = (int) (-rows[i] - 1); // The positive index
            if (idx >= 0 && idx < nrows) bs.set(idx); // Mark this row to EXCLUDE
          }
          // The surviving rows are exactly the clear bits
          rows = new long[(int) nrows - bs.cardinality()];
          for (int i = bs.nextClearBit(0), j = 0; i < nrows; i = bs.nextClearBit(i + 1))
            rows[j++] = i;
        }
      }
      final long[] ls = rows;

      returningFrame =
          new MRTask() {
            @Override
            public void map(Chunk[] cs, NewChunk[] ncs) {
              if (nums.cnt() == 0) return;
              long start = cs[0].start();
              long end = start + cs[0]._len;
              long min = ls == null ? (long) nums.min() : ls[0],
                  max =
                      ls == null
                          ? (long) nums.max() - 1
                          : ls[ls.length - 1]; // exclusive max to inclusive max when stride == 1
              // How the selection [min, max] can sit relative to the chunk [start, end]:
              //   1. entirely left of the chunk:   max < start
              //   2. entirely right of the chunk:  min > end
              //   3. overlaps the left edge:       min < start && max <= end
              //   4. inside the chunk:             start <= min && max <= end
              //   5. overlaps the right edge:      start <= min && end < max
              if (!(max < start || min > end)) { // not situation 1 or 2 above
                long startOffset = (min > start ? min : start); // situation 4 and 5 => min > start;
                for (int i = (int) (startOffset - start); i < cs[0]._len; ++i) {
                  // A row is kept if the range contains it, or the explicit list does
                  if ((ls == null && nums.has(start + i))
                      || (ls != null && Arrays.binarySearch(ls, start + i) >= 0)) {
                    for (int c = 0; c < cs.length; ++c) {
                      if (cs[c] instanceof CStrChunk) ncs[c].addStr(cs[c], i);
                      else if (cs[c] instanceof C16Chunk) ncs[c].addUUID(cs[c], i);
                      else if (cs[c].isNA(i)) ncs[c].addNA();
                      else ncs[c].addNum(cs[c].atd(i));
                    }
                  }
                }
              }
            }
          }.doAll(fr.types(), fr).outputFrame(fr.names(), fr.domains());
    } else if ((asts[2] instanceof ASTNum)) {
      // Single row index
      long[] rows = new long[] {(long) (((ASTNum) asts[2])._v.getNum())};
      returningFrame = fr.deepSlice(rows, null);
    } else if ((asts[2] instanceof ASTExec) || (asts[2] instanceof ASTId)) {
      // Selector is itself an expression: evaluate it and use the resulting column as predicate
      Frame predVec = stk.track(asts[2].exec(env)).getFrame();
      if (predVec.numCols() != 1)
        throw new IllegalArgumentException(
            "Conditional Row Slicing Expression evaluated to "
                + predVec.numCols()
                + " columns.  Must be a boolean Vec.");
      returningFrame = fr.deepSlice(predVec, null);
    } else
      throw new IllegalArgumentException(
          "Row slicing requires a number-list as the last argument, but found a "
              + asts[2].getClass());
    return new ValFrame(returningFrame);
  }
Ejemplo n.º 19
0
  /**
   * Row-binds all arguments into a single frame.
   *
   * <p>Each argument is either a frame or a scalar; a scalar is expanded into a one-row frame with
   * the canonical frame's column names. All frames must agree on column count, names and types.
   * Categorical domains may differ and are merged into a union, with a per-input mapping from old
   * to new level indices. The actual copying is delegated to {@code ParallelRbinds}.
   *
   * @param env execution environment used to evaluate the arguments
   * @param stk stack helper used to track evaluated values and temporary frames
   * @param asts argument list; entries from index 1 on are the values to bind
   * @return a frame value holding the bound rows
   * @throws IllegalArgumentException if the inputs disagree on column count, names or types
   */
  @Override
  Val apply(Env env, Env.StackHelp stk, AST asts[]) {

    // Execute all args.  Find a canonical frame; all Frames must look like this one.
    // Each argument turns into either a Frame (whose rows are entirely
    // inlined) or a scalar (which is replicated across as a single row).
    Frame fr = null; // Canonical Frame; all frames have the same column count, types and names
    int nchks = 0; // Total chunks
    Val vals[] = new Val[asts.length]; // Computed AST results
    for (int i = 1; i < asts.length; i++) {
      vals[i] = stk.track(asts[i].exec(env));
      if (vals[i].isFrame()) {
        fr = vals[i].getFrame(); // the last frame argument ends up as the canonical one
        nchks += fr.anyVec().nChunks(); // Total chunks
      } else nchks++; // One chunk per scalar
    }
    // No Frame, just a pile-o-scalars?
    Vec zz = null; // The zero-length vec for the zero-frame frame
    if (fr == null) { // Zero-length, 1-column, default name
      fr = new Frame(new String[] {Frame.defaultColName(0)}, new Vec[] {zz = Vec.makeZero(0)});
      if (asts.length == 1) return new ValFrame(fr);
    }

    // Verify all Frames are the same columns, names, and types.  Domains can vary, and will be the
    // union
    final Frame frs[] = new Frame[asts.length]; // Input frame
    final byte[] types = fr.types(); // Column types
    final int ncols = fr.numCols();
    final long[] espc = new long[nchks + 1]; // Compute a new layout!
    int coffset = 0; // Index into espc of the first chunk of the current input

    for (int i = 1; i < asts.length; i++) {
      Val val = vals[i]; // Save values computed for pass 2
      Frame fr0 =
          val.isFrame()
              ? val.getFrame()
              // Scalar: auto-expand into a 1-row frame
              : stk.track(new Frame(fr._names, Vec.makeCons(val.getNum(), 1L, fr.numCols())));

      // Check that all frames are compatible
      if (fr.numCols() != fr0.numCols())
        throw new IllegalArgumentException(
            "rbind frames must have all the same columns, found "
                + fr.numCols()
                + " and "
                + fr0.numCols()
                + " columns.");
      if (!Arrays.deepEquals(fr._names, fr0._names))
        throw new IllegalArgumentException(
            "rbind frames must have all the same column names, found "
                + Arrays.toString(fr._names)
                + " and "
                + Arrays.toString(fr0._names));
      if (!Arrays.equals(types, fr0.types()))
        throw new IllegalArgumentException(
            "rbind frames must have all the same column types, found "
                + Arrays.toString(types)
                + " and "
                + Arrays.toString(fr0.types()));

      frs[i] = fr0; // Save frame

      // Roll up the ESPC row counts: shift this input's chunk boundaries by the rows seen so far
      long roffset = espc[coffset];
      long[] espc2 = fr0.anyVec().espc();
      for (int j = 1; j < espc2.length; j++) // Roll up the row counts
      espc[coffset + j] = (roffset + espc2[j]);
      coffset += espc2.length - 1; // Chunk offset
    }
    // The placeholder vec was only needed to define the canonical frame
    if (zz != null) zz.remove();

    // build up the new domains for each vec
    HashMap<String, Integer>[] dmap = new HashMap[types.length];
    String[][] domains = new String[types.length][];
    int[][][] cmaps = new int[types.length][][]; // cmaps[col][input][oldLevel] -> new level
    for (int k = 0; k < types.length; ++k) {
      dmap[k] = new HashMap<>();
      int c = 0; // Number of distinct levels seen so far for this column
      byte t = types[k];
      if (t == Vec.T_CAT) {
        int[][] maps = new int[frs.length][];
        for (int i = 1; i < frs.length; i++) {
          maps[i] = new int[frs[i].vec(k).domain().length];
          for (int j = 0; j < maps[i].length; j++) {
            String s = frs[i].vec(k).domain()[j];
            // First sighting of a level assigns it the next index; later sightings reuse it
            if (!dmap[k].containsKey(s)) dmap[k].put(s, maps[i][j] = c++);
            else maps[i][j] = dmap[k].get(s);
          }
        }
        cmaps[k] = maps;
      } else {
        cmaps[k] = new int[frs.length][];
      }
      // Non-categorical columns (and categoricals with no levels) get a null domain
      domains[k] = c == 0 ? null : new String[c];
      for (Map.Entry<String, Integer> e : dmap[k].entrySet()) domains[k][e.getValue()] = e.getKey();
    }

    // Now make Keys for the new Vecs
    Key<Vec>[] keys = fr.anyVec().group().addVecs(fr.numCols());
    Vec[] vecs = new Vec[fr.numCols()];
    int rowLayout = Vec.ESPC.rowLayout(keys[0], espc);
    for (int i = 0; i < vecs.length; i++)
      vecs[i] = new Vec(keys[i], rowLayout, domains[i], types[i]);

    // Do the row-binds column-by-column.
    // Switch to F/J thread for continuations
    ParallelRbinds t;
    H2O.submitTask(t = new ParallelRbinds(frs, espc, vecs, cmaps)).join();
    return new ValFrame(new Frame(fr.names(), t._vecs));
  }
Ejemplo n.º 20
0
  /**
   * Internal stratified-sampling worker that carries a retry counter.
   *
   * <p>Every row with a non-missing label is emitted a number of times derived from the sampling
   * ratio of its class: the integer part of the ratio, plus one more copy with probability equal
   * to the fractional part. If the resulting frame misses at least one class, the sample is
   * discarded and the whole procedure is retried with {@code seed + 1}. Retries stop once {@code
   * count} reaches 10, which means a full class coverage can be impossible for certain sampling
   * ratios.
   *
   * @param fr frame to sample from; {@code null} yields {@code null}
   * @param label label column; asserted to be an enum column that is part of {@code fr}
   * @param sampling_ratios one ratio per label class (asserted to match the label domain length)
   * @param seed base seed; each chunk uses {@code seed + chunk index}
   * @param debug when true, logs the class distribution of the sampled frame
   * @param count number of attempts made so far
   * @return {@code fr} itself when no class distribution could be computed for the sample,
   *     otherwise a per-chunk shuffled frame holding the sampled rows
   */
  private static Frame sampleFrameStratified(
      final Frame fr,
      Vec label,
      final float[] sampling_ratios,
      final long seed,
      final boolean debug,
      int count) {
    if (fr == null) return null;
    assert (label.isEnum());
    assert (sampling_ratios != null && sampling_ratios.length == label.domain().length);
    final int labelidx = fr.find(label); // which column is the label?
    assert (labelidx >= 0);

    final boolean poisson = false; // beta feature

    Frame r =
        new MRTask2() {
          @Override
          public void map(Chunk[] cs, NewChunk[] ncs) {
            // One RNG per chunk, seeded from the chunk index so the result is reproducible.
            final Random rng = getDeterRNG(seed + cs[0].cidx());
            for (int r = 0; r < cs[0]._len; r++) {
              if (cs[labelidx].isNA0(r)) continue; // skip missing labels
              final int label = (int) cs[labelidx].at80(r);
              assert (sampling_ratios.length > label && label >= 0);
              int sampling_reps;
              if (poisson) {
                sampling_reps = Utils.getPoisson(sampling_ratios[label], rng);
              } else {
                // Integer part is the guaranteed number of copies; the fractional part is the
                // probability of one additional copy.
                final float remainder = sampling_ratios[label] - (int) sampling_ratios[label];
                sampling_reps =
                    (int) sampling_ratios[label] + (rng.nextFloat() < remainder ? 1 : 0);
              }
              for (int i = 0; i < ncs.length; i++) {
                for (int j = 0; j < sampling_reps; ++j) {
                  ncs[i].addNum(cs[i].at0(r));
                }
              }
            }
          }
        }.doAll(fr.numCols(), fr).outputFrame(fr.names(), fr.domains());

    // Confirm the validity of the distribution
    long[] dist = new ClassDist(r.vecs()[labelidx]).doAll(r.vecs()[labelidx]).dist();

    // if there are no training labels in the test set, then there is no point in sampling the test
    // set
    if (dist == null) {
      // The sampled frame is not handed to the caller on this path, so release it here just like
      // every other exit path does; otherwise it would be leaked.
      r.delete();
      return fr;
    }

    if (debug) {
      long sumdist = Utils.sum(dist);
      Log.info("After stratified sampling: " + sumdist + " rows.");
      for (int i = 0; i < dist.length; ++i) {
        Log.info(
            "Class "
                + r.vecs()[labelidx].domain(i)
                + ": count: "
                + dist[i]
                + " sampling ratio: "
                + sampling_ratios[i]
                + " actual relative frequency: "
                + (float) dist[i] / sumdist * dist.length);
      }
    }

    // Re-try if we didn't get at least one example from each class
    if (Utils.minValue(dist) == 0 && count < 10) {
      Log.info(
          "Re-doing stratified sampling because not all classes were represented (unlucky draw).");
      r.delete();
      return sampleFrameStratified(fr, label, sampling_ratios, seed + 1, debug, count + 1);
    }

    // shuffle intra-chunk
    Frame shuffled = shuffleFramePerChunk(r, seed + 0x580FF13);
    r.delete();

    return shuffled;
  }
Ejemplo n.º 21
0
  /**
   * Scores a frame with a model and returns the metrics together with a reference to the
   * resulting predictions frame.
   *
   * <p>When none of the special scoring flags on the schema is set, the model's regular {@code
   * score} is used. Otherwise the request is dispatched by the capability interface the model
   * implements (deep features / auto-encoder reconstruction error, GLRM archetypes or
   * reconstruction, leaf node assignment). If no destination name was supplied, one is generated
   * from a short random key prefix plus the model and frame keys.
   *
   * @param version schema version, passed through to {@code fetch}
   * @param s request schema carrying the model key, the frame key and the scoring flags
   * @return the metrics schema with {@code predictions_frame} filled in
   */
  @SuppressWarnings("unused") // called through reflection by RequestServer
  public ModelMetricsListSchemaV3 predict(int version, ModelMetricsListSchemaV3 s) {
    // Both keys must be supplied and must resolve to something in the DKV.
    if (s.model == null) throw new H2OIllegalArgumentException("model", "predict", s.model);
    if (DKV.get(s.model.name) == null)
      throw new H2OKeyNotFoundArgumentException("model", "predict", s.model.name);

    if (s.frame == null) throw new H2OIllegalArgumentException("frame", "predict", s.frame);
    if (DKV.get(s.frame.name) == null)
      throw new H2OKeyNotFoundArgumentException("frame", "predict", s.frame.name);

    ModelMetricsList parms = s.createAndFillImpl();

    final boolean wantsReconError = s.reconstruction_error || s.reconstruction_error_per_feature;
    final boolean specialScoring =
        wantsReconError
            || s.deep_features_hidden_layer >= 0
            || s.project_archetypes
            || s.reconstruct_train
            || s.leaf_node_assignment;

    Frame preds;
    if (!specialScoring) {
      // Plain scoring.
      if (parms._predictions_name == null)
        parms._predictions_name = defaultPredictionsName("predictions", "_on_", parms);
      preds = parms._model.score(parms._frame, parms._predictions_name);
    } else {
      final Class<?> modelClass = parms._model.getClass();
      if (Model.DeepFeatures.class.isAssignableFrom(modelClass)) {
        if (wantsReconError) {
          if (s.deep_features_hidden_layer >= 0)
            throw new H2OIllegalArgumentException(
                "Can only compute either reconstruction error OR deep features.", "");
          if (parms._predictions_name == null)
            parms._predictions_name = defaultPredictionsName("reconstruction_error", "_on_", parms);
          preds =
              ((Model.DeepFeatures) parms._model)
                  .scoreAutoEncoder(
                      parms._frame,
                      Key.make(parms._predictions_name),
                      parms._reconstruction_error_per_feature);
        } else {
          if (s.deep_features_hidden_layer < 0)
            throw new H2OIllegalArgumentException(
                "Deep features hidden layer index must be >= 0.", "");
          if (parms._predictions_name == null)
            parms._predictions_name = defaultPredictionsName("deep_features", "_on_", parms);
          preds =
              ((Model.DeepFeatures) parms._model)
                  .scoreDeepFeatures(parms._frame, s.deep_features_hidden_layer);
        }
        // Re-wrap the result under the requested name and publish it.
        preds = new Frame(Key.make(parms._predictions_name), preds.names(), preds.vecs());
        DKV.put(preds._key, preds);
      } else if (Model.GLRMArchetypes.class.isAssignableFrom(modelClass)) {
        if (s.project_archetypes) {
          if (parms._predictions_name == null)
            parms._predictions_name =
                defaultPredictionsName("reconstructed_archetypes_", "_of_", parms);
          preds =
              ((Model.GLRMArchetypes) parms._model)
                  .scoreArchetypes(
                      parms._frame, Key.make(parms._predictions_name), s.reverse_transform);
        } else {
          assert s.reconstruct_train;
          if (parms._predictions_name == null)
            parms._predictions_name = defaultPredictionsName("reconstruction_", "_of_", parms);
          preds =
              ((Model.GLRMArchetypes) parms._model)
                  .scoreReconstruction(
                      parms._frame, Key.make(parms._predictions_name), s.reverse_transform);
        }
      } else if (Model.LeafNodeAssignment.class.isAssignableFrom(modelClass)) {
        assert (s.leaf_node_assignment);
        if (parms._predictions_name == null)
          parms._predictions_name = defaultPredictionsName("leaf_node_assignement", "_on_", parms);
        preds =
            ((Model.LeafNodeAssignment) parms._model)
                .scoreLeafNodeAssignment(parms._frame, Key.make(parms._predictions_name));
      } else {
        throw new H2OIllegalArgumentException(
            "Requires a Deep Learning, GLRM, DRF or GBM model.",
            "Model must implement specific methods.");
      }
    }

    ModelMetricsListSchemaV3 result = this.fetch(version, s);

    // TODO: for now only binary predictors write an MM object.
    // For the others cons one up here to return the predictions frame.
    if (result == null) result = new ModelMetricsListSchemaV3();

    result.predictions_frame = new KeyV3.FrameKeyV3(preds._key);

    // Don't show metrics if leaf node assignments are made.
    if (parms._leaf_node_assignment) {
      result.model_metrics = null;
    }

    // Without any metrics (e.g. no response in the test set) there is nothing to attach the
    // predictions preview to.
    if (result.model_metrics != null && result.model_metrics.length != 0) {
      result.model_metrics[0].predictions =
          new FrameV3(preds, 0, 100); // TODO: Should call schema(version)
    }
    return result;
  }

  /**
   * Builds the default destination name: {@code prefix}, the first five characters of a freshly
   * made key, {@code "_"}, the model key, {@code joiner}, and the frame key.
   */
  private static String defaultPredictionsName(
      String prefix, String joiner, ModelMetricsList parms) {
    return prefix
        + Key.make().toString().substring(0, 5)
        + "_"
        + parms._model._key.toString()
        + joiner
        + parms._frame._key.toString();
  }
Ejemplo n.º 22
0
 /**
  * Builds a new DataInfo restricted to the given expanded column indices.
  *
  * <p>Entries of {@code cols} below the last value of {@code _catOffsets} are handled as
  * categorical level columns; the remaining entries are handled as numeric columns relative to
  * {@code numStart()}. The method walks {@code cols} front to back with a single cursor and later
  * runs a binary search on it, so it appears to require {@code cols} to be sorted ascending -
  * NOTE(review): confirm against callers.
  *
  * @param cols expanded column indices to keep; when {@code null} this instance is returned as is
  * @return a DataInfo built on a new frame (cloned name/vec arrays of the adapted frame) from
  *     which the unused columns were removed, with {@code _activeCols} set to {@code cols}
  */
 public DataInfo filterExpandedColumns(int[] cols) {
   assert _predictor_transform != null;
   assert _response_transform != null;
   if (cols == null) return this;
   // i: cursor into cols (shared by the categorical and numeric passes)
   // j: index of the categorical being processed; reset and reused in the numeric pass
   // ignoredCnt: number of valid entries in ignoredCols
   int i = 0, j = 0, ignoredCnt = 0;
   // NOTE(review): the signature quoted below does not match the constructor call at the end of
   // this method; it looks like a leftover from an earlier version.
   // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub,
   // double [] normMul, double [] normRespSub, double [] normRespMul){
   int[][] catLvls = new int[_cats][];
   int[] ignoredCols = MemoryManager.malloc4(_nums + _cats);
   // first do categoricals...
   if (_catOffsets != null) {
     // Kept level ids are shifted by one when not all factor levels are used.
     int coff = _useAllFactorLevels ? 0 : 1;
     while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) {
       // Scratch buffer sized for every expanded column of categorical j.
       int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]);
       int k = 0;
       // Consume all kept columns that lie below the start offset of the next categorical.
       while (i < cols.length && cols[i] < _catOffsets[j + 1])
         levels[k++] = cols[i++] - _catOffsets[j] + coff;
       // A categorical without any kept level stays null and is dropped below.
       if (k > 0) catLvls[j] = Arrays.copyOf(levels, k);
       ++j;
     }
   }
   // Categoricals with no kept level are ignored entirely (recorded by their index k).
   for (int k = 0; k < catLvls.length; ++k) if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
   if (ignoredCnt > 0) {
     // Compact catLvls so that it only holds the categoricals that survived.
     int[][] c = new int[_cats - ignoredCnt][];
     int y = 0;
     for (int[] catLvl : catLvls) if (catLvl != null) c[y++] = catLvl;
     assert y == c.length;
     catLvls = c;
   }
   // now numerics
   int prev = j = 0;
   for (; i < cols.length; ++i) {
     // Every numeric between the previously kept one and cols[i] is ignored; it is recorded as
     // k + _cats, i.e. offset by the number of categoricals.
     for (int k = prev; k < (cols[i] - numStart()); ++k) {
       ignoredCols[ignoredCnt++] = k + _cats;
       ++j;
     }
     // Step over the kept numeric itself.
     prev = ++j;
   }
   // Numerics after the last kept one are ignored as well.
   for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats;
   // New frame built from cloned name/vec arrays, with the ignored columns removed.
   Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone());
   if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt));
   assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();
   double[] normSub = null;
   double[] normMul = null;
   // id: position in cols of the first numeric column; a negative binary-search result encodes
   // the insertion point as -(insertionPoint) - 1.
   int id = Arrays.binarySearch(cols, numStart());
   if (id < 0) id = -id - 1;
   int nnums = cols.length - id;
   int off = numStart();
   // Carry over the normalization entries of the kept numerics only.
   if (_normSub != null) {
     normSub = new double[nnums];
     for (int k = id; k < cols.length; ++k) normSub[k - id] = _normSub[cols[k] - off];
   }
   if (_normMul != null) {
     normMul = new double[nnums];
     for (int k = id; k < cols.length; ++k) normMul[k - id] = _normMul[cols[k] - off];
   }
   DataInfo dinfo =
       new DataInfo(
           _key,
           f,
           normMul,
           normSub,
           catLvls,
           _responses,
           _predictor_transform,
           _response_transform,
           _skipMissing,
           _imputeMissing,
           _weights,
           _offset,
           _fold);
   // do not put activeData into K/V - active data is recreated on each node based on active
   // columns
   dinfo._activeCols = cols;
   return dinfo;
 }
Ejemplo n.º 23
0
 /**
  * Builds a new DataInfo restricted to the given expanded column indices.
  *
  * <p>Entries of {@code cols} below the last value of {@code _catOffsets} are handled as
  * categorical level columns; the remaining entries are handled as numeric columns relative to
  * {@code numStart()}. A trailing entry equal to {@code fullN()} is detected separately and is
  * excluded from the normalization arrays (presumably it denotes an intercept column - TODO
  * confirm). The method walks {@code cols} front to back with a single cursor and later runs a
  * binary search on it, so it appears to require {@code cols} to be sorted ascending -
  * NOTE(review): confirm against callers.
  *
  * @param cols expanded column indices to keep; when {@code null} the result of
  *     {@code deep_clone()} is returned
  * @return a DataInfo built on a new frame (cloned name/vec arrays of the adapted frame) from
  *     which the unused columns were removed, with {@code _activeCols} set to {@code cols}
  */
 public DataInfo filterExpandedColumns(int[] cols) {
   assert _predictor_transform != null;
   assert _response_transform != null;
   if (cols == null) return deep_clone();
   // 1 when the last requested column equals fullN(), 0 otherwise.
   int hasIcpt = (cols.length > 0 && cols[cols.length - 1] == fullN()) ? 1 : 0;
   // i: cursor into cols (shared by the categorical and numeric passes)
   // j: index of the categorical being processed; reset and reused in the numeric pass
   // ignoredCnt: number of valid entries in ignoredCols
   int i = 0, j = 0, ignoredCnt = 0;
   // NOTE(review): the signature quoted below does not match the constructor call at the end of
   // this method; it looks like a leftover from an earlier version.
   // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub,
   // double [] normMul, double [] normRespSub, double [] normRespMul){
   int[][] catLvls = new int[_cats][];
   int[] ignoredCols = MemoryManager.malloc4(_nums + _cats);
   // first do categoricals...
   if (_catOffsets != null) {
     // Kept level ids are shifted by one when not all factor levels are used.
     int coff = _useAllFactorLevels ? 0 : 1;
     while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) {
       // Scratch buffer sized for every expanded column of categorical j.
       int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]);
       int k = 0;
       // Consume all kept columns that lie below the start offset of the next categorical.
       while (i < cols.length && cols[i] < _catOffsets[j + 1])
         levels[k++] = (cols[i++] - _catOffsets[j]) + coff;
       // A categorical without any kept level stays null and is dropped below.
       if (k > 0) catLvls[j] = Arrays.copyOf(levels, k);
       ++j;
     }
   }
   int[] catModes = _catModes;
   // Categoricals with no kept level are ignored entirely (recorded by their index k).
   for (int k = 0; k < catLvls.length; ++k) if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
   if (ignoredCnt > 0) {
     // Compact catLvls and, in parallel, catModes so both only hold the surviving categoricals.
     int[][] cs = new int[_cats - ignoredCnt][];
     catModes = new int[_cats - ignoredCnt];
     int y = 0;
     for (int c = 0; c < catLvls.length; ++c)
       if (catLvls[c] != null) {
         catModes[y] = _catModes[c];
         cs[y++] = catLvls[c];
       }
     assert y == cs.length;
     catLvls = cs;
   }
   // now numerics
   int prev = j = 0;
   for (; i < cols.length; ++i) {
     // Every numeric between the previously kept one and cols[i] is ignored; it is recorded as
     // k + _cats, i.e. offset by the number of categoricals.
     for (int k = prev; k < (cols[i] - numStart()); ++k) {
       ignoredCols[ignoredCnt++] = k + _cats;
       ++j;
     }
     // Step over the kept numeric itself.
     prev = ++j;
   }
   // Numerics after the last kept one are ignored as well.
   for (int k = prev; k < _nums; ++k) ignoredCols[ignoredCnt++] = k + _cats;
   // New frame built from cloned name/vec arrays, with the ignored columns removed.
   Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone());
   if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt));
   assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();
   double[] normSub = null;
   double[] normMul = null;
   // id: position in cols of the first numeric column; a negative binary-search result encodes
   // the insertion point as -(insertionPoint) - 1.
   int id = Arrays.binarySearch(cols, numStart());
   if (id < 0) id = -id - 1;
   // Number of kept numerics, not counting the trailing fullN() entry when present.
   int nnums = cols.length - id - hasIcpt;
   int off = numStart();
   // Carry over the normalization entries of the kept numerics only.
   if (_normSub != null) {
     normSub = new double[nnums];
     for (int k = id; k < (id + nnums); ++k) normSub[k - id] = _normSub[cols[k] - off];
   }
   if (_normMul != null) {
     normMul = new double[nnums];
     for (int k = id; k < (id + nnums); ++k) normMul[k - id] = _normMul[cols[k] - off];
   }
   // NOTE(review): the signature quoted below does not match the six-argument constructor call
   // that follows; it looks like a leftover reference.
   // public DataInfo(Frame train, Frame valid, int nResponses, boolean useAllFactorLevels,
   // TransformType predictor_transform, TransformType response_transform, boolean skipMissing,
   // boolean imputeMissing, boolean missingBucket, boolean weight, boolean offset, boolean fold) {
   DataInfo dinfo = new DataInfo(this, f, normMul, normSub, catLvls, catModes);
   dinfo._activeCols = cols;
   return dinfo;
 }