Пример #1
0
  @Override
  Val apply(Env env, Env.StackHelp stk, AST asts[]) {
    Val v = stk.track(asts[1].exec(env));
    if (v instanceof ValRow) {
      ValRow vv = (ValRow) v;
      return vv.slice(asts[2].columns(vv._names));
    }
    Frame fr = v.getFrame();
    int[] cols = asts[2].columns(fr.names());

    Frame fr2 = new Frame();
    if (cols.length == 0) { // Empty inclusion list?
    } else if (cols[0] >= 0) { // Positive (inclusion) list
      if (cols[cols.length - 1] > fr.numCols())
        throw new IllegalArgumentException(
            "Column must be an integer from 0 to " + (fr.numCols() - 1));
      for (int col : cols) fr2.add(fr.names()[col], fr.vecs()[col]);
    } else { // Negative (exclusion) list
      fr2 = new Frame(fr); // All of them at first
      Arrays.sort(cols); // This loop depends on the values in sorted order
      for (int col : cols)
        if (0 <= -col - 1 && -col - 1 < fr.numCols()) fr2.remove(-col - 1); // Remove named column
    }

    return new ValFrame(fr2);
  }
Пример #2
0
  @Override
  public ValFrame apply(Env env, Env.StackHelp stk, AstRoot asts[]) {
    Frame f = stk.track(asts[1].exec(env)).getFrame();
    AstRoot axisAR = asts[2];
    for (Vec v : f.vecs()) {
      if (v.isCategorical() || v.isString() || v.isUUID())
        throw new IllegalArgumentException(
            "Cumulative functions not applicable to enum, string, or UUID values");
    }
    double axis = axisAR.exec(env).getNum();
    if (axis != 1.0 && axis != 0.0) throw new IllegalArgumentException("Axis must be 0 or 1");
    if (f.numCols() == 1) {
      if (axis == 0.0) {
        AstCumu.CumuTask t = new AstCumu.CumuTask(f.anyVec().nChunks(), init());
        t.doAll(new byte[] {Vec.T_NUM}, f.anyVec());
        final double[] chkCumu = t._chkCumu;
        Vec cumuVec = t.outputFrame().anyVec();
        new MRTask() {
          @Override
          public void map(Chunk c) {
            if (c.cidx() != 0) {
              double d = chkCumu[c.cidx() - 1];
              for (int i = 0; i < c._len; ++i) c.set(i, op(c.atd(i), d));
            }
          }
        }.doAll(cumuVec);
        return new ValFrame(new Frame(cumuVec));
      } else {
        return new ValFrame(new Frame(f));
      }
    } else {

      if (axis == 0.0) { // down the column implementation

        AstCumu.CumuTaskWholeFrame t =
            new AstCumu.CumuTaskWholeFrame(f.anyVec().nChunks(), init(), f.numCols());
        Frame fr2 = t.doAll(f.numCols(), Vec.T_NUM, f).outputFrame(null, f.names(), null);
        final double[][] chkCumu = t._chkCumu;
        new MRTask() {
          @Override
          public void map(Chunk cs[]) {
            if (cs[0].cidx() != 0) {
              for (int i = 0; i < cs.length; i++) {
                double d = chkCumu[i][cs[i].cidx() - 1];
                for (int j = 0; j < cs[i]._len; ++j) cs[i].set(j, op(cs[i].atd(j), d));
              }
            }
          }
        }.doAll(fr2);
        return new ValFrame(new Frame(fr2));

      } else {
        AstCumu.CumuTaskAxis1 t = new AstCumu.CumuTaskAxis1(init());
        Frame fr2 = t.doAll(f.numCols(), Vec.T_NUM, f).outputFrame(null, f.names(), null);
        return new ValFrame(new Frame(fr2));
      }
    }
  }
Пример #3
0
 @Test
 public void testDomains() {
   Frame frame = parse_test_file("smalldata/junit/weather.csv");
   for (String s : new String[] {"MaxWindSpeed", "RelHumid9am", "Cloud9am"}) {
     Vec v = frame.vec(s);
     Vec newV = v.toCategoricalVec();
     frame.remove(s);
     frame.add(s, newV);
     v.remove();
   }
   DKV.put(frame);
   AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
   parms._train = frame._key;
   parms._radius_scale = 10;
   AggregatorModel agg = new Aggregator(parms).trainModel().get();
   Frame output = agg._output._output_frame.get();
   Assert.assertTrue(output.numRows() < 0.5 * frame.numRows());
   boolean same = true;
   for (int i = 0; i < frame.numCols(); ++i) {
     if (frame.vec(i).isCategorical()) {
       same = (frame.domains()[i].length == output.domains()[i].length);
       if (!same) break;
     }
   }
   frame.remove();
   output.remove();
   agg.remove();
   Assert.assertFalse(same);
 }
Пример #4
0
 private static void assertRowFrameEquals(double[] expected, Frame actual) {
   assertEquals(1, actual.numRows());
   assertEquals(expected.length, actual.numCols());
   for (int i = 0; i < expected.length; i++) {
     assertEquals("Wrong sum in column " + actual.name(i), expected[i], actual.vec(i).at(0), 1e-8);
   }
 }
Пример #5
0
  /**
   * Project each archetype into original feature space
   *
   * @param frame Original training data with m rows and n columns
   * @param destination_key Frame Id for output
   * @return Frame containing k rows and n columns, where each row corresponds to an archetype
   */
  public Frame scoreArchetypes(Frame frame, Key destination_key, boolean reverse_transform) {
    final int ncols = _output._names.length;
    Frame adaptedFr = new Frame(frame);
    adaptTestForTrain(adaptedFr, true, false);
    assert ncols == adaptedFr.numCols();
    String[][] adaptedDomme = adaptedFr.domains();
    double[][] proj = new double[_parms._k][_output._nnums + _output._ncats];

    // Categorical columns
    for (int d = 0; d < _output._ncats; d++) {
      double[][] block = _output._archetypes_raw.getCatBlock(d);
      for (int k = 0; k < _parms._k; k++)
        proj[k][_output._permutation[d]] = _parms.mimpute(block[k], _output._lossFunc[d]);
    }

    // Numeric columns
    for (int d = _output._ncats; d < (_output._ncats + _output._nnums); d++) {
      int ds = d - _output._ncats;
      for (int k = 0; k < _parms._k; k++) {
        double num = _output._archetypes_raw.getNum(ds, k);
        proj[k][_output._permutation[d]] = _parms.impute(num, _output._lossFunc[d]);
        if (reverse_transform)
          proj[k][_output._permutation[d]] =
              proj[k][_output._permutation[d]] / _output._normMul[ds] + _output._normSub[ds];
      }
    }

    // Convert projection of archetypes into a frame with correct domains
    Frame f =
        ArrayUtils.frame(
            (null == destination_key ? Key.make() : destination_key), adaptedFr.names(), proj);
    for (int i = 0; i < ncols; i++) f.vec(i).setDomain(adaptedDomme[i]);
    return f;
  }
Пример #6
0
 private static void assertColFrameEquals(double[] expected, Frame actual) {
   assertEquals(1, actual.numCols());
   assertEquals(expected.length, actual.numRows());
   for (int i = 0; i < expected.length; i++) {
     assertEquals("Wrong sum in row " + i, expected[i], actual.vec(0).at(i), 1e-8);
   }
 }
Пример #7
0
  @Override
  protected Frame predictScoreImpl(Frame orig, Frame adaptedFr, String destination_key) {
    Frame adaptFrm = new Frame(adaptedFr);
    for (int i = 0; i < _parms._k; i++)
      adaptFrm.add("PC" + String.valueOf(i + 1), adaptFrm.anyVec().makeZero());

    new MRTask() {
      @Override
      public void map(Chunk chks[]) {
        double tmp[] = new double[_output._names.length];
        double preds[] = new double[_parms._k];
        for (int row = 0; row < chks[0]._len; row++) {
          double p[] = score0(chks, row, tmp, preds);
          for (int c = 0; c < preds.length; c++) chks[_output._names.length + c].set(row, p[c]);
        }
      }
    }.doAll(adaptFrm);

    // Return the projection into principal component space
    int x = _output._names.length, y = adaptFrm.numCols();
    Frame f =
        adaptFrm.extractFrame(
            x, y); // this will call vec_impl() and we cannot call the delete() below just yet

    f =
        new Frame(
            (null == destination_key ? Key.make() : Key.make(destination_key)),
            f.names(),
            f.vecs());
    DKV.put(f);
    makeMetricBuilder(null).makeModelMetrics(this, orig);
    return f;
  }
Пример #8
0
 /**
  * Sample rows from a frame. Can be unlucky for small sampling fractions - will continue calling
  * itself until at least 1 row is returned.
  *
  * @param fr Input frame
  * @param rows Approximate number of rows to sample (across all chunks)
  * @param seed Seed for RNG
  * @return Sampled frame
  */
 public static Frame sampleFrame(Frame fr, final long rows, final long seed) {
   if (fr == null) return null;
   final float fraction = rows > 0 ? (float) rows / fr.numRows() : 1.f;
   if (fraction >= 1.f) return fr;
   Frame r =
       new MRTask2() {
         @Override
         public void map(Chunk[] cs, NewChunk[] ncs) {
           final Random rng = getDeterRNG(seed + cs[0].cidx());
           int count = 0;
           for (int r = 0; r < cs[0]._len; r++)
             if (rng.nextFloat() < fraction || (count == 0 && r == cs[0]._len - 1)) {
               count++;
               for (int i = 0; i < ncs.length; i++) {
                 ncs[i].addNum(cs[i].at0(r));
               }
             }
         }
       }.doAll(fr.numCols(), fr).outputFrame(fr.names(), fr.domains());
   if (r.numRows() == 0) {
     Log.warn(
         "You asked for "
             + rows
             + " rows (out of "
             + fr.numRows()
             + "), but you got none (seed="
             + seed
             + ").");
     Log.warn("Let's try again. You've gotta ask yourself a question: \"Do I feel lucky?\"");
     return sampleFrame(fr, rows, seed + 1);
   }
   return r;
 }
Пример #9
0
 /**
  * Annotate the number of columns and rows of the training data set in the job parameter JSON
  * @return JsonObject annotated with num_cols and num_rows of the training data set
  */
 @Override protected JsonObject toJSON() {
   JsonObject jo = super.toJSON();
   if (source != null) {
     jo.getAsJsonObject("source").addProperty("num_cols", source.numCols());
     jo.getAsJsonObject("source").addProperty("num_rows", source.numRows());
   }
   return jo;
 }
Пример #10
0
 /**
  * Annotate the number of columns and rows of the validation data set in the job parameter JSON
  * @return JsonObject annotated with num_cols and num_rows of the validation data set
  */
 @Override protected JsonObject toJSON() {
   JsonObject jo = super.toJSON();
   if (validation != null) {
     jo.getAsJsonObject("validation").addProperty("num_cols", validation.numCols());
     jo.getAsJsonObject("validation").addProperty("num_rows", validation.numRows());
   }
   return jo;
 }
Пример #11
0
 @Test
 public void testColumnwisesumOnEmptyFrame() {
   Frame fr = register(new Frame(Key.<Frame>make()));
   Val val = Rapids.exec("(sumaxis " + fr._key + " 0 0)");
   assertTrue(val instanceof ValFrame);
   Frame res = register(val.getFrame());
   assertEquals(res.numCols(), 0);
   assertEquals(res.numRows(), 0);
 }
  /**
   * This method applies a stacked autoencoders model to a given dataset and make predictions
   *
   * @param ctxt JavaSparkContext
   * @param deeplearningModel Stacked Autoencoders model
   * @param test Testing dataset as a JavaRDD of labeled points
   * @return
   */
  public JavaPairRDD<Double, Double> test(
      JavaSparkContext ctxt,
      final DeepLearningModel deeplearningModel,
      JavaRDD<LabeledPoint> test,
      MLModel mlModel)
      throws MLModelBuilderException {

    Scope.enter();

    if (deeplearningModel == null) {
      throw new MLModelBuilderException("DeeplearningModel is Null");
    }

    int numberOfFeatures = mlModel.getFeatures().size();
    List<Feature> features = mlModel.getFeatures();
    String[] names = new String[numberOfFeatures + 1];
    for (int i = 0; i < numberOfFeatures; i++) {
      names[i] = features.get(i).getName();
    }
    names[numberOfFeatures] = mlModel.getResponseVariable();

    Frame testData = DeeplearningModelUtils.javaRDDToFrame(names, test);
    Frame testDataWithoutLabels = testData.subframe(0, testData.numCols() - 1);
    int numRows = (int) testDataWithoutLabels.numRows();
    Vec predictionsVector = deeplearningModel.score(testDataWithoutLabels).vec(0);
    double[] predictionValues = new double[numRows];
    for (int i = 0; i < numRows; i++) {
      predictionValues[i] = predictionsVector.at(i);
    }
    Vec labelsVector = testData.vec(testData.numCols() - 1);
    double[] labels = new double[numRows];
    for (int i = 0; i < numRows; i++) {
      labels[i] = labelsVector.at(i);
    }

    Scope.exit();

    ArrayList<Tuple2<Double, Double>> tupleList = new ArrayList<Tuple2<Double, Double>>();
    for (int i = 0; i < labels.length; i++) {
      tupleList.add(new Tuple2<Double, Double>(predictionValues[i], labels[i]));
    }

    return ctxt.parallelizePairs(tupleList);
  }
Пример #13
0
 // private constructor called by filterExpandedColumns
 private DataInfo(
     DataInfo dinfo,
     Frame fr,
     double[] normMul,
     double[] normSub,
     int[][] catLevels,
     int[] catModes) {
   _fullCatOffsets = dinfo._catOffsets;
   if (!dinfo._useAllFactorLevels) {
     _fullCatOffsets = dinfo._catOffsets.clone();
     for (int i = 0; i < _fullCatOffsets.length; ++i)
       _fullCatOffsets[i] += i; // add for the skipped zeros.
   }
   _offset = dinfo._offset;
   _weights = dinfo._weights;
   _fold = dinfo._fold;
   _valid = false;
   _interactions = dinfo._interactions;
   _interactionVecs = dinfo._interactionVecs;
   assert dinfo._predictor_transform != null;
   assert dinfo._response_transform != null;
   _predictor_transform = dinfo._predictor_transform;
   _response_transform = dinfo._response_transform;
   _skipMissing = dinfo._skipMissing;
   _imputeMissing = dinfo._imputeMissing;
   _adaptedFrame = fr;
   _catOffsets = MemoryManager.malloc4(catLevels.length + 1);
   _catMissing = new boolean[catLevels.length];
   Arrays.fill(_catMissing, !(dinfo._imputeMissing || dinfo._skipMissing));
   int s = 0;
   for (int i = 0; i < catLevels.length; ++i) {
     _catOffsets[i] = s;
     s += catLevels[i].length;
   }
   _catLvls = catLevels;
   _catOffsets[_catOffsets.length - 1] = s;
   _responses = dinfo._responses;
   _cats = catLevels.length;
   _nums =
       fr.numCols()
           - _cats
           - dinfo._responses
           - (_offset ? 1 : 0)
           - (_weights ? 1 : 0)
           - (_fold ? 1 : 0);
   _numOffsets = _nums == 0 ? new int[0] : dinfo._numOffsets.clone();
   int diff = _numOffsets.length > 0 ? _numOffsets[0] - s : 0;
   for (int i = 0; i < _numOffsets.length; i++) // need to shift everyone down by the offset!
   _numOffsets[i] -= diff;
   _useAllFactorLevels = true; // dinfo._useAllFactorLevels;
   _numMeans = new double[_nums];
   _normMul = normMul;
   _normSub = normSub;
   _catModes = catModes;
   for (int i = 0; i < _nums; i++) _numMeans[i] = _adaptedFrame.vec(_cats + i).mean();
 }
Пример #14
0
 public void setResponseTransform(TransformType t) {
   _response_transform = t;
   if (t == TransformType.NONE) {
     _normRespMul = null;
     _normRespSub = null;
   } else {
     _normRespMul = MemoryManager.malloc8d(_responses);
     _normRespSub = MemoryManager.malloc8d(_responses);
     setTransform(t, _normRespMul, _normRespSub, _adaptedFrame.numCols() - _responses, _responses);
   }
 }
Пример #15
0
 @Override
 Val apply(Env env, Env.StackHelp stk, AST asts[]) {
   Frame fr = stk.track(asts[1].exec(env)).getFrame();
   if (fr.numCols() == 1 && fr.numRows() == 1) {
     if (fr.anyVec().isNumeric() || fr.anyVec().isBad()) return new ValNum(fr.anyVec().at(0));
     else if (fr.anyVec().isString())
       return new ValStr(fr.anyVec().atStr(new BufferedString(), 0).toString());
     return new ValStr(fr.domains()[0][(int) fr.anyVec().at8(0)]);
   }
   return new ValFrame(fr); // did not flatten
 }
Пример #16
0
 @Override
 ValFrame apply(Env env, Env.StackHelp stk, AST asts[]) {
   Frame fr = stk.track(asts[1].exec(env)).getFrame();
   double frac = asts[2].exec(env).getNum();
   double nrow = fr.numRows() * frac;
   Vec vecs[] = fr.vecs();
   long[] idxs = new long[fr.numCols()];
   int j = 0;
   for (int i = 0; i < idxs.length; i++) if (vecs[i].naCnt() < nrow) idxs[j++] = i;
   Vec vec = Vec.makeVec(Arrays.copyOf(idxs, j), null, Vec.VectorGroup.VG_LEN1.addVec());
   return new ValFrame(new Frame(vec));
 }
Пример #17
0
 // Make vector templates for all output frame vectors
 private Vec[][] makeTemplates(Frame dataset, float[] ratios) {
   Vec anyVec = dataset.anyVec();
   final long[][] espcPerSplit = computeEspcPerSplit(anyVec._espc, anyVec.length(), ratios);
   final int num = dataset.numCols(); // number of columns in input frame
   final int nsplits = espcPerSplit.length; // number of splits
   final String[][] domains = dataset.domains(); // domains
   Vec[][] t = new Vec[nsplits][ /*num*/]; // resulting vectors for all
   for (int i = 0; i < nsplits; i++) {
     // vectors for j-th split
     t[i] = new Vec(Vec.newKey(), espcPerSplit[i /*-th split*/]).makeZeros(num, domains);
   }
   return t;
 }
Пример #18
0
  public static Frame expandDataset(Frame fr, Key destkey) { // , int[] ignored) {
    ArrayList<Vec> nvecs = new ArrayList<Vec>();
    ArrayList<Vec> evecs = new ArrayList<Vec>();
    ArrayList<String> eNames = new ArrayList<String>();
    ArrayList<String> nNames = new ArrayList<String>();
    int[] offsets = new int[fr.numCols() + 1];
    Vec[] vecs = fr.vecs();
    int c = 0;
    // int ip = 0; //ignored pointer
    for (int i = 0; i < fr.numCols(); i++) {
      if (vecs[i]
          .isEnum()) { // && i != ignored[ip]) {//!fr._names {//_names[i]. { //equals(ignored)) {
        offsets[evecs.size()] = c;
        evecs.add(vecs[i]);
        String name = fr._names[i];
        c += vecs[i]._domain.length;
        for (String s : vecs[i]._domain) eNames.add(name + "." + s);
      } else {
        // if(i == ignored[ip] && ip < ignored.length - 1) ip++;
        nvecs.add(vecs[i]);
        nNames.add(fr._names[i]);
      }
    }
    offsets[evecs.size()] = c;
    if (evecs.isEmpty()) return fr;
    offsets = Arrays.copyOf(offsets, evecs.size() + 1);

    OneHot ss = new OneHot();
    ss._offsets = offsets;
    int l = offsets[evecs.size()];
    ss.doAll(l, evecs.toArray(new Vec[evecs.size()]));

    Frame fr2 = ss.outputFrame(destkey, eNames.toArray(new String[eNames.size()]), new String[l][]);
    fr2.add(
        new Frame(nNames.toArray(new String[nNames.size()]), nvecs.toArray(new Vec[nvecs.size()])),
        false);
    return fr2;
  }
Пример #19
0
  // GLRM scoring is data imputation based on feature domains using reconstructed XY (see Udell
  // (2015), Section 5.3)
  private Frame reconstruct(
      Frame orig,
      Frame adaptedFr,
      Key destination_key,
      boolean save_imputed,
      boolean reverse_transform) {
    final int ncols = _output._names.length;
    assert ncols == adaptedFr.numCols();
    String prefix = "reconstr_";

    // Need [A,X,P] where A = adaptedFr, X = loading frame, P = imputed frame
    // Note: A is adapted to original training frame, P has columns shuffled so cats come before
    // nums!
    Frame fullFrm = new Frame(adaptedFr);
    Frame loadingFrm = DKV.get(_output._representation_key).get();
    fullFrm.add(loadingFrm);
    String[][] adaptedDomme = adaptedFr.domains();
    for (int i = 0; i < ncols; i++) {
      Vec v = fullFrm.anyVec().makeZero();
      v.setDomain(adaptedDomme[i]);
      fullFrm.add(prefix + _output._names[i], v);
    }
    GLRMScore gs = new GLRMScore(ncols, _parms._k, save_imputed, reverse_transform).doAll(fullFrm);

    // Return the imputed training frame
    int x = ncols + _parms._k, y = fullFrm.numCols();
    Frame f =
        fullFrm.extractFrame(
            x, y); // this will call vec_impl() and we cannot call the delete() below just yet

    f = new Frame((null == destination_key ? Key.make() : destination_key), f.names(), f.vecs());
    DKV.put(f);
    gs._mb.makeModelMetrics(
        GLRMModel.this, orig, null, null); // save error metrics based on imputed data
    return f;
  }
Пример #20
0
  // Scalar covariance for 1 row
  private ValNum scalar(Frame frx, Frame fry, Mode mode) {
    if (frx.numCols() != fry.numCols())
      throw new IllegalArgumentException(
          "Single rows must have the same number of columns, found "
              + frx.numCols()
              + " and "
              + fry.numCols());
    Vec vecxs[] = frx.vecs();
    Vec vecys[] = fry.vecs();
    double xmean = 0, ymean = 0, ncols = frx.numCols(), NACount = 0, xval, yval, ss = 0;
    for (int r = 0; r < ncols; r++) {
      xval = vecxs[r].at(0);
      yval = vecys[r].at(0);
      if (Double.isNaN(xval) || Double.isNaN(yval)) NACount++;
      else {
        xmean += xval;
        ymean += yval;
      }
    }
    xmean /= (ncols - NACount);
    ymean /= (ncols - NACount);

    if (NACount != 0) {
      if (mode.equals(Mode.AllObs))
        throw new IllegalArgumentException("Mode is 'all.obs' but NAs are present");
      if (mode.equals(Mode.Everything)) return new ValNum(Double.NaN);
    }

    for (int r = 0; r < ncols; r++) {
      xval = vecxs[r].at(0);
      yval = vecys[r].at(0);
      if (!(Double.isNaN(xval) || Double.isNaN(yval)))
        ss += (vecxs[r].at(0) - xmean) * (vecys[r].at(0) - ymean);
    }
    return new ValNum(ss / (ncols - NACount - 1));
  }
Пример #21
0
 // private constructor called by filterExpandedColumns
 private DataInfo(
     Key<DataInfo> selfKey,
     Frame fr,
     double[] normMul,
     double[] normSub,
     int[][] catLevels,
     int responses,
     TransformType predictor_transform,
     TransformType response_transform,
     boolean skipMissing,
     boolean imputeMissing,
     boolean weight,
     boolean offset,
     boolean fold) {
   super(selfKey);
   _offset = offset;
   _weights = weight;
   _fold = fold;
   _valid = false;
   assert predictor_transform != null;
   assert response_transform != null;
   _predictor_transform = predictor_transform;
   _response_transform = response_transform;
   _skipMissing = skipMissing;
   _imputeMissing = imputeMissing;
   _adaptedFrame = fr;
   _catOffsets = MemoryManager.malloc4(catLevels.length + 1);
   _catMissing = new int[catLevels.length];
   int s = 0;
   for (int i = 0; i < catLevels.length; ++i) {
     _catOffsets[i] = s;
     s += catLevels[i].length;
   }
   _catLvls = catLevels;
   _catOffsets[_catOffsets.length - 1] = s;
   _responses = responses;
   _cats = catLevels.length;
   _nums =
       fr.numCols() - _cats - responses - (_offset ? 1 : 0) - (_weights ? 1 : 0) - (_fold ? 1 : 0);
   _useAllFactorLevels = true;
   _catModes = new int[_cats];
   _numMeans = new double[_nums];
   _normMul = normMul;
   _normSub = normSub;
   for (int i = 0; i < _cats; i++) _catModes[i] = imputeCat(_adaptedFrame.vec(i));
   for (int i = 0; i < _nums; i++) _numMeans[i] = _adaptedFrame.vec(_cats + i).mean();
 }
Пример #22
0
 public static Frame shuffleFramePerChunk(Key outputFrameKey, Frame fr, final long seed) {
   Frame r =
       new MRTask2() {
         @Override
         public void map(Chunk[] cs, NewChunk[] ncs) {
           long[] idx = new long[cs[0]._len];
           for (int r = 0; r < idx.length; ++r) idx[r] = r;
           Utils.shuffleArray(idx, seed);
           for (int r = 0; r < idx.length; ++r) {
             for (int i = 0; i < ncs.length; i++) {
               ncs[i].addNum(cs[i].at0((int) idx[r]));
             }
           }
         }
       }.doAll(fr.numCols(), fr).outputFrame(outputFrameKey, fr.names(), fr.domains());
   return r;
 }
Пример #23
0
 @Test
 public void testQuantile() {
   Frame f = null;
   try {
     Frame fr =
         frame(
             ard(
                 ard(1.223292e-02),
                 ard(1.635312e-25),
                 ard(1.601522e-11),
                 ard(8.452298e-10),
                 ard(2.643733e-10),
                 ard(2.671520e-06),
                 ard(1.165381e-06),
                 ard(7.193265e-10),
                 ard(3.383532e-04),
                 ard(2.561221e-05)));
     double[] probs = new double[] {0.001, 0.005, .01, .02, .05, .10, .50, .8883, .90, .99};
     String x =
         String.format("(quantile %%%s %s \"interpolate\")", fr._key, Arrays.toString(probs));
     Val val = Exec.exec(x);
     fr.delete();
     f = val.getFrame();
     Assert.assertEquals(2, f.numCols());
     // Expected values computed as golden values from R's quantile call
     double[] exp =
         ard(
             1.4413698000016206E-13,
             7.206849000001562E-13,
             1.4413698000001489E-12,
             2.882739600000134E-12,
             7.20684900000009E-12,
             1.4413698000000017E-11,
             5.831131148999999E-07,
             3.3669567275300000E-04,
             0.00152780988,
             0.011162408988);
     for (int i = 0; i < exp.length; i++)
       Assert.assertTrue(
           "expected " + exp[i] + " got " + f.vec(1).at(i),
           water.util.MathUtils.compare(exp[i], f.vec(1).at(i), 1e-6, 1e-6));
   } finally {
     if (f != null) f.delete();
   }
 }
Пример #24
0
  // ==========================================================================
  public void basicGBM(String fname, String hexname, PrepData prep) {
    File file = TestUtil.find_test_file(fname);
    if (file == null) return; // Silently abort test if the file is missing
    Key fkey = NFSFileVec.make(file);
    Key dest = Key.make(hexname);
    GBM gbm = null;
    Frame fr = null;
    try {
      gbm = new GBM();
      gbm.source = fr = ParseDataset2.parse(dest, new Key[] {fkey});
      UKV.remove(fkey);
      int idx = prep.prep(fr);
      if (idx < 0) {
        gbm.classification = false;
        idx = ~idx;
      }
      String rname = fr._names[idx];
      gbm.response = fr.vecs()[idx];
      fr.remove(idx); // Move response to the end
      fr.add(rname, gbm.response);
      gbm.ntrees = 4;
      gbm.max_depth = 4;
      gbm.min_rows = 1;
      gbm.nbins = 50;
      gbm.cols = new int[fr.numCols()];
      for (int i = 0; i < gbm.cols.length; i++) gbm.cols[i] = i;
      gbm.learn_rate = .2f;
      gbm.invoke();

      fr = gbm.score(gbm.source);

      GBM.GBMModel gbmmodel = UKV.get(gbm.dest());
      // System.out.println(gbmmodel.toJava());

    } finally {
      UKV.remove(dest); // Remove original hex frame key
      if (gbm != null) {
        UKV.remove(gbm.dest()); // Remove the model
        UKV.remove(gbm.response._key);
        gbm.remove(); // Remove GBM Job
        if (fr != null) fr.remove();
      }
    }
  }
Пример #25
0
    private void applyTrainingFrameSideEffects() {
      int numCols = _modelBuilderTrain.numCols();
      String responseVecName = _modelBuilderTrain.names()[numCols - 1];
      Vec responseVec = _modelBuilderTrain.remove(numCols - 1);

      final boolean use_weights_column = (_parms.weights_column != null);
      final boolean use_start_column = (_parms.start_column != null);

      if (use_weights_column) {
        Vec weightsVec = _parms.weights_column;
        int idxInRawFrame = _train.find(weightsVec);
        if (idxInRawFrame < 0) {
          throw new RuntimeException("CoxPHDriver failed to find weightVec");
        }

        String weightsVecName = _parms.train().names()[idxInRawFrame];
        _modelBuilderTrain.add(weightsVecName, weightsVec);
      }

      if (use_start_column) {
        Vec startVec = _parms.start_column;
        int idxInRawFrame = _train.find(startVec);
        if (idxInRawFrame < 0) {
          throw new RuntimeException("CoxPHDriver failed to find startVec");
        }

        String startVecName = _parms.train().names()[idxInRawFrame];
        _modelBuilderTrain.add(startVecName, startVec);
      }

      {
        Vec stopVec = _parms.stop_column;
        int idxInRawFrame = _train.find(stopVec);
        if (idxInRawFrame < 0) {
          throw new RuntimeException("CoxPHDriver failed to find stopVec");
        }

        String stopVecName = _parms.train().names()[idxInRawFrame];
        _modelBuilderTrain.add(stopVecName, stopVec);
      }

      _modelBuilderTrain.add(responseVecName, responseVec);
    }
Пример #26
0
  @Override
  Val apply(Env env, Env.StackHelp stk, AST asts[]) {
    QuantileModel.QuantileParameters parms = new QuantileModel.QuantileParameters();
    Frame fr = stk.track(asts[1].exec(env)).getFrame();
    Frame fr_wkey = new Frame(fr); // Force a bogus Key for Quantiles ModelBuilder
    DKV.put(fr_wkey);
    parms._train = fr_wkey._key;

    parms._probs = ((ASTNumList) asts[2]).expand();
    for (double d : parms._probs)
      if (d < 0 || d > 1)
        throw new IllegalArgumentException("Probability must be between 0 and 1: " + d);

    String inter = asts[3].exec(env).getStr();
    parms._combine_method = QuantileModel.CombineMethod.valueOf(inter.toUpperCase());
    parms._weights_column = asts[4].str().equals("_") ? null : asts[4].str();

    // Compute Quantiles
    QuantileModel q = new Quantile(parms).trainModel().get();

    // Remove bogus Key
    DKV.remove(fr_wkey._key);

    // Reshape all outputs as a Frame, with probs in col 0 and the
    // quantiles in cols 1 thru fr.numCols() - except the optional weights vec
    int ncols = fr.numCols();
    if (parms._weights_column != null) ncols--;
    Vec[] vecs = new Vec[1 /*1 more for the probs themselves*/ + ncols];
    String[] names = new String[vecs.length];
    vecs[0] = Vec.makeCon(null, parms._probs);
    names[0] = "Probs";
    int w = 0;
    for (int i = 0; i < vecs.length - 1; ++i) {
      if (fr._names[i].equals(parms._weights_column)) w = 1;
      assert (w == 0 || w == 1);
      vecs[i + 1] = Vec.makeCon(null, q._output._quantiles[i]);
      names[i + 1] = fr._names[w + i] + "Quantiles";
    }
    q.delete();

    return new ValFrame(new Frame(names, vecs));
  }
Пример #27
0
  public ModelMetricsGLRM scoreMetricsOnly(Frame frame) {
    final int ncols = _output._names.length;

    // Need [A,X] where A = adapted test frame, X = loading frame
    // Note: A is adapted to original training frame
    Frame adaptedFr = new Frame(frame);
    adaptTestForTrain(adaptedFr, true, false);
    assert ncols == adaptedFr.numCols();

    // Append loading frame X for calculating XY
    Frame fullFrm = new Frame(adaptedFr);
    Frame loadingFrm = DKV.get(_output._representation_key).get();
    fullFrm.add(loadingFrm);

    GLRMScore gs = new GLRMScore(ncols, _parms._k, false).doAll(fullFrm);
    ModelMetrics mm =
        gs._mb.makeModelMetrics(
            GLRMModel.this, adaptedFr, null, null); // save error metrics based on imputed data
    return (ModelMetricsGLRM) mm;
  }
Пример #28
0
  /**
   * Perform A'x operation with a DRM and an in-core Vector to create a new DRM.
   *
   * @param drmA DRM representing matrix A.
   * @param x in-core Mahout Vector.
   * @return new DRM containing A'x.
   */
  public static H2ODrm exec(H2ODrm drmA, Vector x) {
    Frame A = drmA.frame;
    final H2OBCast<Vector> bx = new H2OBCast<Vector>(x);

    // A'x is computed into atx[] with an MRTask on A (with
    // x available as a Broadcast
    //
    // x.size() == A.numRows()
    // atx.length == chks.length == A.numCols()
    class MRTaskAtx extends MRTask<MRTaskAtx> {
      double atx[];

      public void map(Chunk chks[]) {
        int chunkSize = chks[0].len();
        Vector x = bx.value();
        long start = chks[0].start();

        atx = new double[chks.length];
        for (int r = 0; r < chunkSize; r++) {
          double d = x.getQuick((int) start + r);
          for (int c = 0; c < chks.length; c++) {
            atx[c] += (chks[c].at0(r) * d);
          }
        }
      }

      public void reduce(MRTaskAtx other) {
        ArrayUtils.add(atx, other.atx);
      }
    }

    // Take the result in .atx[], and convert into a Frame
    // using existing helper functions (creating a Matrix
    // along the way for the Helper)
    Vector v = new DenseVector(new MRTaskAtx().doAll(A).atx);
    Matrix m = new DenseMatrix(A.numCols(), 1);
    m.assignColumn(0, v);
    return H2OHelper.drmFromMatrix(m, -1, -1);
  }
Пример #29
0
 /**
  * Initialize the ModelBuilder, validating all arguments and preparing the training frame. This
  * call is expected to be overridden in the subclasses and each subclass will start with
  * "super.init();".
  *
  * <p>Validate K, max_iterations and the number of rows.
  */
 @Override
 public void init(boolean expensive) {
   super.init(expensive);
   if (_parms._max_iterations < 0 || _parms._max_iterations > 1e6)
     error("_max_iterations", " max_iterations must be between 0 and 1e6");
   if (_train == null) return;
   if (_parms._init == Initialization.User && _parms._user_points == null)
     error("_user_y", "Must specify initial cluster centers");
   if (null != _parms._user_points) { // Check dimensions of user-specified centers
     Frame user_points = _parms._user_points.get();
     if (user_points.numCols() != _train.numCols() - numSpecialCols()) {
       error(
           "_user_y",
           "The user-specified points must have the same number of columns ("
               + (_train.numCols() - numSpecialCols())
               + ") as the training observations");
     } else if (user_points.numRows() != _parms._k)
       error(
           "_user_y",
           "The number of rows in the user-specified points is not equal to k = " + _parms._k);
   }
   if (expensive && error_count() == 0) checkMemoryFootPrint();
 }
Пример #30
0
 /**
  * Compute the L2 norm for each row of the frame
  *
  * @param fr Input frame
  * @return Vec containing L2 values for each row, is in K-V store
  */
 public static Vec getL2(final Frame fr, final double[] scale) {
   // add workspace vec at end
   final int idx = fr.numCols();
   assert (scale.length == idx) : "Mismatch for number of columns";
   fr.add("L2", fr.anyVec().makeZero());
   Vec res;
   try {
     new MRTask2() {
       @Override
       public void map(Chunk[] cs) {
         for (int r = 0; r < cs[0]._len; r++) {
           double norm2 = 0;
           for (int i = 0; i < idx; i++) norm2 += Math.pow(cs[i].at0(r) * scale[i], 2);
           cs[idx].set0(r, Math.sqrt(norm2));
         }
       }
     }.doAll(fr);
   } finally {
     res = fr.remove(idx);
   }
   res.rollupStats();
   return res;
 }