Example #1
 @Test
 public void testDomains() {
   Frame frame = parse_test_file("smalldata/junit/weather.csv");
   // Replace selected numeric columns with categorical versions of themselves
   for (String s : new String[] {"MaxWindSpeed", "RelHumid9am", "Cloud9am"}) {
     Vec v = frame.vec(s);
     Vec newV = v.toCategoricalVec();
     frame.remove(s);
     frame.add(s, newV);
     v.remove();
   }
   DKV.put(frame);
   AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
   parms._train = frame._key;
   parms._radius_scale = 10;
   AggregatorModel agg = new Aggregator(parms).trainModel().get();
   Frame output = agg._output._output_frame.get();
   Assert.assertTrue(output.numRows() < 0.5 * frame.numRows());
   // Compare the number of categorical levels between the input and the aggregated output
   boolean same = true;
   for (int i = 0; i < frame.numCols(); ++i) {
     if (frame.vec(i).isCategorical()) {
       same = (frame.domains()[i].length == output.domains()[i].length);
       if (!same) break;
     }
   }
   frame.remove();
   output.remove();
   agg.remove();
   Assert.assertFalse(same);
 }
Example #2
 /**
  * Sample rows from a frame. Sampling is done per chunk and can be unlucky for small sampling
  * fractions; the method keeps calling itself (with an incremented seed) until at least one row
  * is returned.
  *
  * @param fr Input frame
  * @param rows Approximate number of rows to sample (across all chunks)
  * @param seed Seed for RNG
  * @return Sampled frame
  */
 public static Frame sampleFrame(Frame fr, final long rows, final long seed) {
   if (fr == null) return null;
   final float fraction = rows > 0 ? (float) rows / fr.numRows() : 1.f;
   if (fraction >= 1.f) return fr;
   Frame r =
       new MRTask2() {
         @Override
         public void map(Chunk[] cs, NewChunk[] ncs) {
           final Random rng = getDeterRNG(seed + cs[0].cidx());
           int count = 0;
           for (int r = 0; r < cs[0]._len; r++)
             if (rng.nextFloat() < fraction || (count == 0 && r == cs[0]._len - 1)) {
               count++;
               for (int i = 0; i < ncs.length; i++) {
                 ncs[i].addNum(cs[i].at0(r));
               }
             }
         }
       }.doAll(fr.numCols(), fr).outputFrame(fr.names(), fr.domains());
   if (r.numRows() == 0) {
     Log.warn(
         "You asked for "
             + rows
             + " rows (out of "
             + fr.numRows()
             + "), but you got none (seed="
             + seed
             + ").");
     Log.warn("Let's try again. You've gotta ask yourself a question: \"Do I feel lucky?\"");
     return sampleFrame(fr, rows, seed + 1);
   }
   return r;
 }
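A minimal usage sketch for the helper above (the Frame `fr` and the enclosing utility class are assumed, not shown in the excerpt):

  // Ask for roughly 1,000 rows; each chunk samples independently at the same rate,
  // so the returned row count is only approximate.
  Frame sampled = sampleFrame(fr, 1000L, 42L);
  // If the requested count is >= fr.numRows(), the original frame is returned unchanged
  // (fraction >= 1); if the random draw yields zero rows, the helper retries with seed + 1.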
Example #3
  /**
   * Project each archetype into the original feature space.
   *
   * @param frame Original training data with m rows and n columns
   * @param destination_key Frame Id for output
   * @param reverse_transform Whether to undo the normalization (scaling and centering) of numeric
   *     columns when projecting back
   * @return Frame containing k rows and n columns, where each row corresponds to an archetype
   */
  public Frame scoreArchetypes(Frame frame, Key destination_key, boolean reverse_transform) {
    final int ncols = _output._names.length;
    Frame adaptedFr = new Frame(frame);
    adaptTestForTrain(adaptedFr, true, false);
    assert ncols == adaptedFr.numCols();
    String[][] adaptedDomme = adaptedFr.domains();
    double[][] proj = new double[_parms._k][_output._nnums + _output._ncats];

    // Categorical columns
    for (int d = 0; d < _output._ncats; d++) {
      double[][] block = _output._archetypes_raw.getCatBlock(d);
      for (int k = 0; k < _parms._k; k++)
        proj[k][_output._permutation[d]] = _parms.mimpute(block[k], _output._lossFunc[d]);
    }

    // Numeric columns
    for (int d = _output._ncats; d < (_output._ncats + _output._nnums); d++) {
      int ds = d - _output._ncats;
      for (int k = 0; k < _parms._k; k++) {
        double num = _output._archetypes_raw.getNum(ds, k);
        proj[k][_output._permutation[d]] = _parms.impute(num, _output._lossFunc[d]);
        if (reverse_transform)
          proj[k][_output._permutation[d]] =
              proj[k][_output._permutation[d]] / _output._normMul[ds] + _output._normSub[ds];
      }
    }

    // Convert projection of archetypes into a frame with correct domains
    Frame f =
        ArrayUtils.frame(
            (null == destination_key ? Key.make() : destination_key), adaptedFr.names(), proj);
    for (int i = 0; i < ncols; i++) f.vec(i).setDomain(adaptedDomme[i]);
    return f;
  }
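A hedged usage sketch for the method above; `model` (a trained GLRM model exposing scoreArchetypes) and `train` are assumptions, only the call itself mirrors the signature shown:

  // Passing null for the destination key lets the method make its own key;
  // `true` reverses the normalization of numeric columns back to the original scale.
  Frame archetypes = model.scoreArchetypes(train, null, true);
  // The result has k rows (one per archetype) and n columns, with categorical domains
  // copied from the adapted training frame via setDomain().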
Example #4
 @Override
 public void compute2() {
   _in.read_lock(_jobKey);
   // Simply create a bogus new vector (don't even put it into the KV store) with the appropriate
   // number of rows per chunk, then use it as a source for multiple makeZero calls to create
   // empty vecs, and then call RebalanceTask on each one of them.
   // RebalanceTask will fetch the appropriate source chunks and copy the data from them.
   int rpc = (int) (_in.numRows() / _nchunks);
   int rem = (int) (_in.numRows() % _nchunks);
   long[] espc = new long[_nchunks + 1];
   Arrays.fill(espc, rpc);
   for (int i = 0; i < rem; ++i) ++espc[i];
   long sum = 0;
   for (int i = 0; i < espc.length; ++i) {
     long s = espc[i];
     espc[i] = sum;
     sum += s;
   }
   assert espc[espc.length - 1] == _in.numRows()
       : "unexpected number of rows, expected " + _in.numRows() + ", got " + espc[espc.length - 1];
   final Vec[] srcVecs = _in.vecs();
   _out =
       new Frame(
           _okey,
           _in.names(),
           new Vec(Vec.newKey(), espc).makeZeros(srcVecs.length, _in.domains()));
   _out.delete_and_lock(_jobKey);
   new RebalanceTask(this, srcVecs).asyncExec(_out);
 }
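A standalone sketch of the espc ("element start per chunk") arithmetic used above, with illustrative numbers: splitting 10 rows into 3 chunks gives per-chunk sizes 4, 3, 3, i.e. cumulative offsets {0, 4, 7, 10}.

  import java.util.Arrays;

  public class EspcSketch {
    public static void main(String[] args) {
      long numRows = 10;
      int nchunks = 3;
      int rpc = (int) (numRows / nchunks); // base rows per chunk = 3
      int rem = (int) (numRows % nchunks); // the first `rem` chunks get one extra row
      long[] espc = new long[nchunks + 1];
      Arrays.fill(espc, rpc);
      for (int i = 0; i < rem; ++i) ++espc[i];
      long sum = 0;
      for (int i = 0; i < espc.length; ++i) {
        long s = espc[i];
        espc[i] = sum;
        sum += s;
      }
      System.out.println(Arrays.toString(espc)); // prints [0, 4, 7, 10]
    }
  }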
Example #5
 @Override
 Val apply(Env env, Env.StackHelp stk, AST asts[]) {
   Frame fr = stk.track(asts[1].exec(env)).getFrame();
   // A 1x1 frame collapses to a scalar: a number, a string, or the label of a categorical level
   if (fr.numCols() == 1 && fr.numRows() == 1) {
     if (fr.anyVec().isNumeric() || fr.anyVec().isBad()) return new ValNum(fr.anyVec().at(0));
     else if (fr.anyVec().isString())
       return new ValStr(fr.anyVec().atStr(new BufferedString(), 0).toString());
     return new ValStr(fr.domains()[0][(int) fr.anyVec().at8(0)]);
   }
   return new ValFrame(fr); // did not flatten
 }
Example #6
 // Make vector templates for all output frame vectors
 private Vec[][] makeTemplates(Frame dataset, float[] ratios) {
   Vec anyVec = dataset.anyVec();
   final long[][] espcPerSplit = computeEspcPerSplit(anyVec._espc, anyVec.length(), ratios);
   final int num = dataset.numCols(); // number of columns in input frame
   final int nsplits = espcPerSplit.length; // number of splits
   final String[][] domains = dataset.domains(); // domains
   Vec[][] t = new Vec[nsplits][/*num*/]; // resulting vectors for all splits
   for (int i = 0; i < nsplits; i++) {
     // vectors for the i-th split
     t[i] = new Vec(Vec.newKey(), espcPerSplit[i /*-th split*/]).makeZeros(num, domains);
   }
   return t;
 }
Example #7
 // Shuffle rows within each chunk independently (rows never cross chunk boundaries),
 // preserving the column names and domains of the input frame
 public static Frame shuffleFramePerChunk(Key outputFrameKey, Frame fr, final long seed) {
   Frame r =
       new MRTask2() {
         @Override
         public void map(Chunk[] cs, NewChunk[] ncs) {
           long[] idx = new long[cs[0]._len];
           for (int r = 0; r < idx.length; ++r) idx[r] = r;
           Utils.shuffleArray(idx, seed);
           for (int r = 0; r < idx.length; ++r) {
             for (int i = 0; i < ncs.length; i++) {
               ncs[i].addNum(cs[i].at0((int) idx[r]));
             }
           }
         }
       }.doAll(fr.numCols(), fr).outputFrame(outputFrameKey, fr.names(), fr.domains());
   return r;
 }
Example #8
  // GLRM scoring is data imputation based on feature domains using the reconstructed XY
  // (see Udell (2015), Section 5.3)
  private Frame reconstruct(
      Frame orig,
      Frame adaptedFr,
      Key destination_key,
      boolean save_imputed,
      boolean reverse_transform) {
    final int ncols = _output._names.length;
    assert ncols == adaptedFr.numCols();
    String prefix = "reconstr_";

    // Need [A, X, P] where A = adaptedFr, X = loading frame, P = imputed frame.
    // Note: A is adapted to the original training frame;
    // P has its columns shuffled so cats come before nums.
    Frame fullFrm = new Frame(adaptedFr);
    Frame loadingFrm = DKV.get(_output._representation_key).get();
    fullFrm.add(loadingFrm);
    String[][] adaptedDomme = adaptedFr.domains();
    for (int i = 0; i < ncols; i++) {
      Vec v = fullFrm.anyVec().makeZero();
      v.setDomain(adaptedDomme[i]);
      fullFrm.add(prefix + _output._names[i], v);
    }
    GLRMScore gs = new GLRMScore(ncols, _parms._k, save_imputed, reverse_transform).doAll(fullFrm);

    // Return the imputed training frame
    int x = ncols + _parms._k, y = fullFrm.numCols();
    // extractFrame() calls vec_impl(), so we cannot call the delete() below just yet
    Frame f = fullFrm.extractFrame(x, y);

    f = new Frame((null == destination_key ? Key.make() : destination_key), f.names(), f.vecs());
    DKV.put(f);
    gs._mb.makeModelMetrics(
        GLRMModel.this, orig, null, null); // save error metrics based on imputed data
    return f;
  }
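For concreteness, a sketch of the column bookkeeping above; the numbers are illustrative, not from the source:

  // With ncols = 5 adapted columns and _parms._k = 2, fullFrm = [A (5), X (2), P (5)]
  // holds 12 columns in total, so extractFrame(7, 12) pulls out the 5 imputed columns P.
  int ncols = 5, k = 2;
  int x = ncols + k;         // 7: index of the first imputed column
  int y = ncols + k + ncols; // 12: one past the last imputed column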
Example #9
  @Override
  Val apply(Env env, Env.StackHelp stk, AST asts[]) {
    Frame fr = stk.track(asts[1].exec(env)).getFrame();
    Frame returningFrame;
    long nrows = fr.numRows();
    if (asts[2] instanceof ASTNumList) {
      final ASTNumList nums = (ASTNumList) asts[2];
      long[] rows = nums._isList ? nums.expand8Sort() : null;
      if (rows != null) {
        if (rows.length == 0) { // Empty inclusion list?
        } else if (rows[0] >= 0) { // Positive (inclusion) list
          if (rows[rows.length - 1] > nrows)
            throw new IllegalArgumentException("Row must be an integer from 0 to " + (nrows - 1));
        } else { // Negative (exclusion) list
          // Invert the list to make a positive list, ignoring out-of-bounds values
          BitSet bs = new BitSet((int) nrows);
          for (int i = 0; i < rows.length; i++) {
            int idx = (int) (-rows[i] - 1); // The positive index
            if (idx >= 0 && idx < nrows) bs.set(idx); // Set column to EXCLUDE
          }
          rows = new long[(int) nrows - bs.cardinality()];
          for (int i = bs.nextClearBit(0), j = 0; i < nrows; i = bs.nextClearBit(i + 1))
            rows[j++] = i;
        }
      }
      final long[] ls = rows;

      returningFrame =
          new MRTask() {
            @Override
            public void map(Chunk[] cs, NewChunk[] ncs) {
              if (nums.cnt() == 0) return;
              long start = cs[0].start();
              long end = start + cs[0]._len;
              long min = ls == null ? (long) nums.min() : ls[0],
                  max =
                      ls == null
                          ? (long) nums.max() - 1
                          : ls[ls.length - 1]; // exclusive max to inclusive max when stride == 1
              //     [ start, ...,  end ]    the chunk
              // 1 []                        nums out left:  nums.max() < start
              // 2                       []  nums out rite:  nums.min() > end
              // 3 [ nums ]                  nums run left:  nums.min() < start && nums.max() <= end
              // 4       [ nums ]            nums run in  :  start <= nums.min() && nums.max() <= end
              // 5                 [ nums ]  nums run rite:  start <= nums.min() && end < nums.max()
              if (!(max < start || min > end)) { // not situation 1 or 2 above
                long startOffset = (min > start ? min : start); // situation 4 and 5 => min > start;
                for (int i = (int) (startOffset - start); i < cs[0]._len; ++i) {
                  if ((ls == null && nums.has(start + i))
                      || (ls != null && Arrays.binarySearch(ls, start + i) >= 0)) {
                    for (int c = 0; c < cs.length; ++c) {
                      if (cs[c] instanceof CStrChunk) ncs[c].addStr(cs[c], i);
                      else if (cs[c] instanceof C16Chunk) ncs[c].addUUID(cs[c], i);
                      else if (cs[c].isNA(i)) ncs[c].addNA();
                      else ncs[c].addNum(cs[c].atd(i));
                    }
                  }
                }
              }
            }
          }.doAll(fr.types(), fr).outputFrame(fr.names(), fr.domains());
    } else if ((asts[2] instanceof ASTNum)) {
      long[] rows = new long[] {(long) (((ASTNum) asts[2])._v.getNum())};
      returningFrame = fr.deepSlice(rows, null);
    } else if ((asts[2] instanceof ASTExec) || (asts[2] instanceof ASTId)) {
      Frame predVec = stk.track(asts[2].exec(env)).getFrame();
      if (predVec.numCols() != 1)
        throw new IllegalArgumentException(
            "Conditional Row Slicing Expression evaluated to "
                + predVec.numCols()
                + " columns.  Must be a boolean Vec.");
      returningFrame = fr.deepSlice(predVec, null);
    } else
      throw new IllegalArgumentException(
          "Row slicing requires a number-list as the last argument, but found a "
              + asts[2].getClass());
    return new ValFrame(returningFrame);
  }
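A worked example of the exclusion-list inversion above (the values are illustrative): with nrows = 6 and a negative list rows = {-1, -3}, the positive indices idx = -row - 1 are 0 and 2, so those rows are excluded and the resulting inclusion list becomes {1, 3, 4, 5}.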
Example #10
  // internal version with repeat counter
  // currently hardcoded to do up to 10 tries to get a row from each class, which can be impossible
  // for certain wrong sampling ratios
  private static Frame sampleFrameStratified(
      final Frame fr,
      Vec label,
      final float[] sampling_ratios,
      final long seed,
      final boolean debug,
      int count) {
    if (fr == null) return null;
    assert (label.isEnum());
    assert (sampling_ratios != null && sampling_ratios.length == label.domain().length);
    final int labelidx = fr.find(label); // which column is the label?
    assert (labelidx >= 0);

    final boolean poisson = false; // beta feature

    Frame r =
        new MRTask2() {
          @Override
          public void map(Chunk[] cs, NewChunk[] ncs) {
            final Random rng = getDeterRNG(seed + cs[0].cidx());
            for (int r = 0; r < cs[0]._len; r++) {
              if (cs[labelidx].isNA0(r)) continue; // skip missing labels
              final int label = (int) cs[labelidx].at80(r);
              assert (sampling_ratios.length > label && label >= 0);
              int sampling_reps;
              if (poisson) {
                sampling_reps = Utils.getPoisson(sampling_ratios[label], rng);
              } else {
                final float remainder = sampling_ratios[label] - (int) sampling_ratios[label];
                sampling_reps =
                    (int) sampling_ratios[label] + (rng.nextFloat() < remainder ? 1 : 0);
              }
              for (int i = 0; i < ncs.length; i++) {
                for (int j = 0; j < sampling_reps; ++j) {
                  ncs[i].addNum(cs[i].at0(r));
                }
              }
            }
          }
        }.doAll(fr.numCols(), fr).outputFrame(fr.names(), fr.domains());

    // Confirm the validity of the distribution
    long[] dist = new ClassDist(r.vecs()[labelidx]).doAll(r.vecs()[labelidx]).dist();

    // If there are no training labels in the test set,
    // there is no point in sampling the test set
    if (dist == null) return fr;

    if (debug) {
      long sumdist = Utils.sum(dist);
      Log.info("After stratified sampling: " + sumdist + " rows.");
      for (int i = 0; i < dist.length; ++i) {
        Log.info(
            "Class "
                + r.vecs()[labelidx].domain(i)
                + ": count: "
                + dist[i]
                + " sampling ratio: "
                + sampling_ratios[i]
                + " actual relative frequency: "
                + (float) dist[i] / sumdist * dist.length);
      }
    }

    // Re-try if we didn't get at least one example from each class
    if (Utils.minValue(dist) == 0 && count < 10) {
      Log.info(
          "Re-doing stratified sampling because not all classes were represented (unlucky draw).");
      r.delete();
      return sampleFrameStratified(fr, label, sampling_ratios, seed + 1, debug, ++count);
    }

    // shuffle intra-chunk
    Frame shuffled = shuffleFramePerChunk(r, seed + 0x580FF13);
    r.delete();

    return shuffled;
  }
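A hedged sketch of how the sampling ratios map to label classes; the frame `fr`, its "label" column, and the ratio values are assumptions, and since the excerpt is the internal (private) variant the call would live inside the same class:

  // One entry per categorical level of the label: values < 1 undersample a class,
  // values > 1 oversample it (matching rows are emitted multiple times, as in the code above).
  float[] samplingRatios = {0.5f, 2.0f, 1.0f};
  Frame balanced =
      sampleFrameStratified(fr, fr.vec("label"), samplingRatios, 1234L, /*debug=*/true, /*count=*/0);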