Ejemplo n.º 1
0
  @Override
  public ValFrame apply(Env env, Env.StackHelp stk, AstRoot asts[]) {
    Frame f = stk.track(asts[1].exec(env)).getFrame();
    AstRoot axisAR = asts[2];
    for (Vec v : f.vecs()) {
      if (v.isCategorical() || v.isString() || v.isUUID())
        throw new IllegalArgumentException(
            "Cumulative functions not applicable to enum, string, or UUID values");
    }
    double axis = axisAR.exec(env).getNum();
    if (axis != 1.0 && axis != 0.0) throw new IllegalArgumentException("Axis must be 0 or 1");
    if (f.numCols() == 1) {
      if (axis == 0.0) {
        AstCumu.CumuTask t = new AstCumu.CumuTask(f.anyVec().nChunks(), init());
        t.doAll(new byte[] {Vec.T_NUM}, f.anyVec());
        final double[] chkCumu = t._chkCumu;
        Vec cumuVec = t.outputFrame().anyVec();
        new MRTask() {
          @Override
          public void map(Chunk c) {
            if (c.cidx() != 0) {
              double d = chkCumu[c.cidx() - 1];
              for (int i = 0; i < c._len; ++i) c.set(i, op(c.atd(i), d));
            }
          }
        }.doAll(cumuVec);
        return new ValFrame(new Frame(cumuVec));
      } else {
        return new ValFrame(new Frame(f));
      }
    } else {

      if (axis == 0.0) { // down the column implementation

        AstCumu.CumuTaskWholeFrame t =
            new AstCumu.CumuTaskWholeFrame(f.anyVec().nChunks(), init(), f.numCols());
        Frame fr2 = t.doAll(f.numCols(), Vec.T_NUM, f).outputFrame(null, f.names(), null);
        final double[][] chkCumu = t._chkCumu;
        new MRTask() {
          @Override
          public void map(Chunk cs[]) {
            if (cs[0].cidx() != 0) {
              for (int i = 0; i < cs.length; i++) {
                double d = chkCumu[i][cs[i].cidx() - 1];
                for (int j = 0; j < cs[i]._len; ++j) cs[i].set(j, op(cs[i].atd(j), d));
              }
            }
          }
        }.doAll(fr2);
        return new ValFrame(new Frame(fr2));

      } else {
        AstCumu.CumuTaskAxis1 t = new AstCumu.CumuTaskAxis1(init());
        Frame fr2 = t.doAll(f.numCols(), Vec.T_NUM, f).outputFrame(null, f.names(), null);
        return new ValFrame(new Frame(fr2));
      }
    }
  }
Ejemplo n.º 2
0
  @Override
  protected Frame predictScoreImpl(Frame orig, Frame adaptedFr, String destination_key) {
    Frame adaptFrm = new Frame(adaptedFr);
    for (int i = 0; i < _parms._k; i++)
      adaptFrm.add("PC" + String.valueOf(i + 1), adaptFrm.anyVec().makeZero());

    new MRTask() {
      @Override
      public void map(Chunk chks[]) {
        double tmp[] = new double[_output._names.length];
        double preds[] = new double[_parms._k];
        for (int row = 0; row < chks[0]._len; row++) {
          double p[] = score0(chks, row, tmp, preds);
          for (int c = 0; c < preds.length; c++) chks[_output._names.length + c].set(row, p[c]);
        }
      }
    }.doAll(adaptFrm);

    // Return the projection into principal component space
    int x = _output._names.length, y = adaptFrm.numCols();
    Frame f =
        adaptFrm.extractFrame(
            x, y); // this will call vec_impl() and we cannot call the delete() below just yet

    f =
        new Frame(
            (null == destination_key ? Key.make() : Key.make(destination_key)),
            f.names(),
            f.vecs());
    DKV.put(f);
    makeMetricBuilder(null).makeModelMetrics(this, orig);
    return f;
  }
Ejemplo n.º 3
0
 @Override
 public void map(Chunk[] ix, NewChunk[] ncs) {
   final Vec[] vecs = new Vec[_cols.length];
   final Vec anyv = _base.anyVec();
   final long nrow = anyv.length();
   long r = ix[0].at80(0);
   int last_ci = anyv.elem2ChunkIdx(r < nrow ? r : 0); // memoize the last chunk index
   long last_c0 = anyv._espc[last_ci]; // ...         last chunk start
   long last_c1 = anyv._espc[last_ci + 1]; // ...         last chunk end
   Chunk[] last_cs = new Chunk[vecs.length]; // ...         last chunks
   for (int c = 0; c < _cols.length; c++) {
     vecs[c] = _base.vecs()[_cols[c]];
     last_cs[c] = vecs[c].elem2BV(last_ci);
   }
   for (int i = 0; i < ix[0]._len; i++) {
     // select one row
     r = ix[0].at80(i) - 1; // next row to select
     if (r < 0) continue;
     if (r >= nrow) {
       for (int c = 0; c < vecs.length; c++) ncs[c].addNum(Double.NaN);
     } else {
       if (r < last_c0 || r >= last_c1) {
         last_ci = anyv.elem2ChunkIdx(r);
         last_c0 = anyv._espc[last_ci];
         last_c1 = anyv._espc[last_ci + 1];
         for (int c = 0; c < vecs.length; c++) last_cs[c] = vecs[c].elem2BV(last_ci);
       }
       for (int c = 0; c < vecs.length; c++) ncs[c].addNum(last_cs[c].at(r));
     }
   }
 }
Ejemplo n.º 4
0
 @Override
 public void onCompletion(CountedCompleter caller) {
   assert _out.numRows() == _in.numRows();
   assert _out.anyVec()._espc.length == (_nchunks + 1);
   _in.unlock(_jobKey);
   _out.update(_jobKey);
   _out.unlock(_jobKey);
 }
Ejemplo n.º 5
0
 /** Appends an entire Frame */
 public Frame add(Frame fr) {
   assert anyVec().group().equals(fr.anyVec().group());
   final int len0 = _names.length;
   final int len1 = fr._names.length;
   final int len = len0 + len1;
   _names = Arrays.copyOf(_names, len);
   _vecs = Arrays.copyOf(_vecs, len);
   _keys = Arrays.copyOf(_keys, len);
   System.arraycopy(fr._names, 0, _names, len0, len1);
   System.arraycopy(fr._vecs, 0, _vecs, len0, len1);
   System.arraycopy(fr._keys, 0, _keys, len0, len1);
   return this;
 }
Ejemplo n.º 6
0
 // Make vector templates for all output frame vectors
 private Vec[][] makeTemplates(Frame dataset, float[] ratios) {
   Vec anyVec = dataset.anyVec();
   final long[][] espcPerSplit = computeEspcPerSplit(anyVec._espc, anyVec.length(), ratios);
   final int num = dataset.numCols(); // number of columns in input frame
   final int nsplits = espcPerSplit.length; // number of splits
   final String[][] domains = dataset.domains(); // domains
   Vec[][] t = new Vec[nsplits][ /*num*/]; // resulting vectors for all
   for (int i = 0; i < nsplits; i++) {
     // vectors for j-th split
     t[i] = new Vec(Vec.newKey(), espcPerSplit[i /*-th split*/]).makeZeros(num, domains);
   }
   return t;
 }
Ejemplo n.º 7
0
 @Override
 Val apply(Env env, Env.StackHelp stk, AST asts[]) {
   Frame fr = stk.track(asts[1].exec(env)).getFrame();
   if (fr.numCols() == 1 && fr.numRows() == 1) {
     if (fr.anyVec().isNumeric() || fr.anyVec().isBad()) return new ValNum(fr.anyVec().at(0));
     else if (fr.anyVec().isString())
       return new ValStr(fr.anyVec().atStr(new BufferedString(), 0).toString());
     return new ValStr(fr.domains()[0][(int) fr.anyVec().at8(0)]);
   }
   return new ValFrame(fr); // did not flatten
 }
Ejemplo n.º 8
0
  @Test
  public void testGroupbyTableSpeed() {
    Frame ids = parse_test_file(Key.make("cov"), "smalldata/junit/id_cols.csv");
    ids.replace(0, ids.anyVec().toCategoricalVec()).remove();
    System.out.println(ids.toString(0, 10));

    long start = System.currentTimeMillis();
    Val v_gb = Exec.exec("(GB cov [0] nrow 0 \"all\")");
    System.out.println("GB Time= " + (System.currentTimeMillis() - start) + "msec");
    System.out.println(v_gb.toString());
    ((ValFrame) v_gb)._fr.delete();

    long start2 = System.currentTimeMillis();
    Val v_tb = Exec.exec("(table cov FALSE)");
    System.out.println("Table Time= " + (System.currentTimeMillis() - start2) + "msec");
    System.out.println(v_tb.toString());
    ((ValFrame) v_tb)._fr.delete();

    ids.delete();
  }
Ejemplo n.º 9
0
 /**
  * Compute the L2 norm for each row of the frame
  *
  * @param fr Input frame
  * @return Vec containing L2 values for each row, is in K-V store
  */
 public static Vec getL2(final Frame fr, final double[] scale) {
   // add workspace vec at end
   final int idx = fr.numCols();
   assert (scale.length == idx) : "Mismatch for number of columns";
   fr.add("L2", fr.anyVec().makeZero());
   Vec res;
   try {
     new MRTask2() {
       @Override
       public void map(Chunk[] cs) {
         for (int r = 0; r < cs[0]._len; r++) {
           double norm2 = 0;
           for (int i = 0; i < idx; i++) norm2 += Math.pow(cs[i].at0(r) * scale[i], 2);
           cs[idx].set0(r, Math.sqrt(norm2));
         }
       }
     }.doAll(fr);
   } finally {
     res = fr.remove(idx);
   }
   res.rollupStats();
   return res;
 }
Ejemplo n.º 10
0
 @Override
 protected Frame rebalance(final Frame original_fr, boolean local, final String name) {
   if (original_fr == null) return null;
   if (_parms._force_load_balance) {
     int original_chunks = original_fr.anyVec().nChunks();
     _job.update(0, "Load balancing " + name.substring(name.length() - 5) + " data...");
     int chunks = desiredChunks(original_fr, local);
     if (!_parms._reproducible) {
       if (original_chunks >= chunks) {
         if (!_parms._quiet_mode)
           Log.info(
               "Dataset already contains " + original_chunks + " chunks. No need to rebalance.");
         return original_fr;
       }
     } else { // reproducible, set chunks to 1
       assert chunks == 1;
       if (!_parms._quiet_mode)
         Log.warn("Reproducibility enforced - using only 1 thread - can be slow.");
       if (original_chunks == 1) return original_fr;
     }
     if (!_parms._quiet_mode)
       Log.info(
           "Rebalancing "
               + name.substring(name.length() - 5)
               + " dataset into "
               + chunks
               + " chunks.");
     Key newKey = Key.make(name + ".chks" + chunks);
     RebalanceDataSet rb = new RebalanceDataSet(original_fr, newKey, chunks);
     H2O.submitTask(rb).join();
     Frame rebalanced_fr = DKV.get(newKey).get();
     Scope.track(rebalanced_fr);
     return rebalanced_fr;
   }
   return original_fr;
 }
Ejemplo n.º 11
0
  // GLRM scoring is data imputation based on feature domains using reconstructed XY (see Udell
  // (2015), Section 5.3)
  private Frame reconstruct(
      Frame orig,
      Frame adaptedFr,
      Key destination_key,
      boolean save_imputed,
      boolean reverse_transform) {
    final int ncols = _output._names.length;
    assert ncols == adaptedFr.numCols();
    String prefix = "reconstr_";

    // Need [A,X,P] where A = adaptedFr, X = loading frame, P = imputed frame
    // Note: A is adapted to original training frame, P has columns shuffled so cats come before
    // nums!
    Frame fullFrm = new Frame(adaptedFr);
    Frame loadingFrm = DKV.get(_output._representation_key).get();
    fullFrm.add(loadingFrm);
    String[][] adaptedDomme = adaptedFr.domains();
    for (int i = 0; i < ncols; i++) {
      Vec v = fullFrm.anyVec().makeZero();
      v.setDomain(adaptedDomme[i]);
      fullFrm.add(prefix + _output._names[i], v);
    }
    GLRMScore gs = new GLRMScore(ncols, _parms._k, save_imputed, reverse_transform).doAll(fullFrm);

    // Return the imputed training frame
    int x = ncols + _parms._k, y = fullFrm.numCols();
    Frame f =
        fullFrm.extractFrame(
            x, y); // this will call vec_impl() and we cannot call the delete() below just yet

    f = new Frame((null == destination_key ? Key.make() : destination_key), f.names(), f.vecs());
    DKV.put(f);
    gs._mb.makeModelMetrics(
        GLRMModel.this, orig, null, null); // save error metrics based on imputed data
    return f;
  }
Ejemplo n.º 12
0
    /**
     * Train a Deep Learning neural net model
     *
     * @param model Input model (e.g., from initModel(), or from a previous training run)
     * @return Trained model
     */
    public final DeepLearningModel trainModel(DeepLearningModel model) {
      Frame validScoreFrame = null;
      Frame train, trainScoreFrame;
      try {
        //      if (checkpoint == null && !quiet_mode) logStart(); //if checkpoint is given, some
        // Job's params might be uninitialized (but the restarted model's parameters are correct)
        if (model == null) {
          model = DKV.get(dest()).get();
        }
        Log.info(
            "Model category: "
                + (_parms._autoencoder
                    ? "Auto-Encoder"
                    : isClassifier() ? "Classification" : "Regression"));
        final long model_size = model.model_info().size();
        Log.info(
            "Number of model parameters (weights/biases): " + String.format("%,d", model_size));
        model.write_lock(_job);
        _job.update(0, "Setting up training data...");
        final DeepLearningParameters mp = model.model_info().get_params();

        // temporary frames of the same "name" as the orig _train/_valid (asking the parameter's
        // Key, not the actual frame)
        // Note: don't put into DKV or they would overwrite the _train/_valid frames!
        Frame tra_fr = new Frame(mp._train, _train.names(), _train.vecs());
        Frame val_fr = _valid != null ? new Frame(mp._valid, _valid.names(), _valid.vecs()) : null;

        train = tra_fr;
        if (model._output.isClassifier() && mp._balance_classes) {
          _job.update(0, "Balancing class distribution of training data...");
          float[] trainSamplingFactors =
              new float
                  [train
                      .lastVec()
                      .domain()
                      .length]; // leave initialized to 0 -> will be filled up below
          if (mp._class_sampling_factors != null) {
            if (mp._class_sampling_factors.length != train.lastVec().domain().length)
              throw new IllegalArgumentException(
                  "class_sampling_factors must have "
                      + train.lastVec().domain().length
                      + " elements");
            trainSamplingFactors =
                mp._class_sampling_factors.clone(); // clone: don't modify the original
          }
          train =
              sampleFrameStratified(
                  train,
                  train.lastVec(),
                  train.vec(model._output.weightsName()),
                  trainSamplingFactors,
                  (long) (mp._max_after_balance_size * train.numRows()),
                  mp._seed,
                  true,
                  false);
          Vec l = train.lastVec();
          Vec w = train.vec(model._output.weightsName());
          MRUtils.ClassDist cd = new MRUtils.ClassDist(l);
          model._output._modelClassDist =
              _weights != null ? cd.doAll(l, w).rel_dist() : cd.doAll(l).rel_dist();
        }
        model.training_rows = train.numRows();
        if (_weights != null && _weights.min() == 0 && _weights.max() == 1 && _weights.isInt()) {
          model.training_rows = Math.round(train.numRows() * _weights.mean());
          Log.warn(
              "Not counting "
                  + (train.numRows() - model.training_rows)
                  + " rows with weight=0 towards an epoch.");
        }
        Log.info("One epoch corresponds to " + model.training_rows + " training data rows.");
        trainScoreFrame =
            sampleFrame(
                train,
                mp._score_training_samples,
                mp._seed); // training scoring dataset is always sampled uniformly from the training
                           // dataset
        if (trainScoreFrame != train) Scope.track(trainScoreFrame);

        if (!_parms._quiet_mode)
          Log.info("Number of chunks of the training data: " + train.anyVec().nChunks());
        if (val_fr != null) {
          model.validation_rows = val_fr.numRows();
          // validation scoring dataset can be sampled in multiple ways from the given validation
          // dataset
          if (model._output.isClassifier()
              && mp._balance_classes
              && mp._score_validation_sampling
                  == DeepLearningParameters.ClassSamplingMethod.Stratified) {
            _job.update(0, "Sampling validation data (stratified)...");
            validScoreFrame =
                sampleFrameStratified(
                    val_fr,
                    val_fr.lastVec(),
                    val_fr.vec(model._output.weightsName()),
                    null,
                    mp._score_validation_samples > 0
                        ? mp._score_validation_samples
                        : val_fr.numRows(),
                    mp._seed + 1,
                    false /* no oversampling */,
                    false);
          } else {
            _job.update(0, "Sampling validation data...");
            validScoreFrame = sampleFrame(val_fr, mp._score_validation_samples, mp._seed + 1);
            if (validScoreFrame != val_fr) Scope.track(validScoreFrame);
          }
          if (!_parms._quiet_mode)
            Log.info(
                "Number of chunks of the validation data: " + validScoreFrame.anyVec().nChunks());
        }

        // Set train_samples_per_iteration size (cannot be done earlier since this depends on
        // whether stratified sampling is done)
        model.actual_train_samples_per_iteration =
            computeTrainSamplesPerIteration(mp, model.training_rows, model);
        // Determine whether shuffling is enforced
        if (mp._replicate_training_data
            && (model.actual_train_samples_per_iteration
                == model.training_rows * (mp._single_node_mode ? 1 : H2O.CLOUD.size()))
            && !mp._shuffle_training_data
            && H2O.CLOUD.size() > 1
            && !mp._reproducible) {
          if (!mp._quiet_mode)
            Log.info(
                "Enabling training data shuffling, because all nodes train on the full dataset (replicated training data).");
          mp._shuffle_training_data = true;
        }
        if (!mp._shuffle_training_data
            && model.actual_train_samples_per_iteration == model.training_rows
            && train.anyVec().nChunks() == 1) {
          if (!mp._quiet_mode)
            Log.info(
                "Enabling training data shuffling to avoid training rows in the same order over and over (no Hogwild since there's only 1 chunk).");
          mp._shuffle_training_data = true;
        }

        //        if (!mp._quiet_mode) Log.info("Initial model:\n" + model.model_info());
        long now = System.currentTimeMillis();
        model._timeLastIterationEnter = now;
        if (_parms._autoencoder) {
          _job.update(0, "Scoring null model of autoencoder...");
          if (!mp._quiet_mode) Log.info("Scoring the null model of the autoencoder.");
          model.doScoring(
              trainScoreFrame,
              validScoreFrame,
              _job._key,
              0,
              false); // get the null model reconstruction error
        }
        // put the initial version of the model into DKV
        model.update(_job);
        model.total_setup_time_ms += now - _job.start_time();
        Log.info("Total setup time: " + PrettyPrint.msecs(model.total_setup_time_ms, true));
        Log.info("Starting to train the Deep Learning model.");
        _job.update(0, "Training...");

        // main loop
        for (; ; ) {
          model.iterations++;
          model.set_model_info(
              mp._epochs == 0
                  ? model.model_info()
                  : H2O.CLOUD.size() > 1 && mp._replicate_training_data
                      ? (mp._single_node_mode
                          ? new DeepLearningTask2(
                                  _job._key,
                                  train,
                                  model.model_info(),
                                  rowFraction(train, mp, model),
                                  model.iterations)
                              .doAll(Key.make(H2O.SELF))
                              .model_info()
                          : // replicated data + single node mode
                          new DeepLearningTask2(
                                  _job._key,
                                  train,
                                  model.model_info(),
                                  rowFraction(train, mp, model),
                                  model.iterations)
                              .doAllNodes()
                              .model_info())
                      : // replicated data + multi-node mode
                      new DeepLearningTask(
                              _job._key,
                              model.model_info(),
                              rowFraction(train, mp, model),
                              model.iterations)
                          .doAll(train)
                          .model_info()); // distributed data (always in multi-node mode)
          if (stop_requested() && !timeout()) break; // cancellation
          if (!model.doScoring(
              trainScoreFrame, validScoreFrame, _job._key, model.iterations, false))
            break; // finished training (or early stopping or convergence)
          if (timeout()) break; // stop after scoring
        }

        // replace the model with the best model so far (if it's better)
        if (!stop_requested()
            && _parms._overwrite_with_best_model
            && model.actual_best_model_key != null
            && _parms._nfolds == 0) {
          DeepLearningModel best_model = DKV.getGet(model.actual_best_model_key);
          if (best_model != null
              && best_model.loss() < model.loss()
              && Arrays.equals(best_model.model_info().units, model.model_info().units)) {
            if (!_parms._quiet_mode)
              Log.info("Setting the model to be the best model so far (based on scoring history).");
            DeepLearningModelInfo mi = best_model.model_info().deep_clone();
            // Don't cheat - count full amount of training samples, since that's the amount of
            // training it took to train (without finding anything better)
            mi.set_processed_global(model.model_info().get_processed_global());
            mi.set_processed_local(model.model_info().get_processed_local());
            model.set_model_info(mi);
            model.update(_job);
            model.doScoring(trainScoreFrame, validScoreFrame, _job._key, model.iterations, true);
            assert (best_model.loss() == model.loss());
          }
        }
        // store coefficient names for future use
        // possibly change
        model.model_info().data_info().coefNames();
        if (!_parms._quiet_mode) {
          Log.info(
              "==============================================================================================================================================================================");
          if (stop_requested()) {
            Log.info("Deep Learning model training was interrupted.");
          } else {
            Log.info("Finished training the Deep Learning model.");
            Log.info(model);
          }
          Log.info(
              "==============================================================================================================================================================================");
        }
      } finally {
        if (model != null) {
          model.deleteElasticAverageModels();
          model.unlock(_job);
          if (model.actual_best_model_key != null) {
            assert (model.actual_best_model_key != model._key);
            DKV.remove(model.actual_best_model_key);
          }
        }
      }
      return model;
    }
Ejemplo n.º 13
0
  @Override
  Val apply(Env env, Env.StackHelp stk, AST asts[]) {

    // Execute all args.  Find a canonical frame; all Frames must look like this one.
    // Each argument turns into either a Frame (whose rows are entirely
    // inlined) or a scalar (which is replicated across as a single row).
    Frame fr = null; // Canonical Frame; all frames have the same column count, types and names
    int nchks = 0; // Total chunks
    Val vals[] = new Val[asts.length]; // Computed AST results
    for (int i = 1; i < asts.length; i++) {
      vals[i] = stk.track(asts[i].exec(env));
      if (vals[i].isFrame()) {
        fr = vals[i].getFrame();
        nchks += fr.anyVec().nChunks(); // Total chunks
      } else nchks++; // One chunk per scalar
    }
    // No Frame, just a pile-o-scalars?
    Vec zz = null; // The zero-length vec for the zero-frame frame
    if (fr == null) { // Zero-length, 1-column, default name
      fr = new Frame(new String[] {Frame.defaultColName(0)}, new Vec[] {zz = Vec.makeZero(0)});
      if (asts.length == 1) return new ValFrame(fr);
    }

    // Verify all Frames are the same columns, names, and types.  Domains can vary, and will be the
    // union
    final Frame frs[] = new Frame[asts.length]; // Input frame
    final byte[] types = fr.types(); // Column types
    final int ncols = fr.numCols();
    final long[] espc = new long[nchks + 1]; // Compute a new layout!
    int coffset = 0;

    for (int i = 1; i < asts.length; i++) {
      Val val = vals[i]; // Save values computed for pass 2
      Frame fr0 =
          val.isFrame()
              ? val.getFrame()
              // Scalar: auto-expand into a 1-row frame
              : stk.track(new Frame(fr._names, Vec.makeCons(val.getNum(), 1L, fr.numCols())));

      // Check that all frames are compatible
      if (fr.numCols() != fr0.numCols())
        throw new IllegalArgumentException(
            "rbind frames must have all the same columns, found "
                + fr.numCols()
                + " and "
                + fr0.numCols()
                + " columns.");
      if (!Arrays.deepEquals(fr._names, fr0._names))
        throw new IllegalArgumentException(
            "rbind frames must have all the same column names, found "
                + Arrays.toString(fr._names)
                + " and "
                + Arrays.toString(fr0._names));
      if (!Arrays.equals(types, fr0.types()))
        throw new IllegalArgumentException(
            "rbind frames must have all the same column types, found "
                + Arrays.toString(types)
                + " and "
                + Arrays.toString(fr0.types()));

      frs[i] = fr0; // Save frame

      // Roll up the ESPC row counts
      long roffset = espc[coffset];
      long[] espc2 = fr0.anyVec().espc();
      for (int j = 1; j < espc2.length; j++) // Roll up the row counts
      espc[coffset + j] = (roffset + espc2[j]);
      coffset += espc2.length - 1; // Chunk offset
    }
    if (zz != null) zz.remove();

    // build up the new domains for each vec
    HashMap<String, Integer>[] dmap = new HashMap[types.length];
    String[][] domains = new String[types.length][];
    int[][][] cmaps = new int[types.length][][];
    for (int k = 0; k < types.length; ++k) {
      dmap[k] = new HashMap<>();
      int c = 0;
      byte t = types[k];
      if (t == Vec.T_CAT) {
        int[][] maps = new int[frs.length][];
        for (int i = 1; i < frs.length; i++) {
          maps[i] = new int[frs[i].vec(k).domain().length];
          for (int j = 0; j < maps[i].length; j++) {
            String s = frs[i].vec(k).domain()[j];
            if (!dmap[k].containsKey(s)) dmap[k].put(s, maps[i][j] = c++);
            else maps[i][j] = dmap[k].get(s);
          }
        }
        cmaps[k] = maps;
      } else {
        cmaps[k] = new int[frs.length][];
      }
      domains[k] = c == 0 ? null : new String[c];
      for (Map.Entry<String, Integer> e : dmap[k].entrySet()) domains[k][e.getValue()] = e.getKey();
    }

    // Now make Keys for the new Vecs
    Key<Vec>[] keys = fr.anyVec().group().addVecs(fr.numCols());
    Vec[] vecs = new Vec[fr.numCols()];
    int rowLayout = Vec.ESPC.rowLayout(keys[0], espc);
    for (int i = 0; i < vecs.length; i++)
      vecs[i] = new Vec(keys[i], rowLayout, domains[i], types[i]);

    // Do the row-binds column-by-column.
    // Switch to F/J thread for continuations
    ParallelRbinds t;
    H2O.submitTask(t = new ParallelRbinds(frs, espc, vecs, cmaps)).join();
    return new ValFrame(new Frame(fr.names(), t._vecs));
  }
Ejemplo n.º 14
0
  public Frame deepSlice(Object orows, Object ocols) {
    // ocols is either a long[] or a Frame-of-1-Vec
    long[] cols;
    if (ocols == null) {
      cols = (long[]) ocols;
      assert cols == null;
    } else {
      if (ocols instanceof long[]) {
        cols = (long[]) ocols;
      } else if (ocols instanceof Frame) {
        Frame fr = (Frame) ocols;
        if (fr.numCols() != 1) {
          throw new IllegalArgumentException(
              "Columns Frame must have only one column (actually has "
                  + fr.numCols()
                  + " columns)");
        }

        long n = fr.anyVec().length();
        if (n > MAX_EQ2_COLS) {
          throw new IllegalArgumentException(
              "Too many requested columns (requested " + n + ", max " + MAX_EQ2_COLS + ")");
        }

        cols = new long[(int) n];
        Vec v = fr._vecs[0];
        for (long i = 0; i < v.length(); i++) {
          cols[(int) i] = v.at8(i);
        }
      } else {
        throw new IllegalArgumentException(
            "Columns is specified by an unsupported data type ("
                + ocols.getClass().getName()
                + ")");
      }
    }

    // Since cols is probably short convert to a positive list.
    int c2[] = null;
    if (cols == null) {
      c2 = new int[numCols()];
      for (int i = 0; i < c2.length; i++) c2[i] = i;
    } else if (cols.length == 0) {
      c2 = new int[0];
    } else if (cols[0] > 0) {
      c2 = new int[cols.length];
      for (int i = 0; i < cols.length; i++)
        c2[i] = (int) cols[i] - 1; // Convert 1-based cols to zero-based
    } else {
      c2 = new int[numCols() - cols.length];
      int j = 0;
      for (int i = 0; i < numCols(); i++) {
        if (j >= cols.length || i < (-cols[j] - 1)) c2[i - j] = i;
        else j++;
      }
    }
    for (int i = 0; i < c2.length; i++)
      if (c2[i] >= numCols())
        throw new IllegalArgumentException(
            "Trying to select column " + c2[i] + " but only " + numCols() + " present.");
    if (c2.length == 0)
      throw new IllegalArgumentException(
          "No columns selected (did you try to select column 0 instead of column 1?)");

    // Do Da Slice
    // orows is either a long[] or a Vec
    if (orows == null)
      return new DeepSlice((long[]) orows, c2)
          .doAll(c2.length, this)
          .outputFrame(names(c2), domains(c2));
    else if (orows instanceof long[]) {
      final long CHK_ROWS = 1000000;
      long[] rows = (long[]) orows;
      if (rows.length == 0)
        return new DeepSlice(rows, c2).doAll(c2.length, this).outputFrame(names(c2), domains(c2));
      if (rows[0] < 0)
        return new DeepSlice(rows, c2).doAll(c2.length, this).outputFrame(names(c2), domains(c2));
      // Vec'ize the index array
      AppendableVec av = new AppendableVec("rownames");
      int r = 0;
      int c = 0;
      while (r < rows.length) {
        NewChunk nc = new NewChunk(av, c);
        long end = Math.min(r + CHK_ROWS, rows.length);
        for (; r < end; r++) {
          nc.addNum(rows[r]);
        }
        nc.close(c++, null);
      }
      Vec c0 = av.close(null); // c0 is the row index vec
      Frame fr2 =
          new Slice(c2, this)
              .doAll(c2.length, new Frame(new String[] {"rownames"}, new Vec[] {c0}))
              .outputFrame(names(c2), domains(c2));
      UKV.remove(c0._key); // Remove hidden vector
      return fr2;
    }
    Frame frows = (Frame) orows;
    Vec vrows = frows.anyVec();
    // It's a compatible Vec; use it as boolean selector.
    // Build column names for the result.
    Vec[] vecs = new Vec[c2.length + 1];
    String[] names = new String[c2.length + 1];
    for (int i = 0; i < c2.length; ++i) {
      vecs[i] = _vecs[c2[i]];
      names[i] = _names[c2[i]];
    }
    vecs[c2.length] = vrows;
    names[c2.length] = "predicate";
    return new DeepSelect()
        .doAll(c2.length, new Frame(names, vecs))
        .outputFrame(names(c2), domains(c2));
  }