Exemplo n.º 1
0
  public DataInfo(
      Frame train,
      Frame valid,
      int nResponses,
      boolean useAllFactorLevels,
      TransformType predictor_transform,
      TransformType response_transform,
      boolean skipMissing,
      boolean imputeMissing,
      boolean missingBucket,
      boolean weight,
      boolean offset,
      boolean fold) {
    super(Key.<DataInfo>make());
    _valid = false;
    assert predictor_transform != null;
    assert response_transform != null;
    _offset = offset;
    _weights = weight;
    _fold = fold;
    assert !(skipMissing && imputeMissing) : "skipMissing and imputeMissing cannot both be true";
    _skipMissing = skipMissing;
    _imputeMissing = imputeMissing;
    _predictor_transform = predictor_transform;
    _response_transform = response_transform;
    _responses = nResponses;
    _useAllFactorLevels = useAllFactorLevels;
    _permutation = new int[train.numCols()];
    final Vec[] tvecs = train.vecs();

    // Count categorical-vs-numerical
    final int n = tvecs.length - _responses - (offset ? 1 : 0) - (weight ? 1 : 0) - (fold ? 1 : 0);
    int[] nums = MemoryManager.malloc4(n);
    int[] cats = MemoryManager.malloc4(n);
    int nnums = 0, ncats = 0;
    for (int i = 0; i < n; ++i)
      if (tvecs[i].isCategorical()) cats[ncats++] = i;
      else nums[nnums++] = i;
    _nums = nnums;
    _cats = ncats;
    _catLvls = new int[_cats][];

    // sort the cats in the decreasing order according to their size
    for (int i = 0; i < ncats; ++i)
      for (int j = i + 1; j < ncats; ++j)
        if (tvecs[cats[i]].domain().length < tvecs[cats[j]].domain().length) {
          int x = cats[i];
          cats[i] = cats[j];
          cats[j] = x;
        }
    String[] names = new String[train.numCols()];
    Vec[] tvecs2 = new Vec[train.numCols()];

    // Compute the cardinality of each cat
    _catModes = new int[_cats];
    _catOffsets = MemoryManager.malloc4(ncats + 1);
    _catMissing = new int[ncats];
    int len = _catOffsets[0] = 0;
    for (int i = 0; i < ncats; ++i) {
      _catModes[i] = imputeCat(train.vec(cats[i]));
      _permutation[i] = cats[i];
      names[i] = train._names[cats[i]];
      Vec v = (tvecs2[i] = tvecs[cats[i]]);
      _catMissing[i] = missingBucket ? 1 : 0; // needed for test time
      _catOffsets[i + 1] =
          (len +=
              v.domain().length
                  - (useAllFactorLevels ? 0 : 1)
                  + (missingBucket ? 1 : 0)); // missing values turn into a new factor level
    }
    _numMeans = new double[_nums];
    for (int i = 0; i < _nums; ++i) {
      names[i + _cats] = train._names[nums[i]];
      tvecs2[i + _cats] = train.vec(nums[i]);
      _numMeans[i] = train.vec(nums[i]).mean();
      _permutation[i + _cats] = nums[i];
    }
    for (int i = names.length - nResponses - (weight ? 1 : 0) - (offset ? 1 : 0) - (fold ? 1 : 0);
        i < names.length;
        ++i) {
      names[i] = train._names[i];
      tvecs2[i] = train.vec(i);
    }
    _adaptedFrame = new Frame(names, tvecs2);
    train.restructure(names, tvecs2);
    if (valid != null) valid.restructure(names, valid.vecs(names));
    //    _adaptedFrame = train;

    setPredictorTransform(predictor_transform);
    if (_responses > 0) setResponseTransform(response_transform);
  }
Exemplo n.º 2
0
  /**
   * The train/valid Frame instances are sorted by categorical (themselves sorted by cardinality
   * greatest to least) with all numerical columns following. The response column(s) are placed at
   * the end.
   *
   * <p>Interactions: 1. Num-Num (Note: N(0,1) * N(0,1) ~ N(0,1) ) 2. Num-Enum 3. Enum-Enum
   *
   * <p>Interactions are produced on the fly and are dense (in all 3 cases). Consumers of DataInfo
   * should not have to care how these interactions are generated. Any heuristic using the fullN
   * value should continue functioning the same.
   *
   * <p>Interactions are specified in two ways: A. As a list of pairs of column indices. B. As a
   * list of pairs of column indices with limited enums.
   */
  public DataInfo(
      Frame train,
      Frame valid,
      int nResponses,
      boolean useAllFactorLevels,
      TransformType predictor_transform,
      TransformType response_transform,
      boolean skipMissing,
      boolean imputeMissing,
      boolean missingBucket,
      boolean weight,
      boolean offset,
      boolean fold,
      Model.InteractionPair[] interactions) {
    super(Key.<DataInfo>make());
    _valid = valid != null;
    assert predictor_transform != null;
    assert response_transform != null;
    _offset = offset;
    _weights = weight;
    _fold = fold;
    assert !(skipMissing && imputeMissing) : "skipMissing and imputeMissing cannot both be true";
    _skipMissing = skipMissing;
    _imputeMissing = imputeMissing;
    _predictor_transform = predictor_transform;
    _response_transform = response_transform;
    _responses = nResponses;
    _useAllFactorLevels = useAllFactorLevels;
    _interactions = interactions;

    // create dummy InteractionWrappedVecs and shove them onto the front
    if (_interactions != null) {
      _interactionVecs = new int[_interactions.length];
      train =
          Model.makeInteractions(
                  train,
                  false,
                  _interactions,
                  _useAllFactorLevels,
                  _skipMissing,
                  predictor_transform == TransformType.STANDARDIZE)
              .add(train);
      if (valid != null)
        valid =
            Model.makeInteractions(
                    valid,
                    true,
                    _interactions,
                    _useAllFactorLevels,
                    _skipMissing,
                    predictor_transform == TransformType.STANDARDIZE)
                .add(valid); // FIXME: should be using the training subs/muls!
    }

    _permutation = new int[train.numCols()];
    final Vec[] tvecs = train.vecs();

    // Count categorical-vs-numerical
    final int n = tvecs.length - _responses - (offset ? 1 : 0) - (weight ? 1 : 0) - (fold ? 1 : 0);
    int[] nums = MemoryManager.malloc4(n);
    int[] cats = MemoryManager.malloc4(n);
    int nnums = 0, ncats = 0;
    for (int i = 0; i < n; ++i)
      if (tvecs[i].isCategorical()) cats[ncats++] = i;
      else nums[nnums++] = i;

    _nums = nnums;
    _cats = ncats;
    _catLvls = new int[ncats][];

    // sort the cats in the decreasing order according to their size
    for (int i = 0; i < ncats; ++i)
      for (int j = i + 1; j < ncats; ++j)
        if (tvecs[cats[i]].domain().length < tvecs[cats[j]].domain().length) {
          int x = cats[i];
          cats[i] = cats[j];
          cats[j] = x;
        }
    String[] names = new String[train.numCols()];
    Vec[] tvecs2 = new Vec[train.numCols()];

    // Compute the cardinality of each cat
    _catModes = new int[ncats];
    _catOffsets = MemoryManager.malloc4(ncats + 1);
    _catMissing = new boolean[ncats];
    int len = _catOffsets[0] = 0;
    int interactionIdx = 0; // simple index into the _interactionVecs array

    ArrayList<Integer> interactionIds;
    if (_interactions == null) {
      interactionIds = new ArrayList<>();
      for (int i = 0; i < tvecs.length; ++i)
        if (tvecs[i] instanceof InteractionWrappedVec) {
          interactionIds.add(i);
        }
      _interactionVecs = new int[interactionIds.size()];
      for (int i = 0; i < _interactionVecs.length; ++i) _interactionVecs[i] = interactionIds.get(i);
    }
    for (int i = 0; i < ncats; ++i) {
      names[i] = train._names[cats[i]];
      Vec v = (tvecs2[i] = tvecs[cats[i]]);
      _catMissing[i] = missingBucket; // needed for test time
      if (v instanceof InteractionWrappedVec) {
        if (_interactions != null) _interactions[interactionIdx].vecIdx = i;
        _interactionVecs[interactionIdx++] =
            i; // i (and not cats[i]) because this is the index in _adaptedFrame
        _catOffsets[i + 1] = (len += v.domain().length + (missingBucket ? 1 : 0));
      } else
        _catOffsets[i + 1] =
            (len +=
                v.domain().length
                    - (useAllFactorLevels ? 0 : 1)
                    + (missingBucket ? 1 : 0)); // missing values turn into a new factor level
      _catModes[i] =
          imputeMissing ? imputeCat(train.vec(cats[i])) : _catMissing[i] ? v.domain().length : -100;
      _permutation[i] = cats[i];
    }
    _numMeans = new double[nnums];
    _numOffsets = MemoryManager.malloc4(nnums + 1);
    _numOffsets[0] = len;
    boolean isIWV; // is InteractionWrappedVec?
    for (int i = 0; i < nnums; ++i) {
      names[i + ncats] = train._names[nums[i]];
      Vec v = train.vec(nums[i]);
      tvecs2[i + ncats] = v;
      isIWV = v instanceof InteractionWrappedVec;
      if (isIWV) {
        if (null != _interactions) _interactions[interactionIdx].vecIdx = i + ncats;
        _interactionVecs[interactionIdx++] = i + ncats;
      }
      _numOffsets[i + 1] = (len += (isIWV ? ((InteractionWrappedVec) v).expandedLength() : 1));
      _numMeans[i] = train.vec(nums[i]).mean();
      _permutation[i + ncats] = nums[i];
    }
    for (int i = names.length - nResponses - (weight ? 1 : 0) - (offset ? 1 : 0) - (fold ? 1 : 0);
        i < names.length;
        ++i) {
      names[i] = train._names[i];
      tvecs2[i] = train.vec(i);
    }
    _adaptedFrame = new Frame(names, tvecs2);
    train.restructure(names, tvecs2);
    if (valid != null) valid.restructure(names, valid.vecs(names));
    //    _adaptedFrame = train;

    setPredictorTransform(predictor_transform);
    if (_responses > 0) setResponseTransform(response_transform);
  }