Пример #1
0
 @Override
 protected void execImpl() {
   Vec va = null, vp = null, avp = null;
   try {
     if (classification) {
       // Create a new vectors - it is cheap since vector are only adaptation vectors
       va = vactual.toEnum(); // always returns TransfVec
       actual_domain = va._domain;
       vp = vpredict.toEnum(); // always returns TransfVec
       predicted_domain = vp._domain;
       if (!Arrays.equals(actual_domain, predicted_domain)) {
         domain = Utils.domainUnion(actual_domain, predicted_domain);
         int[][] vamap = Model.getDomainMapping(domain, actual_domain, true);
         va = TransfVec.compose((TransfVec) va, vamap, domain, false); // delete original va
         int[][] vpmap = Model.getDomainMapping(domain, predicted_domain, true);
         vp = TransfVec.compose((TransfVec) vp, vpmap, domain, false); // delete original vp
       } else domain = actual_domain;
       // The vectors are from different groups => align them, but properly delete it after
       // computation
       if (!va.group().equals(vp.group())) {
         avp = vp;
         vp = va.align(vp);
       }
       cm = new CM(domain.length).doAll(va, vp)._cm;
     } else {
       mse = new CM(1).doAll(vactual, vpredict).mse();
     }
     return;
   } finally { // Delete adaptation vectors
     if (va != null) UKV.remove(va._key);
     if (vp != null) UKV.remove(vp._key);
     if (avp != null) UKV.remove(avp._key);
   }
 }
Пример #2
0
  private static void addFolder(FileSystem fs, Path p, JsonArray succeeded, JsonArray failed) {
    try {
      if (fs == null) return;
      for (FileStatus file : fs.listStatus(p)) {
        Path pfs = file.getPath();
        if (file.isDir()) {
          addFolder(fs, pfs, succeeded, failed);
        } else {
          Key k = Key.make(pfs.toString());
          long size = file.getLen();
          Value val = null;
          if (pfs.getName().endsWith(Extensions.JSON)) {
            JsonParser parser = new JsonParser();
            JsonObject json = parser.parse(new InputStreamReader(fs.open(pfs))).getAsJsonObject();
            JsonElement v = json.get(Constants.VERSION);
            if (v == null) throw new InvalidDataException("Missing version");
            JsonElement type = json.get(Constants.TYPE);
            if (type == null) throw new InvalidDataException("Missing type");
            Class c = Class.forName(type.getAsString());
            Model model = (Model) c.newInstance();
            model.fromJson(json);
          } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
            FSDataInputStream s = fs.open(pfs);
            int sz = (int) Math.min(1L << 20, size); // Read up to the 1st meg
            byte[] mem = MemoryManager.malloc1(sz);
            s.readFully(mem);
            // Convert to a ValueArray (hope it fits in 1Meg!)
            ValueArray ary = new ValueArray(k, 0).read(new AutoBuffer(mem));
            val = new Value(k, ary, Value.HDFS);
          } else if (size >= 2 * ValueArray.CHUNK_SZ) {
            val =
                new Value(
                    k,
                    new ValueArray(k, size),
                    Value.HDFS); // ValueArray byte wrapper over a large file
          } else {
            val = new Value(k, (int) size, Value.HDFS); // Plain Value
          }
          val.setdsk();
          DKV.put(k, val);

          JsonObject o = new JsonObject();
          o.addProperty(Constants.KEY, k.toString());
          o.addProperty(Constants.FILE, pfs.toString());
          o.addProperty(Constants.VALUE_SIZE, file.getLen());
          succeeded.add(o);
        }
      }
    } catch (Exception e) {
      Log.err(e);
      JsonObject o = new JsonObject();
      o.addProperty(Constants.FILE, p.toString());
      o.addProperty(Constants.ERROR, e.getMessage());
      failed.add(o);
    }
  }
Пример #3
0
 public static Key buildKey(Model model, Frame frame) {
   return makeKey(
       "modelmetrics_" + model.getUniqueId().getUuid() + "_on_" + frame.getUniqueId().getUuid());
 }
Пример #4
0
  /**
   * The train/valid Frame instances are sorted by categorical (themselves sorted by cardinality
   * greatest to least) with all numerical columns following. The response column(s) are placed at
   * the end.
   *
   * <p>Interactions: 1. Num-Num (Note: N(0,1) * N(0,1) ~ N(0,1) ) 2. Num-Enum 3. Enum-Enum
   *
   * <p>Interactions are produced on the fly and are dense (in all 3 cases). Consumers of DataInfo
   * should not have to care how these interactions are generated. Any heuristic using the fullN
   * value should continue functioning the same.
   *
   * <p>Interactions are specified in two ways: A. As a list of pairs of column indices. B. As a
   * list of pairs of column indices with limited enums.
   */
  public DataInfo(
      Frame train,
      Frame valid,
      int nResponses,
      boolean useAllFactorLevels,
      TransformType predictor_transform,
      TransformType response_transform,
      boolean skipMissing,
      boolean imputeMissing,
      boolean missingBucket,
      boolean weight,
      boolean offset,
      boolean fold,
      Model.InteractionPair[] interactions) {
    super(Key.<DataInfo>make());
    _valid = valid != null;
    assert predictor_transform != null;
    assert response_transform != null;
    _offset = offset;
    _weights = weight;
    _fold = fold;
    assert !(skipMissing && imputeMissing) : "skipMissing and imputeMissing cannot both be true";
    _skipMissing = skipMissing;
    _imputeMissing = imputeMissing;
    _predictor_transform = predictor_transform;
    _response_transform = response_transform;
    _responses = nResponses;
    _useAllFactorLevels = useAllFactorLevels;
    _interactions = interactions;

    // create dummy InteractionWrappedVecs and shove them onto the front
    if (_interactions != null) {
      _interactionVecs = new int[_interactions.length];
      train =
          Model.makeInteractions(
                  train,
                  false,
                  _interactions,
                  _useAllFactorLevels,
                  _skipMissing,
                  predictor_transform == TransformType.STANDARDIZE)
              .add(train);
      if (valid != null)
        valid =
            Model.makeInteractions(
                    valid,
                    true,
                    _interactions,
                    _useAllFactorLevels,
                    _skipMissing,
                    predictor_transform == TransformType.STANDARDIZE)
                .add(valid); // FIXME: should be using the training subs/muls!
    }

    _permutation = new int[train.numCols()];
    final Vec[] tvecs = train.vecs();

    // Count categorical-vs-numerical
    final int n = tvecs.length - _responses - (offset ? 1 : 0) - (weight ? 1 : 0) - (fold ? 1 : 0);
    int[] nums = MemoryManager.malloc4(n);
    int[] cats = MemoryManager.malloc4(n);
    int nnums = 0, ncats = 0;
    for (int i = 0; i < n; ++i)
      if (tvecs[i].isCategorical()) cats[ncats++] = i;
      else nums[nnums++] = i;

    _nums = nnums;
    _cats = ncats;
    _catLvls = new int[ncats][];

    // sort the cats in the decreasing order according to their size
    for (int i = 0; i < ncats; ++i)
      for (int j = i + 1; j < ncats; ++j)
        if (tvecs[cats[i]].domain().length < tvecs[cats[j]].domain().length) {
          int x = cats[i];
          cats[i] = cats[j];
          cats[j] = x;
        }
    String[] names = new String[train.numCols()];
    Vec[] tvecs2 = new Vec[train.numCols()];

    // Compute the cardinality of each cat
    _catModes = new int[ncats];
    _catOffsets = MemoryManager.malloc4(ncats + 1);
    _catMissing = new boolean[ncats];
    int len = _catOffsets[0] = 0;
    int interactionIdx = 0; // simple index into the _interactionVecs array

    ArrayList<Integer> interactionIds;
    if (_interactions == null) {
      interactionIds = new ArrayList<>();
      for (int i = 0; i < tvecs.length; ++i)
        if (tvecs[i] instanceof InteractionWrappedVec) {
          interactionIds.add(i);
        }
      _interactionVecs = new int[interactionIds.size()];
      for (int i = 0; i < _interactionVecs.length; ++i) _interactionVecs[i] = interactionIds.get(i);
    }
    for (int i = 0; i < ncats; ++i) {
      names[i] = train._names[cats[i]];
      Vec v = (tvecs2[i] = tvecs[cats[i]]);
      _catMissing[i] = missingBucket; // needed for test time
      if (v instanceof InteractionWrappedVec) {
        if (_interactions != null) _interactions[interactionIdx].vecIdx = i;
        _interactionVecs[interactionIdx++] =
            i; // i (and not cats[i]) because this is the index in _adaptedFrame
        _catOffsets[i + 1] = (len += v.domain().length + (missingBucket ? 1 : 0));
      } else
        _catOffsets[i + 1] =
            (len +=
                v.domain().length
                    - (useAllFactorLevels ? 0 : 1)
                    + (missingBucket ? 1 : 0)); // missing values turn into a new factor level
      _catModes[i] =
          imputeMissing ? imputeCat(train.vec(cats[i])) : _catMissing[i] ? v.domain().length : -100;
      _permutation[i] = cats[i];
    }
    _numMeans = new double[nnums];
    _numOffsets = MemoryManager.malloc4(nnums + 1);
    _numOffsets[0] = len;
    boolean isIWV; // is InteractionWrappedVec?
    for (int i = 0; i < nnums; ++i) {
      names[i + ncats] = train._names[nums[i]];
      Vec v = train.vec(nums[i]);
      tvecs2[i + ncats] = v;
      isIWV = v instanceof InteractionWrappedVec;
      if (isIWV) {
        if (null != _interactions) _interactions[interactionIdx].vecIdx = i + ncats;
        _interactionVecs[interactionIdx++] = i + ncats;
      }
      _numOffsets[i + 1] = (len += (isIWV ? ((InteractionWrappedVec) v).expandedLength() : 1));
      _numMeans[i] = train.vec(nums[i]).mean();
      _permutation[i + ncats] = nums[i];
    }
    for (int i = names.length - nResponses - (weight ? 1 : 0) - (offset ? 1 : 0) - (fold ? 1 : 0);
        i < names.length;
        ++i) {
      names[i] = train._names[i];
      tvecs2[i] = train.vec(i);
    }
    _adaptedFrame = new Frame(names, tvecs2);
    train.restructure(names, tvecs2);
    if (valid != null) valid.restructure(names, valid.vecs(names));
    //    _adaptedFrame = train;

    setPredictorTransform(predictor_transform);
    if (_responses > 0) setResponseTransform(response_transform);
  }