Example No. 1
 private static void addFolder(FileSystem fs, Path p, JsonArray succeeded, JsonArray failed) {
   try {
     if (fs == null) return;
     for (FileStatus file : fs.listStatus(p)) {
       Path pfs = file.getPath();
       if (file.isDir()) {
         addFolder(fs, pfs, succeeded, failed);
       } else {
         Key k = Key.make(pfs.toString());
         long size = file.getLen();
         Value val = null;
         if (pfs.getName().endsWith(Extensions.JSON)) {
           JsonParser parser = new JsonParser();
           JsonObject json = parser.parse(new InputStreamReader(fs.open(pfs))).getAsJsonObject();
           JsonElement v = json.get(Constants.VERSION);
           if (v == null) throw new RuntimeException("Missing version");
           JsonElement type = json.get(Constants.TYPE);
           if (type == null) throw new RuntimeException("Missing type");
           Class c = Class.forName(type.getAsString());
           OldModel model = (OldModel) c.newInstance();
           model.fromJson(json);
         } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
           FSDataInputStream s = fs.open(pfs);
           int sz = (int) Math.min(1L << 20, size); // Read up to the 1st meg
           byte[] mem = MemoryManager.malloc1(sz);
           s.readFully(mem);
           // Convert to a ValueArray (hope it fits in 1Meg!)
           ValueArray ary = new ValueArray(k, 0).read(new AutoBuffer(mem));
           val = new Value(k, ary, Value.HDFS);
         } else if (size >= 2 * ValueArray.CHUNK_SZ) {
            val = new Value(k, new ValueArray(k, size), Value.HDFS); // ValueArray byte wrapper over a large file
         } else {
           val = new Value(k, (int) size, Value.HDFS); // Plain Value
           val.setdsk();
         }
         DKV.put(k, val);
         Log.info("PersistHdfs: DKV.put(" + k + ")");
         JsonObject o = new JsonObject();
         o.addProperty(Constants.KEY, k.toString());
         o.addProperty(Constants.FILE, pfs.toString());
         o.addProperty(Constants.VALUE_SIZE, file.getLen());
         succeeded.add(o);
       }
     }
   } catch (Exception e) {
     Log.err(e);
     JsonObject o = new JsonObject();
     o.addProperty(Constants.FILE, p.toString());
     o.addProperty(Constants.ERROR, e.getMessage());
     failed.add(o);
   }
 }
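Since addFolder is private and reports its results through the two Gson JsonArrays, a caller inside the same persistence class would look roughly like the sketch below. The HDFS URI, the Configuration setup, and the final logging are illustrative assumptions, not part of the original example:

  // Hypothetical caller, placed in the same class as addFolder.
  // Assumed imports: org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.FileSystem,
  // org.apache.hadoop.fs.Path, com.google.gson.JsonArray, com.google.gson.JsonObject.
  Configuration conf = new Configuration();
  Path root = new Path("hdfs://namenode:8020/datasets"); // hypothetical HDFS folder
  FileSystem fs = FileSystem.get(root.toUri(), conf);

  JsonArray succeeded = new JsonArray();
  JsonArray failed = new JsonArray();
  addFolder(fs, root, succeeded, failed); // recursively imports every file under root

  JsonObject response = new JsonObject();
  response.add("succeeded", succeeded); // keys registered in the DKV
  response.add("failed", failed);       // files that threw during import
  Log.info(response.toString());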
 @Override
 protected boolean chunkInit() {
   final int n_coef = _beta.length;
   sumWeightedCatX = MemoryManager.malloc8d(n_coef - (_dinfo._nums - _n_offsets));
   sumWeightedNumX = MemoryManager.malloc8d(_dinfo._nums);
   sizeRiskSet = MemoryManager.malloc8d(_n_time);
   sizeCensored = MemoryManager.malloc8d(_n_time);
   sizeEvents = MemoryManager.malloc8d(_n_time);
   countEvents = MemoryManager.malloc8(_n_time);
   sumRiskEvents = MemoryManager.malloc8d(_n_time);
   sumLogRiskEvents = MemoryManager.malloc8d(_n_time);
   rcumsumRisk = MemoryManager.malloc8d(_n_time);
   sumXEvents = malloc2DArray(_n_time, n_coef);
   sumXRiskEvents = malloc2DArray(_n_time, n_coef);
   rcumsumXRisk = malloc2DArray(_n_time, n_coef);
   sumXXRiskEvents = malloc3DArray(_n_time, n_coef, n_coef);
   rcumsumXXRisk = malloc3DArray(_n_time, n_coef, n_coef);
   return true;
 }
Example No. 3
 @Override
 public byte[] load(final Value v) {
   final byte[] b = MemoryManager.malloc1(v._max);
   long skip = 0;
   Key k = v._key;
   final Path p;
   if (_iceRoot != null) {
     p = new Path(_iceRoot, getIceName(v));
   } else {
     // Convert an arraylet chunk into a long-offset from the base file.
     if (k._kb[0] == Key.ARRAYLET_CHUNK) {
       skip = ValueArray.getChunkOffset(k); // The offset
       k = ValueArray.getArrayKey(k); // From the base file key
       if (k.toString().endsWith(Extensions.HEX)) { // Hex file?
         int value_len = DKV.get(k).memOrLoad().length; // How long is the ValueArray header?
         skip += value_len;
       }
     }
     p = new Path(k.toString());
   }
   final long skip_ = skip;
   run(
       new Callable() {
         @Override
         public Object call() throws Exception {
           FileSystem fs = FileSystem.get(p.toUri(), CONF);
           FSDataInputStream s = null;
           try {
             s = fs.open(p);
             // NOTE:
             // The following line degrades performance of HDFS load from S3 API:
             // s.readFully(skip,b,0,b.length);
             // Google API's simple seek has better performance
             // Load of 300MB file via Google API ~ 14sec, via s.readFully ~ 5min (under the same
             // condition)
             ByteStreams.skipFully(s, skip_);
             ByteStreams.readFully(s, b);
             assert v.isPersisted();
           } finally {
             Utils.close(s);
           }
           return null;
         }
       },
       true,
       v._max);
   return b;
 }
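The comment in the Callable is the key point: a sequential ByteStreams.skipFully followed by ByteStreams.readFully (both from Guava) replaces the positional s.readFully(skip, b, 0, b.length), which was slow against the S3-backed FileSystem. A self-contained sketch of the same skip-then-read pattern on an ordinary InputStream, with a hypothetical path and sizes:

  import com.google.common.io.ByteStreams;

  import java.io.FileInputStream;
  import java.io.InputStream;

  public class SkipThenRead {
    public static void main(String[] args) throws Exception {
      final long offset = 1L << 20;           // hypothetical: skip the first 1 MB
      final byte[] buf = new byte[64 * 1024]; // hypothetical: then read exactly 64 KB
      try (InputStream in = new FileInputStream("/tmp/example.bin")) { // hypothetical file
        ByteStreams.skipFully(in, offset); // sequential skip; throws EOFException if the stream is too short
        ByteStreams.readFully(in, buf);    // fills buf completely or throws EOFException
      }
    }
  }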
Example No. 4
 @Override
 public byte[] load(final Value v) {
   final byte[] b = MemoryManager.malloc1(v._max);
   long skip = 0;
   Key k = v._key;
   if (k._kb[0] == Key.ARRAYLET_CHUNK) {
     skip = ValueArray.getChunkOffset(k); // The offset
     k = ValueArray.getArrayKey(k); // From the base file key
   } else if (k._kb[0] == Key.DVEC) {
     skip = water.fvec.NFSFileVec.chunkOffset(k); // The offset
   }
   final Path p =
       _iceRoot == null ? new Path(getPathForKey(k)) : new Path(_iceRoot, getIceName(v));
   final long skip_ = skip;
   run(
       new Callable() {
         @Override
         public Object call() throws Exception {
           FileSystem fs = FileSystem.get(p.toUri(), CONF);
           FSDataInputStream s = null;
           try {
             s = fs.open(p);
             // NOTE:
             // The following line degrades performance of HDFS load from S3 API:
             // s.readFully(skip,b,0,b.length);
             // Google API's simple seek has better performance
             // Load of 300MB file via Google API ~ 14sec, via s.readFully ~ 5min (under the same
             // condition)
             ByteStreams.skipFully(s, skip_);
             ByteStreams.readFully(s, b);
             assert v.isPersisted();
           } finally {
             Utils.close(s);
           }
           return null;
         }
       },
       true,
       v._max);
   return b;
 }
 private static double[][][] malloc3DArray(final int d1, final int d2, final int d3) {
   final double[][][] array = new double[d1][d2][];
   for (int j = 0; j < d1; ++j)
     for (int k = 0; k < d2; ++k) array[j][k] = MemoryManager.malloc8d(d3);
   return array;
 }
 private static double[][] malloc2DArray(final int d1, final int d2) {
   final double[][] array = new double[d1][];
   for (int j = 0; j < d1; ++j) array[j] = MemoryManager.malloc8d(d2);
   return array;
 }
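Both helpers build jagged arrays whose innermost double[] rows come from H2O's MemoryManager rather than a plain new double[d1][d2], presumably so that large allocations are visible to the runtime's memory accounting. A short sketch with made-up dimensions:

  // Hypothetical dimensions, for illustration only.
  final int nTime = 4; // number of distinct event times
  final int nCoef = 3; // number of model coefficients

  double[][]   perTimePerCoef    = malloc2DArray(nTime, nCoef);        // e.g. sumXRiskEvents
  double[][][] perTimeCoefByCoef = malloc3DArray(nTime, nCoef, nCoef); // e.g. rcumsumXXRisk

  // Plain-Java equivalent, without MemoryManager bookkeeping:
  double[][] plain = new double[nTime][nCoef];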
    @Override
    protected void compute2() {
      CoxPHModel model = null;
      try {
        Scope.enter();
        _parms.read_lock_frames(CoxPH.this);
        init(true);

        applyScoringFrameSideEffects();

        // The model to be built
        model = new CoxPHModel(dest(), _parms, new CoxPHModel.CoxPHOutput(CoxPH.this));
        model.delete_and_lock(_key);

        applyTrainingFrameSideEffects();

        int nResponses = 1;
        boolean useAllFactorLevels = false;
        final DataInfo dinfo =
            new DataInfo(
                Key.make(),
                _modelBuilderTrain,
                null,
                nResponses,
                useAllFactorLevels,
                DataInfo.TransformType.DEMEAN,
                TransformType.NONE,
                true,
                false,
                false,
                false,
                false,
                false);
        initStats(model, dinfo);

        final int n_offsets =
            (model._parms.offset_columns == null) ? 0 : model._parms.offset_columns.length;
        final int n_coef = dinfo.fullN() - n_offsets;
        final double[] step = MemoryManager.malloc8d(n_coef);
        final double[] oldCoef = MemoryManager.malloc8d(n_coef);
        final double[] newCoef = MemoryManager.malloc8d(n_coef);
        Arrays.fill(step, Double.NaN);
        Arrays.fill(oldCoef, Double.NaN);
        for (int j = 0; j < n_coef; ++j) newCoef[j] = model._parms.init;
        double oldLoglik = -Double.MAX_VALUE;
        final int n_time = (int) (model._output.max_time - model._output.min_time + 1);
        final boolean has_start_column = (model._parms.start_column != null);
        final boolean has_weights_column = (model._parms.weights_column != null);
        for (int i = 0; i <= model._parms.iter_max; ++i) {
          model._output.iter = i;

          final CoxPHTask coxMR =
              new CoxPHTask(
                      self(),
                      dinfo,
                      newCoef,
                      model._output.min_time,
                      n_time,
                      n_offsets,
                      has_start_column,
                      has_weights_column)
                  .doAll(dinfo._adaptedFrame);

          final double newLoglik = calcLoglik(model, coxMR);
          if (newLoglik > oldLoglik) {
            if (i == 0) calcCounts(model, coxMR);

            calcModelStats(model, newCoef, newLoglik);
            calcCumhaz_0(model, coxMR);

            if (newLoglik == 0) model._output.lre = -Math.log10(Math.abs(oldLoglik - newLoglik));
            else model._output.lre = -Math.log10(Math.abs((oldLoglik - newLoglik) / newLoglik));
            if (model._output.lre >= model._parms.lre_min) break;

            Arrays.fill(step, 0);
            for (int j = 0; j < n_coef; ++j)
              for (int k = 0; k < n_coef; ++k)
                step[j] -= model._output.var_coef[j][k] * model._output.gradient[k];
            for (int j = 0; j < n_coef; ++j)
              if (Double.isNaN(step[j]) || Double.isInfinite(step[j])) break;

            oldLoglik = newLoglik;
            System.arraycopy(newCoef, 0, oldCoef, 0, oldCoef.length);
          } else {
            for (int j = 0; j < n_coef; ++j) step[j] /= 2;
          }

          for (int j = 0; j < n_coef; ++j) newCoef[j] = oldCoef[j] - step[j];
        }

        model.update(_key);
      } catch (Throwable t) {
        Job thisJob = DKV.getGet(_key);
        if (thisJob._state == JobState.CANCELLED) {
          Log.info("Job cancelled by user.");
        } else {
          t.printStackTrace();
          failed(t);
          throw t;
        }
      } finally {
        updateModelOutput();
        _parms.read_unlock_frames(CoxPH.this);
        Scope.exit();
        done(); // Job done!
      }
      tryComplete();
    }
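Read as math, the update loop in compute2 is a Newton-Raphson ascent with step-halving. Assuming var_coef holds the inverse of the negative Hessian (as its name and the usual Cox fit suggest), an accepted iteration with gradient g takes

  \beta^{(i+1)} \;=\; \beta^{(i)} + \widehat{V}\, g\bigl(\beta^{(i)}\bigr),
  \qquad
  \widehat{V} \;=\; \Bigl(-\nabla^2 \ell\bigl(\beta^{(i)}\bigr)\Bigr)^{-1},

while a rejected step (the new log-likelihood is not better than the old one) halves the previous step and retries from the last accepted coefficients. Iteration stops once the log-relative error -log10 |(l_old - l_new) / l_new| reaches lre_min, or after iter_max iterations.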
    protected double calcLoglik(CoxPHModel model, final CoxPHTask coxMR) {
      CoxPHModel.CoxPHParameters p = model._parms;
      CoxPHModel.CoxPHOutput o = model._output;

      final int n_coef = o.coef.length;
      final int n_time = coxMR.sizeEvents.length;
      double newLoglik = 0;
      for (int j = 0; j < n_coef; ++j) o.gradient[j] = 0;
      for (int j = 0; j < n_coef; ++j) for (int k = 0; k < n_coef; ++k) o.hessian[j][k] = 0;

      switch (p.ties) {
        case efron:
          final double[] newLoglik_t = MemoryManager.malloc8d(n_time);
          final double[][] gradient_t = malloc2DArray(n_time, n_coef);
          final double[][][] hessian_t = malloc3DArray(n_time, n_coef, n_coef);
          ForkJoinTask[] fjts = new ForkJoinTask[n_time];
          for (int t = n_time - 1; t >= 0; --t) {
            final int _t = t;
            fjts[t] =
                new RecursiveAction() {
                  @Override
                  protected void compute() {
                    final double sizeEvents_t = coxMR.sizeEvents[_t];
                    if (sizeEvents_t > 0) {
                      final long countEvents_t = coxMR.countEvents[_t];
                      final double sumLogRiskEvents_t = coxMR.sumLogRiskEvents[_t];
                      final double sumRiskEvents_t = coxMR.sumRiskEvents[_t];
                      final double rcumsumRisk_t = coxMR.rcumsumRisk[_t];
                      final double avgSize = sizeEvents_t / countEvents_t;
                      newLoglik_t[_t] = sumLogRiskEvents_t;
                      System.arraycopy(coxMR.sumXEvents[_t], 0, gradient_t[_t], 0, n_coef);
                      for (long e = 0; e < countEvents_t; ++e) {
                        final double frac = ((double) e) / ((double) countEvents_t);
                        final double term = rcumsumRisk_t - frac * sumRiskEvents_t;
                        newLoglik_t[_t] -= avgSize * Math.log(term);
                        for (int j = 0; j < n_coef; ++j) {
                          final double djTerm =
                              coxMR.rcumsumXRisk[_t][j] - frac * coxMR.sumXRiskEvents[_t][j];
                          final double djLogTerm = djTerm / term;
                          gradient_t[_t][j] -= avgSize * djLogTerm;
                          for (int k = 0; k < n_coef; ++k) {
                            final double dkTerm =
                                coxMR.rcumsumXRisk[_t][k] - frac * coxMR.sumXRiskEvents[_t][k];
                            final double djkTerm =
                                coxMR.rcumsumXXRisk[_t][j][k]
                                    - frac * coxMR.sumXXRiskEvents[_t][j][k];
                            hessian_t[_t][j][k] -=
                                avgSize * (djkTerm / term - (djLogTerm * (dkTerm / term)));
                          }
                        }
                      }
                    }
                  }
                };
          }
          ForkJoinTask.invokeAll(fjts);

          for (int t = 0; t < n_time; ++t) newLoglik += newLoglik_t[t];

          for (int t = 0; t < n_time; ++t)
            for (int j = 0; j < n_coef; ++j) o.gradient[j] += gradient_t[t][j];

          for (int t = 0; t < n_time; ++t)
            for (int j = 0; j < n_coef; ++j)
              for (int k = 0; k < n_coef; ++k) o.hessian[j][k] += hessian_t[t][j][k];
          break;
        case breslow:
          for (int t = n_time - 1; t >= 0; --t) {
            final double sizeEvents_t = coxMR.sizeEvents[t];
            if (sizeEvents_t > 0) {
              final double sumLogRiskEvents_t = coxMR.sumLogRiskEvents[t];
              final double rcumsumRisk_t = coxMR.rcumsumRisk[t];
              newLoglik += sumLogRiskEvents_t;
              newLoglik -= sizeEvents_t * Math.log(rcumsumRisk_t);
              for (int j = 0; j < n_coef; ++j) {
                final double dlogTerm = coxMR.rcumsumXRisk[t][j] / rcumsumRisk_t;
                o.gradient[j] += coxMR.sumXEvents[t][j];
                o.gradient[j] -= sizeEvents_t * dlogTerm;
                for (int k = 0; k < n_coef; ++k)
                  o.hessian[j][k] -=
                      sizeEvents_t
                          * (((coxMR.rcumsumXXRisk[t][j][k] / rcumsumRisk_t)
                              - (dlogTerm * (coxMR.rcumsumXRisk[t][k] / rcumsumRisk_t))));
              }
            }
          }
          break;
        default:
          throw new IllegalArgumentException("ties method must be either efron or breslow");
      }
      return newLoglik;
    }
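The two branches are the standard Breslow and Efron approximations of the Cox partial log-likelihood under tied event times. With risk score r_i = exp(x_i^T beta), event set D_t, risk set R_t, and d_t = |D_t| at time t, the unweighted forms the loops accumulate are

  \ell_{\text{Breslow}}(\beta) \;=\; \sum_{t}\Bigl[\sum_{i \in D_t} x_i^\top\beta \;-\; d_t \log\!\sum_{j \in R_t} r_j\Bigr],

  \ell_{\text{Efron}}(\beta) \;=\; \sum_{t}\Bigl[\sum_{i \in D_t} x_i^\top\beta \;-\; \sum_{e=0}^{d_t-1} \log\Bigl(\sum_{j \in R_t} r_j \;-\; \tfrac{e}{d_t}\sum_{i \in D_t} r_i\Bigr)\Bigr].

In the code, rcumsumRisk[t] is the risk-set sum, sumLogRiskEvents[t] is the event-term sum, and sizeEvents/countEvents generalize d_t to case weights (the avgSize factor); the gradient and Hessian loops differentiate the same expressions term by term.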
    protected void initStats(final CoxPHModel model, final DataInfo dinfo) {
      CoxPHModel.CoxPHParameters p = model._parms;
      CoxPHModel.CoxPHOutput o = model._output;

      o.n = p.stop_column.length();
      o.data_info = dinfo;
      final int n_offsets = (p.offset_columns == null) ? 0 : p.offset_columns.length;
      final int n_coef = o.data_info.fullN() - n_offsets;
      final String[] coefNames = o.data_info.coefNames();
      o.coef_names = new String[n_coef];
      System.arraycopy(coefNames, 0, o.coef_names, 0, n_coef);
      o.coef = MemoryManager.malloc8d(n_coef);
      o.exp_coef = MemoryManager.malloc8d(n_coef);
      o.exp_neg_coef = MemoryManager.malloc8d(n_coef);
      o.se_coef = MemoryManager.malloc8d(n_coef);
      o.z_coef = MemoryManager.malloc8d(n_coef);
      o.gradient = MemoryManager.malloc8d(n_coef);
      o.hessian = malloc2DArray(n_coef, n_coef);
      o.var_coef = malloc2DArray(n_coef, n_coef);
      o.x_mean_cat = MemoryManager.malloc8d(n_coef - (o.data_info._nums - n_offsets));
      o.x_mean_num = MemoryManager.malloc8d(o.data_info._nums - n_offsets);
      o.mean_offset = MemoryManager.malloc8d(n_offsets);
      o.offset_names = new String[n_offsets];
      System.arraycopy(coefNames, n_coef, o.offset_names, 0, n_offsets);

      final Vec start_column = p.start_column;
      final Vec stop_column = p.stop_column;
      o.min_time =
          p.start_column == null ? (long) stop_column.min() : (long) start_column.min() + 1;
      o.max_time = (long) stop_column.max();

      final int n_time = new Vec.CollectDomain().doAll(stop_column).domain().length;
      o.time = MemoryManager.malloc8(n_time);
      o.n_risk = MemoryManager.malloc8d(n_time);
      o.n_event = MemoryManager.malloc8d(n_time);
      o.n_censor = MemoryManager.malloc8d(n_time);
      o.cumhaz_0 = MemoryManager.malloc8d(n_time);
      o.var_cumhaz_1 = MemoryManager.malloc8d(n_time);
      o.var_cumhaz_2 = malloc2DArray(n_time, n_coef);
    }