Ejemplo n.º 1
0
  /**
   * Re-evaluates this utility grid under a fixed policy by sweeping the whole grid repeatedly.
   *
   * <p>Sweeps continue until the largest per-cell change produced by a single sweep is no
   * greater than {@code MAX_ERROR * (1 - discount) / discount}. At least one sweep is always
   * performed.
   *
   * @param policy   supplies the action evaluated for every non-terminal cell
   * @param discount multiplier applied to the value returned by the action's
   *                 {@code getProbableReward}; also the divisor of the stopping threshold
   */
  public void updatePolicy(Policy policy, double discount) {
    final double tolerance = MAX_ERROR * (1 - discount) / discount;
    double largestDelta;
    do {
      // Utility object built from this one before the sweep starts. Actions are evaluated
      // against it, and the freshly written values are compared with it afterwards.
      // NOTE(review): presumably a copy of the current values - confirm against the
      // constructor that new Utility(this) resolves to.
      final Utility previous = new Utility(this);

      // Bellman step: terminal cells take their plain reward, every other cell takes its
      // reward plus the discounted value of the action the policy selects for it.
      for (int row = 0; row < rows; row++) {
        for (int col = 0; col < cols; col++) {
          final Pos cell = new Pos(col, row);
          final double reward = env.getRewardForPos(cell);
          if (env.isTerminalState(cell)) {
            util[row][col] = reward;
            continue;
          }
          final double expected = policy.getAction(cell).getProbableReward(cell, previous, env);
          util[row][col] = reward + discount * expected;
        }
      }

      // Largest absolute difference between the new values and the pre-sweep object.
      largestDelta = 0;
      for (int row = 0; row < rows; row++) {
        for (int col = 0; col < cols; col++) {
          final double delta = Math.abs(util[row][col] - previous.util[row][col]);
          // Strict '>' keeps the running maximum unchanged when delta is NaN.
          largestDelta = delta > largestDelta ? delta : largestDelta;
        }
      }
    } while (largestDelta > tolerance);
  }
Ejemplo n.º 2
0
 public Utility(Environment env) {
   this.rows = env.rows;
   this.cols = env.cols;
   this.env = env;
   util = new double[env.rows][env.cols];
   for (int y = 0; y < env.rows; y++) {
     for (int x = 0; x < env.cols; x++) {
       util[y][x] = env.getRewardForPos(new Pos(x, y));
     }
   }
 }