Esempio n. 1
0
  /**
   * Evaluates the given policy in place: repeatedly sweeps the grid, overwriting {@code util}
   * with the Bellman backup for the action the policy selects in each cell, until the largest
   * per-cell change in one sweep is no greater than
   * {@code MAX_ERROR * (1 - discount) / discount}.
   *
   * <p>The policy is only queried through {@code getAction}; what this method writes is the
   * {@code util} grid of this object. Terminal cells are assigned their reward directly.
   *
   * <p>NOTE(review): with {@code discount == 1} the threshold is 0, so the loop ends only when a
   * sweep changes nothing at all; confirm callers do not depend on that converging.
   *
   * @param policy the policy whose chosen action is evaluated for every non-terminal cell
   * @param discount factor applied to the expected utility of the chosen action; must lie in
   *     [0, 1]
   * @throws IllegalArgumentException if {@code discount} is NaN or outside [0, 1]
   */
  public void updatePolicy(Policy policy, double discount) {
    // Outside [0, 1] the threshold below becomes negative (assuming MAX_ERROR is positive), and
    // because the measured change is never negative the loop could not terminate. A NaN discount
    // would instead stop after one sweep and leave NaN utilities behind. Reject both up front.
    if (!(discount >= 0 && discount <= 1)) {
      throw new IllegalArgumentException("discount must be in [0, 1], got " + discount);
    }

    final double maxError = MAX_ERROR * (1 - discount) / discount;
    double change;
    do {
      change = 0;
      // Bellman step, evaluated against a snapshot taken before the sweep.
      // NOTE(review): presumably new Utility(this) copies the current grid - confirm against the
      // Utility constructor.
      Utility u = new Utility(this);
      for (int y = 0; y < rows; y++) {
        for (int x = 0; x < cols; x++) {
          Pos p = new Pos(x, y);
          double reward = env.getRewardForPos(p);
          if (env.isTerminalState(p)) {
            util[y][x] = reward;
          } else {
            util[y][x] = reward + discount * policy.getAction(p).getProbableReward(p, u, env);
          }

          // Track the largest change in the same pass instead of walking the grid a second time.
          // The explicit comparison is kept so that a NaN difference is ignored rather than
          // propagated into the loop condition.
          double diff = Math.abs(util[y][x] - u.util[y][x]);
          if (diff > change) {
            change = diff;
          }
        }
      }
    } while (change > maxError);
  }