public void updatePolicy(Policy policy, double discount) { double change; double maxError = MAX_ERROR * (1 - discount) / discount; do { change = 0; // Bellman step Utility u = new Utility(this); for (int y = 0; y < rows; y++) { for (int x = 0; x < cols; x++) { Pos p = new Pos(x, y); double reward = env.getRewardForPos(p); if (env.isTerminalState(p)) { util[y][x] = reward; } else { util[y][x] = reward + discount * policy.getAction(p).getProbableReward(p, u, env); } } } // get most changed for (int y = 0; y < rows; y++) { for (int x = 0; x < cols; x++) { double diff = Math.abs(util[y][x] - u.util[y][x]); if (diff > change) { change = diff; } } } } while (change > maxError); }
/**
 * Builds the utility table for an environment, seeding each cell with the reward
 * the environment reports for that position.
 *
 * @param env environment providing the grid dimensions and the per-position rewards
 */
public Utility(Environment env) {
    this.env = env;
    this.rows = env.rows;
    this.cols = env.cols;
    this.util = new double[env.rows][env.cols];

    for (int row = 0; row < this.rows; row++) {
        double[] cells = this.util[row];
        for (int col = 0; col < this.cols; col++) {
            // Pos is constructed with the column (x) first, then the row (y).
            cells[col] = env.getRewardForPos(new Pos(col, row));
        }
    }
}