/** * The policy is learned here * * @param sweeps = number of sweeps executed * @throws OptimizationException */ public void learn(int sweeps) throws OptimizationException { largestDif = 0.0; // temporary table to store new V-values StateRepV newV = new StateRepV(0.0, false); for (int i = 0; i < sweeps; i++) { double diff = 0.0; // for each state for (int state = 0; state < StateRepV.nrStates; state++) { // solve the set of equations double[] values = solveEquations(state); // calculate difference in v-value if (state != 0) { diff = Math.abs(vValues.getV(state) - values[Action.nrActions]); if (diff > largestDif) { largestDif = diff; } newV.setValue(state, values[Action.nrActions]); } // repair values returned by solving the equations if neccessary for (int a = 0; a < Action.nrActions; a++) { if (values[a] < 0.00000001) { policy.setValue(state, Action.getAction(a), 0.00000001); } else if (values[a] > 0.99999999) { policy.setValue(state, Action.getAction(a), 1.0); } else { policy.setValue(state, Action.getAction(a), values[a]); } } } // put new values in V-table vValues = newV; newV = new StateRepV(init, false); } }