public Action policy(State observation, float reward) { Map<Float, Action> options = new HashMap<Float, Action>(); scala.collection.Iterator<Action> it = actions().iterator(); while (it.hasNext()) { Action action = it.next(); options.put(expectedReward(observation, action), action); } Action bestGuess = options.get(new TreeSet<Float>(options.keySet()).last()); Action decision = bestGuess; if (random.nextDouble() < randomFactor) { decision = randomAction(); } Situation currentSA = new Situation(observation, decision); float currentExpected = expectedReward(observation, decision); float lastExpected = expectedReward(lastSA.getKey(), lastSA.getValue()); float newQValue = lastExpected + alpha * (reward + gamma * currentExpected - lastExpected); qTable.put(lastSA, newQValue); // update the Q-Table lastSA = currentSA; return decision; }