Example #1
  public Action policy(State observation, float reward) {

    // Score every available action by its expected reward in the observed state
    Map<Float, Action> options = new HashMap<Float, Action>();
    scala.collection.Iterator<Action> it = actions().iterator();
    while (it.hasNext()) {
      Action action = it.next();
      options.put(expectedReward(observation, action), action);
    }

    // Greedy choice: the action with the highest expected reward
    Action bestGuess = options.get(new TreeSet<Float>(options.keySet()).last());

    Action decision = bestGuess;

    // Epsilon-greedy exploration: occasionally pick a random action instead
    if (random.nextDouble() < randomFactor) {
      decision = randomAction();
    }

    Situation currentSA = new Situation(observation, decision);

    // Q-value of the action just chosen and of the previous state-action pair
    float currentExpected = expectedReward(observation, decision);

    float lastExpected = expectedReward(lastSA.getKey(), lastSA.getValue());

    // Temporal-difference update: Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a))
    float newQValue = lastExpected + alpha * (reward + gamma * currentExpected - lastExpected);

    qTable.put(lastSA, newQValue); // update the Q-Table with the new estimate
    lastSA = currentSA;            // remember the current state-action pair for the next update

    return decision;
  }
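
The method above leans on several members defined outside the listing: qTable, lastSA, random, alpha, gamma, randomFactor, expectedReward(...), randomAction() and actions(). Below is a minimal sketch of how those missing pieces could be wired up; the names follow the listing, but every body and constant here is an assumption, not the original implementation. actions() and randomAction() depend on the environment (the listing iterates a Scala collection) and are left out.

import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Random;

// Assumed scaffolding around policy(); not the original code.
class QTableAgentSketch {

  // Placeholder types; the real State and Action come from the environment API.
  interface State {}
  interface Action {}

  final Map<Situation, Float> qTable = new HashMap<>(); // learned Q-value per (state, action) pair
  final Random random = new Random();

  final float alpha = 0.1f;        // learning rate (assumed value)
  final float gamma = 0.9f;        // discount factor (assumed value)
  final double randomFactor = 0.1; // exploration probability (assumed value)

  Situation lastSA; // previous (state, action) pair; must be initialised before the first update

  // Q(s, a) lookup; pairs not yet in the table default to 0
  float expectedReward(State observation, Action action) {
    return qTable.getOrDefault(new Situation(observation, action), 0f);
  }

  // A (state, action) pair used as the Q-table key; getKey()/getValue() mirror
  // how lastSA is read in the listing, and equals/hashCode make it usable as a HashMap key.
  static class Situation {
    private final State state;
    private final Action action;

    Situation(State state, Action action) {
      this.state = state;
      this.action = action;
    }

    State getKey()    { return state; }
    Action getValue() { return action; }

    @Override
    public boolean equals(Object o) {
      if (this == o) return true;
      if (!(o instanceof Situation)) return false;
      Situation other = (Situation) o;
      return Objects.equals(state, other.state) && Objects.equals(action, other.action);
    }

    @Override
    public int hashCode() {
      return Objects.hash(state, action);
    }
  }
}

Worth noting: currentExpected is taken from the action actually chosen (including a random exploratory one), so the update follows the behaviour policy in the spirit of on-policy SARSA rather than a strict max-based Q-learning rule.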