@Override
public EpisodeAnalysis runLearningEpisodeFrom(State initialState, int maxSteps) {
    // Roll out a single learning episode from initialState, capped at maxSteps
    // actions, recording each transition while simultaneously updating the
    // learned model and refining the planner's value estimates.
    EpisodeAnalysis episode = new EpisodeAnalysis(initialState);
    State s = initialState;
    int stepCount = 0;

    while (!this.tf.isTerminal(s) && stepCount < maxSteps) {
        // Act according to the current policy and observe the outcome.
        GroundedAction ga = (GroundedAction) policy.getAction(s);
        State sPrime = ga.executeIn(s);
        double reward = this.rf.reward(s, ga, sPrime);
        episode.recordTransitionTo(sPrime, ga, reward);

        // Fold the observed transition into the learned model, then perform a
        // Bellman backup on the state we just left so planning stays current.
        this.model.updateModel(s, ga, sPrime, reward, this.tf.isTerminal(sPrime));
        this.modelPlanner.performBellmanUpdateOn(s);

        s = sPrime;
        stepCount++;
    }

    return episode;
}
@Override public EpisodeAnalysis runLearningEpisodeFrom(State initialState, int maxSteps) { this.toggleShouldAnnotateOptionDecomposition(shouldAnnotateOptions); EpisodeAnalysis ea = new EpisodeAnalysis(initialState); StateHashTuple curState = this.stateHash(initialState); eStepCounter = 0; maxQChangeInLastEpisode = 0.; while (!tf.isTerminal(curState.s) && eStepCounter < maxSteps) { GroundedAction action = (GroundedAction) learningPolicy.getAction(curState.s); QValue curQ = this.getQ(curState, action); StateHashTuple nextState = this.stateHash(action.executeIn(curState.s)); double maxQ = 0.; if (!tf.isTerminal(nextState.s)) { maxQ = this.getMaxQ(nextState); } // manage option specifics double r = 0.; double discount = this.gamma; if (action.action.isPrimitive()) { r = rf.reward(curState.s, action, nextState.s); eStepCounter++; ea.recordTransitionTo(nextState.s, action, r); } else { Option o = (Option) action.action; r = o.getLastCumulativeReward(); int n = o.getLastNumSteps(); discount = Math.pow(this.gamma, n); eStepCounter += n; if (this.shouldDecomposeOptions) { ea.appendAndMergeEpisodeAnalysis(o.getLastExecutionResults()); } else { ea.recordTransitionTo(nextState.s, action, r); } } double oldQ = curQ.q; // update Q-value curQ.q = curQ.q + this.learningRate.pollLearningRate(curState.s, action) * (r + (discount * maxQ) - curQ.q); double deltaQ = Math.abs(oldQ - curQ.q); if (deltaQ > maxQChangeInLastEpisode) { maxQChangeInLastEpisode = deltaQ; } // move on curState = nextState; } if (episodeHistory.size() >= numEpisodesToStore) { episodeHistory.poll(); } episodeHistory.offer(ea); return ea; }