Example #1
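This example appears to follow BURLAP's 1.x `LearningAgent` interface. It runs a single model-based learning episode: each observed transition is recorded in the `EpisodeAnalysis`, fed to a learned transition model via `updateModel`, and immediately exploited by giving the model-based planner one Bellman backup on the just-visited state.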
  @Override
  public EpisodeAnalysis runLearningEpisodeFrom(State initialState, int maxSteps) {

    EpisodeAnalysis ea = new EpisodeAnalysis(initialState);

    State curState = initialState;
    int steps = 0;
    while (!this.tf.isTerminal(curState) && steps < maxSteps) {
      // query the policy for an action, execute it, and observe the reward
      GroundedAction ga = (GroundedAction) policy.getAction(curState);
      State nextState = ga.executeIn(curState);
      double r = this.rf.reward(curState, ga, nextState);

      ea.recordTransitionTo(nextState, ga, r);

      // update the learned model with the observed transition, then refine
      // the current state's value estimate with a single Bellman backup
      this.model.updateModel(curState, ga, nextState, r, this.tf.isTerminal(nextState));
      this.modelPlanner.performBellmanUpdateOn(curState);

      curState = nextState;
      steps++;
    }

    return ea;
  }
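A minimal driver sketch for the method above. `LearningAgent` is the BURLAP interface that declares `runLearningEpisodeFrom`; the method name `runTraining`, the episode count, and the step cap are illustrative assumptions, not part of the example.

  // Hypothetical driver: runs 100 learning episodes of at most 500 steps each.
  public static void runTraining(LearningAgent agent, State initialState) {
    for (int i = 0; i < 100; i++) {
      EpisodeAnalysis episode = agent.runLearningEpisodeFrom(initialState, 500);
      System.out.println("Episode " + i + ": " + episode.numTimeSteps() + " steps");
    }
  }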
Example #2
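This example is a Q-learning episode loop that also handles temporally extended options: a primitive action contributes its one-step reward, while an executed Option contributes its cumulative reward with the bootstrap term discounted by gamma^n over the n primitive steps it consumed (e.g., with gamma = 0.9 and n = 3, the discount is 0.9^3 = 0.729). The loop also tracks the largest Q-value change seen during the episode and stores the finished episode in a bounded history.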
  @Override
  public EpisodeAnalysis runLearningEpisodeFrom(State initialState, int maxSteps) {

    // configure whether recorded option executions are annotated with their decomposition
    this.toggleShouldAnnotateOptionDecomposition(shouldAnnotateOptions);

    EpisodeAnalysis ea = new EpisodeAnalysis(initialState);

    // hash states so Q-values can be stored and looked up by state equality
    StateHashTuple curState = this.stateHash(initialState);
    eStepCounter = 0;

    // track the largest Q-value change this episode, as a convergence signal
    maxQChangeInLastEpisode = 0.;

    while (!tf.isTerminal(curState.s) && eStepCounter < maxSteps) {

      // choose an action with the learning policy and fetch its current Q-value
      GroundedAction action = (GroundedAction) learningPolicy.getAction(curState.s);
      QValue curQ = this.getQ(curState, action);

      // execute the action (possibly a multi-step option) and bootstrap from the
      // best Q-value of the next state; terminal states bootstrap from zero
      StateHashTuple nextState = this.stateHash(action.executeIn(curState.s));
      double maxQ = 0.;

      if (!tf.isTerminal(nextState.s)) {
        maxQ = this.getMaxQ(nextState);
      }

      // manage option specifics
      double r = 0.;
      double discount = this.gamma;
      if (action.action.isPrimitive()) {
        r = rf.reward(curState.s, action, nextState.s);
        eStepCounter++;
        ea.recordTransitionTo(nextState.s, action, r);
      } else {
        // a temporally extended option: use its cumulative reward and discount
        // by gamma^n, where n is the number of primitive steps it executed
        Option o = (Option) action.action;
        r = o.getLastCumulativeReward();
        int n = o.getLastNumSteps();
        discount = Math.pow(this.gamma, n);
        eStepCounter += n;
        if (this.shouldDecomposeOptions) {
          ea.appendAndMergeEpisodeAnalysis(o.getLastExecutionResults());
        } else {
          ea.recordTransitionTo(nextState.s, action, r);
        }
      }

      double oldQ = curQ.q;

      // Q-learning update: Q(s,a) <- Q(s,a) + alpha * (r + discount * maxQ - Q(s,a))
      curQ.q =
          curQ.q
              + this.learningRate.pollLearningRate(curState.s, action)
                  * (r + (discount * maxQ) - curQ.q);

      double deltaQ = Math.abs(oldQ - curQ.q);
      if (deltaQ > maxQChangeInLastEpisode) {
        maxQChangeInLastEpisode = deltaQ;
      }

      // move on
      curState = nextState;
    }

    // keep only the most recent numEpisodesToStore episodes
    if (episodeHistory.size() >= numEpisodesToStore) {
      episodeHistory.poll();
    }
    episodeHistory.offer(ea);

    return ea;
  }
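A hedged convergence-loop sketch built on the maxQChangeInLastEpisode statistic the method maintains. QLearningAgent and getMaxQChangeInLastEpisode() are assumed names for illustration; the snippet itself only shows the underlying field.

  // Hypothetical: repeat episodes until no Q-value changed by more than 0.01.
  public static void trainUntilStable(QLearningAgent agent, State initialState) {
    double change;
    do {
      agent.runLearningEpisodeFrom(initialState, 1000); // assumed step cap
      change = agent.getMaxQChangeInLastEpisode();      // assumed accessor for the field above
    } while (change > 0.01);
  }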