Example #1
0
  /**
   * Computes the Q-value using the uncached transition dynamics produced by the Action object
   * methods. This computation *is* compatible with {@link
   * burlap.behavior.singleagent.options.Option} objects.
   *
   * @param sh the given state
   * @param ga the given action
   * @return the double value of a Q-value for the given state-action pair.
   */
  protected double computeQ(StateHashTuple sh, GroundedAction ga) {

    double qValue = 0.;

    if (ga.action instanceof Option) {

      // Options report their own expected cumulative reward up front.
      Option option = (Option) ga.action;
      qValue += option.getExpectedRewards(sh.s, ga.params);

      for (TransitionProbability transition : option.getTransitions(sh.s, ga.params)) {
        // For options, transition.p is already the *discounted* probability of
        // reaching s', so no additional discount factor is applied here.
        qValue += transition.p * this.value(transition.s);
      }

    } else {

      // Primitive action: standard one-step Bellman backup with discount gamma.
      for (TransitionProbability transition : ga.action.getTransitions(sh.s, ga.params)) {
        double nextStateValue = this.value(transition.s);
        double reward = rf.reward(sh.s, ga, transition.s);
        qValue += transition.p * (reward + this.gamma * nextStateValue);
      }
    }

    return qValue;
  }
Example #2
0
  /**
   * Returns the Q-value for a given state and the possible transitions from it for a given
   * action. This computation *is* compatible with {@link
   * burlap.behavior.singleagent.options.Option} objects.
   *
   * @param s the given state
   * @param trans the given action transitions
   * @return the double value of a Q-value
   */
  protected double computeQ(State s, ActionTransitions trans) {

    double qValue = 0.;

    if (trans.ga.action instanceof Option) {

      // Options report their own expected cumulative reward up front.
      Option option = (Option) trans.ga.action;
      qValue += option.getExpectedRewards(s, trans.ga.params);

      for (HashedTransitionProbability transition : trans.transitions) {
        // For options, transition.p is already the *discounted* probability of
        // reaching s', so no additional discount factor is applied here.
        qValue += transition.p * this.value(transition.sh);
      }

    } else {

      // Primitive action: standard one-step Bellman backup with discount gamma.
      for (HashedTransitionProbability transition : trans.transitions) {
        double nextStateValue = this.value(transition.sh);
        double reward = rf.reward(s, trans.ga, transition.sh.s);
        qValue += transition.p * (reward + this.gamma * nextStateValue);
      }
    }

    return qValue;
  }