/** * Computes the Q-value using the uncached transition dynamics produced by the Action object * methods. This computation *is* compatible with {@link * burlap.behavior.singleagent.options.Option} objects. * * @param sh the given state * @param ga the given action * @return the double value of a Q-value for the given state-aciton pair. */ protected double computeQ(StateHashTuple sh, GroundedAction ga) { double q = 0.; if (ga.action instanceof Option) { Option o = (Option) ga.action; double expectedR = o.getExpectedRewards(sh.s, ga.params); q += expectedR; List<TransitionProbability> tps = o.getTransitions(sh.s, ga.params); for (TransitionProbability tp : tps) { double vp = this.value(tp.s); // note that for options, tp.p will be the *discounted* probability of transition to s', // so there is no need for a discount factor to be included q += tp.p * vp; } } else { List<TransitionProbability> tps = ga.action.getTransitions(sh.s, ga.params); for (TransitionProbability tp : tps) { double vp = this.value(tp.s); double discount = this.gamma; double r = rf.reward(sh.s, ga, tp.s); q += tp.p * (r + (discount * vp)); } } return q; }
/** * Returns the Q-value for a given set and the possible transitions from it for a given action. * This computation *is* compatible with {@link burlap.behavior.singleagent.options.Option} * objects. * * @param s the given state * @param trans the given action transitions * @return the double value of a Q-value */ protected double computeQ(State s, ActionTransitions trans) { double q = 0.; if (trans.ga.action instanceof Option) { Option o = (Option) trans.ga.action; double expectedR = o.getExpectedRewards(s, trans.ga.params); q += expectedR; for (HashedTransitionProbability tp : trans.transitions) { double vp = this.value(tp.sh); // note that for options, tp.p will be the *discounted* probability of transition to s', // so there is no need for a discount factor to be included q += tp.p * vp; } } else { for (HashedTransitionProbability tp : trans.transitions) { double vp = this.value(tp.sh); double discount = this.gamma; double r = rf.reward(s, trans.ga, tp.sh.s); q += tp.p * (r + (discount * vp)); } } return q; }