/**
 * Computes a hash-backed policy for every state visited along the greedy path of the UCT tree.
 */
public void computePolicyFromTree() {

    policy = new HashMap<StateHashTuple, GroundedAction>();

    if (this.planner.getRoot() == null) {
        return;
    }

    // define the policy for all states that are expanded along the greedy path of the UCT tree
    LinkedList<UCTStateNode> queue = new LinkedList<UCTStateNode>();
    queue.add(planner.getRoot());
    while (queue.size() > 0) {

        UCTStateNode snode = queue.poll();

        if (!planner.containsActionPreference(snode)) {
            System.out.println(
                "UCT tree does not contain action preferences for the state queried by the UCTTreeWalkPolicy. Consider replanning with planFromState.");
            break; // policy is ill-defined from this point on
        }

        UCTActionNode choice = this.getQGreedyNode(snode);
        if (choice != null) {
            policy.put(snode.state, choice.action); // set the policy for this state

            // queue up all observed successors of the chosen action
            List<UCTStateNode> successors = choice.getAllSuccessors();
            for (UCTStateNode suc : successors) {
                queue.offer(suc);
            }
        }
    }
}
/**
 * Returns the {@link UCTActionNode} with the highest average sample return. Note that this does
 * not use the upper confidence bound since planning is completed.
 *
 * @param snode the {@link UCTStateNode} for which to get the best {@link UCTActionNode}.
 * @return the {@link UCTActionNode} with the highest average sample return.
 */
protected UCTActionNode getQGreedyNode(UCTStateNode snode) {

    double maxQ = Double.NEGATIVE_INFINITY;
    UCTActionNode choice = null;

    for (UCTActionNode anode : snode.actionNodes) {
        // only select action nodes that have been visited at least once
        if (anode.n > 0 && anode.averageReturn() > maxQ) {
            maxQ = anode.averageReturn();
            choice = anode;
        }
    }

    return choice;
}
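/*
 * A minimal, self-contained sketch illustrating the selection rule used by getQGreedyNode:
 * among actions that have been sampled at least once, pick the one with the highest average
 * sample return. The nested GreedySelectionDemo class and its toy numbers are purely
 * hypothetical and are not part of the UCT planner's API; they only show the averaging and
 * visited-node filtering logic in isolation.
 */
private static final class GreedySelectionDemo {

    public static void main(String[] args) {
        // toy (action name, total return, visit count) triples; values are made up
        String[] actions = {"north", "south", "east"};
        double[] sumReturns = {10.0, 14.0, 0.0};
        int[] visits = {4, 5, 0};

        double maxAvg = Double.NEGATIVE_INFINITY;
        String choice = null;
        for (int i = 0; i < actions.length; i++) {
            if (visits[i] == 0) {
                continue; // unvisited actions are skipped, mirroring the anode.n > 0 check above
            }
            double avg = sumReturns[i] / visits[i];
            if (avg > maxAvg) {
                maxAvg = avg;
                choice = actions[i];
            }
        }

        // prints "south" (average 2.8 beats north's 2.5); "east" is ignored because it was never tried
        System.out.println("greedy choice: " + choice);
    }
}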