/**
   * Computes a hash-backed policy for every state visited along the greedy path of the UCT tree.
   */
  public void computePolicyFromTree() {
    policy = new HashMap<StateHashTuple, GroundedAction>();

    if (this.planner.getRoot() == null) {
      return;
    }

    // define policy for all states that are expanded along the greedy path of the UCT tree
    LinkedList<UCTStateNode> queue = new LinkedList<UCTStateNode>();
    queue.add(planner.getRoot());
    while (!queue.isEmpty()) {

      UCTStateNode snode = queue.poll();

      if (!planner.containsActionPreference(snode)) {
        System.out.println(
            "UCT tree does not contain action preferences for a state queried by the UCTTreeWalkPolicy. Consider replanning with planFromState.");
        break; // the policy is ill defined beyond this point
      }

      UCTActionNode choice = this.getQGreedyNode(snode);
      if (choice != null) {

        policy.put(snode.state, choice.action); // record the greedy action for this state

        // queue up all successor states observed for this action during planning
        List<UCTStateNode> successors = choice.getAllSuccessors();
        for (UCTStateNode suc : successors) {
          queue.offer(suc);
        }
      }
    }
  }
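
  /*
   * Illustrative sketch, not part of the original class: one way the map built by
   * computePolicyFromTree() might be queried. The method name is hypothetical; it simply
   * returns the stored greedy action, or null if the state was never expanded along the
   * greedy path of the UCT tree (or the policy has not been computed yet).
   */
  protected GroundedAction policyLookupSketch(StateHashTuple sh) {
    // guard against the policy never having been computed
    return policy == null ? null : policy.get(sh);
  }
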
  /**
   * Returns the {@link UCTActionNode} with the highest average sample return. Note that this does
   * not use the upper confidence bound, since planning is already complete.
   *
   * @param snode the {@link UCTStateNode} for which to get the best {@link UCTActionNode}.
   * @return the {@link UCTActionNode} with the highest average sample return.
   */
  protected UCTActionNode getQGreedyNode(UCTStateNode snode) {

    double maxQ = Double.NEGATIVE_INFINITY;
    UCTActionNode choice = null;

    for (UCTActionNode anode : snode.actionNodes) {

      // only select nodes that have been visited
      if (anode.n > 0 && anode.averageReturn() > maxQ) {
        maxQ = anode.averageReturn();
        choice = anode;
      }
    }

    return choice;
  }
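
  /*
   * Companion sketch, illustrative and not in the original class: the greedy Q-value that
   * getQGreedyNode() selects on, i.e. the highest average sample return among action nodes
   * that have been visited, or negative infinity if none has been. The method name is
   * hypothetical.
   */
  protected double qGreedyValueSketch(UCTStateNode snode) {
    UCTActionNode choice = this.getQGreedyNode(snode);
    return choice != null ? choice.averageReturn() : Double.NEGATIVE_INFINITY;
  }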