protected int maxValue(State s, int alpha, int beta, int counter) { // System.out.println(counter); if (s.isTerminal() || counter > maxdepth) return utility(s); else if (!s.isTerminal()) { List<Move> moves = s.getPossibleMoves(); int maxv = Integer.MIN_VALUE; // System.out.println(moves.size()); for (Move move : moves) { // System.out.println("max"); int v = minValue(result(s, move), alpha, beta, counter + 1); // utility(s); /*int m = minValue(result(s,move),alpha,beta,counter++); if( m > v) v = m;*/ // System.out.println(); if (v > maxv) { maxv = v; } if (maxv >= beta) return maxv; alpha = Math.max(alpha, maxv); } return maxv; } else return 0; }
/**
 * Simulates a trajectory of at most {@code nbOfSteps} steps from {@code initialState}
 * under policy {@code p} and returns the cumulative reward collected along the way.
 * The walk stops early when a terminal state is reached.
 *
 * <p>When {@code statesTrajectory} is non-null, the visited states, chosen actions and
 * per-step rewards are appended to the three provided lists (the other two lists are
 * assumed non-null in that case — same contract as the original code).
 *
 * @param initialState       state the trajectory starts from
 * @param p                  policy used to pick an action in each state
 * @param nbOfSteps          maximum number of steps to simulate
 * @param statesTrajectory   optional sink for visited states (null to skip recording)
 * @param actionsTrajectory  optional sink for chosen actions
 * @param rewardsTrajectory  optional sink for per-step rewards
 * @return sum of rewards over the simulated trajectory
 */
public static float trajectoryReward(
    State initialState,
    Policy p,
    int nbOfSteps,
    LinkedList<State> statesTrajectory,
    LinkedList<Action> actionsTrajectory,
    LinkedList<Float> rewardsTrajectory) {
  State s = initialState;
  Action a = p.chooseAction(s);
  // Compute each step's reward once; the original called reward(s, a) twice per
  // recorded step, doing the (nextState-based) reward computation twice.
  float stepReward = reward(s, a);
  float trajectoryReward = stepReward;
  if (statesTrajectory != null) {
    statesTrajectory.add(s);
    actionsTrajectory.add(a);
    rewardsTrajectory.add(stepReward);
  }
  int nbOfIterations = 1;
  while (!s.isTerminal() && nbOfIterations < nbOfSteps) {
    s = MarkovDecisionProcess.nextState(s, a);
    a = p.chooseAction(s);
    stepReward = reward(s, a);
    trajectoryReward += stepReward;
    if (statesTrajectory != null) {
      statesTrajectory.add(s);
      actionsTrajectory.add(a);
      rewardsTrajectory.add(stepReward);
    }
    nbOfIterations++;
  }
  return trajectoryReward;
}
protected int minValue(State s, int alpha, int beta, int counter) { // System.out.println(counter); if (s.isTerminal() || counter > maxdepth) return utility(s); else if (!s.isTerminal()) { List<Move> moves = s.getPossibleMoves(); int minv = Integer.MAX_VALUE; // System.out.println(moves.size()); for (Move move : moves) { // System.out.println("min"); int v = maxValue(result(s, move), alpha, beta, counter + 1); // utility(s); /*int m = maxValue(result(s,move),alpha,beta,counter++); if( m < v) v = m;*/ if (v < minv) { minv = v; } if (minv <= alpha) return minv; beta = Math.min(beta, minv); } return minv; } else return 0; }
/**
 * Immediate reward for taking action {@code a} in state {@code s}.
 *
 * <p>Terminal states yield 0. Otherwise the successor state is simulated
 * (noise flag {@code false} — presumably deterministic; confirm against
 * {@code MarkovDecisionProcess.nextState}) and the reward is 1000 when the
 * resulting relative distance is within {@code SoccerParams.KICKABLE_MARGIN},
 * and -1 per step otherwise.
 *
 * @param s current state
 * @param a action taken in {@code s}
 * @return 0 if terminal, 1000 if the ball becomes kickable, -1 otherwise
 */
public static float reward(State s, Action a) {
  if (s.isTerminal()) {
    return 0f;
  }
  State successor = MarkovDecisionProcess.nextState(s, a, false);
  boolean kickable = successor.getRelativeDistance() < SoccerParams.KICKABLE_MARGIN;
  return kickable ? 1000f : -1f;
}