/**
  * Learns the policy by performing the given number of value-iteration sweeps over all states,
  * solving one linear program per state.
  *
  * @param sweeps = number of sweeps to execute
  * @throws OptimizationException if the linear program for a state cannot be solved
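  * <p>A minimal usage sketch (the variable names are illustrative only, not part of this class):
  * <pre>{@code
  * predator.learn(100);                              // 100 value-iteration sweeps
  * double[] dist = predator.policy(preyPos, myPos);  // mixed strategy for one state
  * }</pre>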
  */
 public void learn(int sweeps) throws OptimizationException {
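   // largestDif records the biggest change in any V-value observed during this call;
   // it can serve as a convergence measure for the caller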
   largestDif = 0.0;
   // temporary table to store new V-values
   StateRepV newV = new StateRepV(0.0, false);
   for (int i = 0; i < sweeps; i++) {
     double diff = 0.0;
     // for each state
     for (int state = 0; state < StateRepV.nrStates; state++) {
       // solve the set of equations
       double[] values = solveEquations(state);
       // calculate difference in v-value
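        // state 0 appears to be the absorbing end state; its value is never updated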
       if (state != 0) {
         diff = Math.abs(vValues.getV(state) - values[Action.nrActions]);
         if (diff > largestDif) {
           largestDif = diff;
         }
         newV.setValue(state, values[Action.nrActions]);
       }
       // repair the values returned by the solver if necessary: numerical error can
       // push probabilities slightly below 0 or above 1
       for (int a = 0; a < Action.nrActions; a++) {
         if (values[a] < 0.00000001) {
           policy.setValue(state, Action.getAction(a), 0.00000001);
         } else if (values[a] > 0.99999999) {
           policy.setValue(state, Action.getAction(a), 1.0);
         } else {
           policy.setValue(state, Action.getAction(a), values[a]);
         }
       }
     }
     // put new values in V-table
     vValues = newV;
     newV = new StateRepV(init, false);
   }
 }
  /**
   * Maximizes V for the given state subject to the constraints of the linear program
   *
   * @param state = state for which the maximization takes place
   * @return = array containing the action probabilities (pi's) followed by the value V
   * @throws OptimizationException if the linear program cannot be solved
   */
  private double[] solveEquations(int state) throws OptimizationException {
    Collection<LinearConstraint> constraints = new ArrayList<LinearConstraint>();
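    // The linear program has one variable per predator action (the probabilities pi(a))
    // plus one variable for the state value V. For every prey action o, a constraint
    // sum_a pi(a) * Q(s,a,o) - V >= 0 is added below, and V is then maximized; this is
    // the standard linear-programming formulation of a minimax (worst-case optimal)
    // mixed strategy for this state.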
    // for each possible action of the prey
    for (int preyAction = 0; preyAction < Action.nrActions; preyAction++) {
      // initialize weights for this constraint
      double[] Q = new double[Action.nrActions + 1];
      // for each possible action of the predator
      for (int predAction = 0; predAction < Action.nrActions; predAction++) {
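        // successor state after the predator's move, and the state after the prey then
        // responds; getReverseAction presumably mirrors the prey's move into the
        // predator-centric state representation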
        int newStatePred = policy.getLinearIndexForAction(state, Action.getAction(predAction));
        int newStatePrey =
            policy.getLinearIndexForAction(newStatePred, Action.getReverseAction(preyAction));
        // calculate expected reward R(s,a,o)
        double expReward = 0;
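        // Ptrip appears to be the probability that the prey trips and stays put, in which
        // case only the predator's move takes effect (newStatePred); otherwise the prey's
        // move is applied as well (newStatePrey)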
        if (preyAction == Action.Wait.getIntValue()) {
          expReward = policy.getReward(newStatePrey, false);
        } else {
          expReward =
              policy.getReward(newStatePrey, false) * (1.0 - Ptrip)
                  + policy.getReward(newStatePred, false) * Ptrip;
        }
        // add the weight for this action combination to the constraint
        if (preyAction == Action.Wait.getIntValue()) {
          Q[predAction] = expReward + learningRate * vValues.getV(newStatePrey);
        } else {
          Q[predAction] =
              expReward
                  + learningRate * vValues.getV(newStatePrey) * (1.0 - Ptrip)
                  + learningRate * vValues.getV(newStatePred) * Ptrip;
        }
      }
      // add constraint weight for V
      Q[Action.nrActions] = -1.0;
      // add constraint
      constraints.add(new LinearConstraint(Q, Relationship.GEQ, 0));
    }

    // add constraints that each action probability must be >= 0
    for (int predAction = 0; predAction < Action.nrActions; predAction++) {
      double[] constraintProb = new double[Action.nrActions + 1];
      Arrays.fill(constraintProb, 0.0);
      constraintProb[predAction] = 1.0;
      constraints.add(new LinearConstraint(constraintProb, Relationship.GEQ, 0));
    }
    // add the constraint that the action probabilities sum to 1
    double[] totalZero = new double[Action.nrActions + 1];
    Arrays.fill(totalZero, 1.0);
    totalZero[Action.nrActions] = 0.0;
    constraints.add(new LinearConstraint(totalZero, Relationship.EQ, 1.0));
    // build objective function
    double[] objective = new double[Action.nrActions + 1];
    Arrays.fill(objective, 0.0);
    objective[Action.nrActions] = 1.0;
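    // the objective puts weight 1 on the V variable only, so the solver maximizes V
    // subject to the constraints defined above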
    LinearObjectiveFunction f = new LinearObjectiveFunction(objective, 0);

    // solve and return
    RealPointValuePair solution =
        new SimplexSolver().optimize(f, constraints, GoalType.MAXIMIZE, false);
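    // the solution vector holds the optimal pi values followed by V, in the same
    // variable order used for the constraints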
    return solution.getPoint();
  }
 /**
  * Returns the probability distribution over actions for a given state in the environment
  *
  * @param prey = position of the prey
  * @param predatorItself = position of the predator itself
  * @return probability distribution over actions
  */
 @Override
 public double[] policy(Position prey, Position predatorItself) {
   double[] pActions = new double[Action.nrActions];
   int linIndex = vValues.getLinearIndex(prey, predatorItself);
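    // getMove presumably maps action i from the reduced state representation onto the
    // absolute action index expected by the caller, so each probability ends up in the
    // right slot of pActions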
   for (int i = 0; i < Action.nrActions; i++) {
     int index = vValues.getMove(predatorItself, prey, i, false);
     pActions[index] = policy.getValue(linIndex, Action.getAction(i));
   }
   return pActions;
 }
 /**
  * Executes a move for the predator by sampling an action from its policy
  *
  * @param others = array list of positions containing the position of the prey
  * @param isPrey = true if this agent acts as the prey (not used by this implementation)
  */
 @Override
 public void doMove(ArrayList<Position> others, boolean isPrey) {
   preyPos = new Position(others.get(0));
   int linIndex = policy.getLinearIndexFromPositions(myPos, preyPos);
   double[] prob = policy.getStateActionPairValues(linIndex);
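    // build the cumulative distribution over actions so a single uniform random number
    // can be mapped to an action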
   double[] probCum = new double[Action.nrActions];
   probCum[0] = prob[0];
   for (int i = 1; i < Action.nrActions; i++) {
     probCum[i] = probCum[i - 1] + prob[i];
   }
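    // force the last cumulative entry to 1.0 so rounding errors cannot leave the
    // random draw without a matching action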
   probCum[Action.nrActions - 1] = 1.0;
   double p = Math.random();
   int action = -1;
   for (int i = 0; i < Action.nrActions; i++) {
     if (p <= probCum[i]) {
       action = i;
       break;
     }
   }
   myPos.adjustPosition(policy.getMove(myPos, preyPos, Action.getAction(action), false));
 }