Beispiel #1
0
  /**
   * Integrates one environment observation into the learner (one TD-style update step).
   *
   * <p>On the first observation of a round ({@code state == null}) this also lazily restores or
   * creates the persistent learning state (replay memory, weight map, iteration counter) via
   * {@code learnedParams}, and builds the value network if absent. Every call then forms the
   * TD target {@code r + gamma * Q(s', a')}, stores it in replay memory, periodically decays the
   * learning rate, trains the network on a sampled mini-batch, and emits statistics.
   *
   * @param succState the successor state s' observed after the previously chosen action
   * @param currFitScore cumulative fitness score so far; the step reward is its delta from the
   *     previous call
   */
  public void integrateObservation(int[] succState, float currFitScore) {
    // If this is the first observation of the round
    if (state == null) {
      state = succState;
      action = new boolean[Environment.numberOfKeys];
      prevFitScore = 0;
      bestScore = 0;
      possibleActions = getPossibleActions(environment);

      // Unpack persisted values so learning continues across rounds of the same simulation.
      if (learnedParams.containsKey("weights")) {
        iter = (Iteration) learnedParams.get("iter");
        rm = (ReplayMemory) learnedParams.get("rm");
        weights = (HashMap<String, double[][]>) learnedParams.get("weights");
        System.out.println("Starting Simulation at iteration : " + Integer.toString(iter.value));
      } else {
        // If this is the first observation of the simulation/trials: create fresh state and
        // register it in learnedParams so the branch above restores it on later rounds.
        rm = new ReplayMemory(GlobalOptions.replaySize);
        weights = new HashMap<String, double[][]>();
        iter = new Iteration(1);
        learnedParams.put("weights", weights);
        learnedParams.put("rm", rm);
        learnedParams.put("iter", iter);
      }

      if (net == null) {
        // Feature vector = state bits + one slot per possible action + 1 extra slot
        // (presumably a bias term — confirm against extractFeatures).
        numFeatures = state.length + possibleActions.size() + 1;
        // NOTE(review): numActions is computed but never used below.
        int numActions = possibleActions.size();

        // Network Architecture: two fully-connected hidden layers with ReLU, scalar output.
        List<LayerSpec> layerSpecs = new ArrayList<LayerSpec>();
        // Layer 1:
        layerSpecs.add(
            new LayerSpec(LayerFactory.TYPE_FULLY_CONNECTED, numFeatures, GlobalOptions.h1Size));
        layerSpecs.add(
            new LayerSpec(LayerFactory.TYPE_RELU, GlobalOptions.batchSize, GlobalOptions.h1Size));
        // Layer 2:
        layerSpecs.add(
            new LayerSpec(
                LayerFactory.TYPE_FULLY_CONNECTED, GlobalOptions.h1Size, GlobalOptions.h2Size));
        layerSpecs.add(
            new LayerSpec(LayerFactory.TYPE_RELU, GlobalOptions.batchSize, GlobalOptions.h2Size));
        // Layer 3: single scalar value output Q(s,a).
        layerSpecs.add(new LayerSpec(LayerFactory.TYPE_FULLY_CONNECTED, GlobalOptions.h2Size, 1));

        // Weights map is shared with learnedParams, so trained weights persist across rounds.
        net = new NeuralNet(layerSpecs, weights);
      }
    }

    // state and action denote (s,a) while succState and succAction denote (s',a');
    // reward denotes r in the TD target r + gamma * Q(s',a').
    StateActionPair SAP = new StateActionPair(state, action);
    boolean[] succAction = findBestAction(environment, succState);
    StateActionPair succSAP = new StateActionPair(succState, succAction);
    double succBestScore = evalScore(succSAP);

    // Reward is the change in fitness since the previous observation.
    float reward = currFitScore - prevFitScore;
    if (GlobalOptions.useIndicatorRewards) {
      // Collapse the reward to its sign (+1/-1); zero rewards stay zero.
      if (reward != 0) reward = reward > 0 ? 1.0f : -1.0f;
    }

    // TD target. NOTE(review): "dicount" is a typo for "discount" in GlobalOptions' field name;
    // fixing it requires changing that class, not this method.
    double trueScore = reward + GlobalOptions.dicount * succBestScore;
    // Store (features(s,a), target) in replay memory; only row 0 of the feature matrix is used.
    rm.addMemory(extractFeatures(SAP)[0], trueScore);

    // Annealed learning rate and epsilon greedy (epsilon decay currently disabled, see below).
    if (iter.value % GlobalOptions.DECAY_STEP == 0
        && !GlobalOptions.testTime
        && GlobalOptions.LR > GlobalOptions.MIN_LR) {
      GlobalOptions.LR = GlobalOptions.LR * GlobalOptions.decayFactor;
      // RANDOM_ACTION_EPSILON = RANDOM_ACTION_EPSILON * DECAY_FACTOR;
      System.out.println(
          "Decay Step - LR : "
              + Double.toString(GlobalOptions.LR)
              + " Epsilon : "
              + Double.toString(randomJump));
    }

    // Train on a replay-memory mini-batch, but only on every n-th iteration and never at test
    // time.
    if (iter.value % GlobalOptions.UPDATE_INTERVAL == 0 && !GlobalOptions.testTime) {
      List<double[][]> batch = rm.sample(GlobalOptions.batchSize);
      double[][] trainX = batch.get(0);
      double[][] trainy = batch.get(1);
      double[][] pred = net.forward(trainX);
      // Error = prediction - target, plus an L2 regularization term spread over every entry.
      double[][] trainError = Matrix.subtract(pred, trainy);
      double regError = 0.5 * GlobalOptions.regularizationLamda * net.getWeightSq();
      trainError = Matrix.scalarAdd(trainError, regError);
      net.backprop(trainError, GlobalOptions.LR, GlobalOptions.regularizationLamda);
    }

    if (iter.value % GlobalOptions.STAT_INTERVAL == 0 && !GlobalOptions.testTime) {
      // Print learning statistics - on every nth iteration
      double error = (evalScore(SAP) - trueScore);
      stats.addError(error);
      stats.addWeights(net);
      stats.addLearningRate(GlobalOptions.LR);
      stats.addEpsilonGreedy(randomJump);
      stats.flush();
    }

    // Update Persistent Parameters: advance the iteration counter and shift (s', a') -> (s, a)
    // for the next observation.
    iter.value++;
    state = succState;
    action = succAction;
    prevFitScore = currFitScore;
  }
Beispiel #2
0
 // Scores a state-action pair by pushing its feature vector through the value network and
 // reading the single scalar output.
 // TODO: consider computing extractFeatures once in integrateObservation and reusing it,
 // rather than extracting twice per round.
 private double evalScore(StateActionPair sap) {
   // Single-row batch in, scalar out: entry [0][0] of the forward pass.
   return net.forward(extractFeatures(sap))[0][0];
 }