public static void main(String[] args) {

    /* Log level */
    Logger.getRootLogger().setLevel(Level.INFO);

    /* ************* Setup Agent ************* */

    /* create RL-Glue agent to get the task specification
     * (needed by the learner's Q-function approximator) */
    RlGlueAgent rlGlueAgent = new RlGlueAgent();

    // create an AgentLoader that will start the agent when its run method is called
    AgentLoader agentLoader = new AgentLoader(rlGlueAgent);

    // create a thread so that the agent and environment can run asynchronously
    Thread agentThread = new Thread(agentLoader);

    // start the thread
    agentThread.start();

    String taskSpec = RLGlue.RL_init();
    System.out.println("Task-Specification: " + taskSpec);

    /* ************************************
     * Configure tile-coding approximation
     * ************************************ */

    // BEGIN TILE-CODING APPROXIMATION
    // the number of tilings/layers
    int nTilings = 5;

    // the configuration: {from, to, number of discs}
    double[][] config = new double[][] {
        {0, 1.0, 5},
        {0, 1.0, 5},
    };

    // create square tilings
    Network net = new Network();
    net.setIsNormalized(true);

    double[][] optimizationConfig = config.clone();
    net.setFeatureGenerator(
        new GridHashFeatureGenerator(optimizationConfig, new TileAndIndexBoundingBoxCalculator()));

    net.add(TileCodingFactory.createTilings(config, nTilings));

    // setup Q-function
    QFeatureFunction Q = new QFeatureFunction(net, rlGlueAgent.getTeachingboxActionSet());
    // END TILE-CODING

    /* ***************************************
     * Setup policy, learner & the TB's agent
     * *************************************** */

    // the ActionSet for the policy is read from the rlGlueAgent
    // (RL_init must have been called before!)
    EpsilonGreedyPolicy pi =
        new EpsilonGreedyPolicy(Q, rlGlueAgent.getTeachingboxActionSet(), 0.1);
    System.out.println("POLICY-LEARNER ActionSet: " + rlGlueAgent.getTeachingboxActionSet());

    GradientDescentSarsaLearner learner =
        new GradientDescentSarsaLearner(Q, net, rlGlueAgent.getTeachingboxActionSet());
    learner.setAlpha(0.5);
    learner.setGamma(1.0);
    learner.setLambda(0.9);

    Agent tbAgent = new Agent(pi);
    tbAgent.addObserver(learner);

    /* SET THE TEACHINGBOX AGENT IN THE RL-GLUE AGENT ADAPTER */
    rlGlueAgent.setTeachingBoxAgent(tbAgent);

    /* *******************************
     * Setup experiment and plotting
     * ******************************* */

    RLGlueRemoteEnvironment rlEnv = new RLGlueRemoteEnvironment();
    // 100 episodes with at most 1000 steps each
    Experiment experiment = new Experiment(tbAgent, rlEnv, 100, 1000);

    // 3D PLOTTING
    // draw the maximum value of the Q-function; to plot the corresponding
    // V-function we just have to pass in the policy as well as the action set
    ValueFunctionEQ V = new ValueFunctionEQ(Q);
    V.costfunction = true;
    Plotter Vplotter =
        new ValueFunctionSurfacePlotter(V, "[0:0.02:1.0]", "[0:0.02:1.0]", "PuddleWorld");

    // use a RuntimePlotter that calls the ValueFunctionSurfacePlotter every 10th episode
    Vplotter = new RuntimePlotter(Vplotter, RuntimePlotter.Mode.EPISODE, 10, net);

    // add the plotter as an observer to the experiment
    experiment.addObserver((RuntimePlotter) Vplotter);

    // RUN THE EXPERIMENT
    experiment.run();

    // cleanup RL-Glue at the end
    RLGlue.RL_cleanup();
    System.exit(1);
}
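For reference, a minimal sketch of how the learned Q-function could be persisted before cleanup and reloaded for a later greedy evaluation run, following the ObjectSerializer pattern used in the Mountain-Car example below. The file name PuddleWorld_Q_TileCoding.ser is only illustrative and not part of the original example.

    // sketch: save the trained Q-function right after experiment.run(),
    // before RLGlue.RL_cleanup() and System.exit()
    ObjectSerializer.save("PuddleWorld_Q_TileCoding.ser", Q);

    // a later evaluation run could reload it and act greedily:
    // QFeatureFunction Q = ObjectSerializer.load("PuddleWorld_Q_TileCoding.ser");
    // GreedyPolicy pi = new GreedyPolicy(Q, rlGlueAgent.getTeachingboxActionSet());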
/**
 * @param args
 * @throws Exception
 */
public static void main(String[] args) throws Exception {
    Logger.getRootLogger().setLevel(Level.DEBUG);

    // setup environment
    MountainCarEnv env = new MountainCarEnv();

    // choose sigma
    final double POS_STEP = (env.MAX_POS - env.MIN_POS) / 50;
    final double VEL_STEP = (env.MAX_VEL - env.MIN_VEL) / 50;
    final double[] sigma = new double[] {POS_STEP, VEL_STEP};

    // create adaptive network that adds RBFs as needed
    Network net = new Network(
        new NoNodeNearby(new RadialBasisFunction(sigma, sigma), new RBFDistanceCalculator()));
    net.setIsNormalized(true);

    // setup Q-function (loaded from a previous run)
    // QFeatureFunction Q = new QFeatureFunction(net, MountainCarEnv.ACTION_SET);
    QFeatureFunction Q = ObjectSerializer.load("MC_Q_NRBF.ser");

    // setup policy
    GreedyPolicy pi = new GreedyPolicy(Q, MountainCarEnv.ACTION_SET);

    // create agent
    Agent agent = new Agent(pi);

    // experiment setup
    final int MAX_EPISODES = 1000;
    final int MAX_STEPS = 5000;
    final double alpha = 0.4;
    final double gamma = 1;
    final double lambda = 0.9;

    // setup experiment
    Experiment experiment = new Experiment(agent, env, MAX_EPISODES, MAX_STEPS);

    // setup learner
    GradientDescentQLearner learner =
        new GradientDescentQLearner(Q, net, MountainCarEnv.ACTION_SET);
    learner.setAlpha(alpha);
    learner.setGamma(gamma);
    learner.setLambda(lambda);

    // attach learner to agent
    agent.addObserver(learner);

    // Mountain-Car boundaries
    double[] bound = new double[] {env.MIN_POS, env.MAX_POS, env.MIN_VEL, env.MAX_VEL};

    // grid discretization
    int[] isosamples = new int[] {60, 60};

    // initialize policy plotter
    PolicyPlotter3D policyPlotter = new PolicyPlotter3D(pi, bound, isosamples);
    policyPlotter.setLabels("Position", "Velocity", "Action");
    policyPlotter.setFilename("mc-policy.gnuplot");
    policyPlotter.setTitle("Mountain-Car Policy");
    policyPlotter.setView(0.01, 0.01);

    // visualize Q-function (alternative, kept for reference)
    // ValueFunctionEQ V = new ValueFunctionEQ(Q);
    // V.costfunction = true;
    // ValueFunctionSurfacePlotter qp = new ValueFunctionSurfacePlotter(
    //     V, new double[] {env.MIN_POS, isosamples[0], env.MAX_POS},
    //     new double[] {env.MIN_VEL, isosamples[1], env.MAX_VEL},
    //     "Mountain-Car Q-Function");
    // qp.getPlotter().setView(50, 19);
    // qp.getPlotter().getAxis("x").setLabel("Position");
    // qp.getPlotter().getAxis("y").setLabel("Velocity");
    // qp.getPlotter().getAxis("z").setLabel("Costs");
    // qp.setFilename("mc-Q.gnuplot");

    // initialize Q-function plotter
    QFunctionPlotter3D Qplotter = new QFunctionPlotter3D(Q, bound, isosamples);
    Qplotter.setLabels("Position", "Velocity", "Costs");
    Qplotter.setFilename("mc-Q.gnuplot");
    Qplotter.setTitle("Mountain-Car Q-Function");
    Qplotter.setCosts(true);
    Qplotter.setView(50, 19);

    // generate a plot every 20 episodes
    experiment.addObserver(new RuntimePlotter(policyPlotter, Mode.EPISODE, 20, net));
    experiment.addObserver(new RuntimePlotter(Qplotter, Mode.EPISODE, 20, net));

    // run experiment
    experiment.run();

    // persist the (re)trained Q-function for the next run
    ObjectSerializer.save("MC_Q_NRBF.ser", Q);
}
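The example above resumes from a serialized Q-function, so the very first run (before MC_Q_NRBF.ser exists) needs a different setup. A minimal sketch of that bootstrap variant, reusing only constructors that already appear in these examples: the fresh QFeatureFunction from the commented-out line and the EpsilonGreedyPolicy from the PuddleWorld example. The epsilon value 0.1 is only an illustrative choice.

    // sketch: first run without a serialized Q-function
    // start from an empty Q-function instead of loading MC_Q_NRBF.ser ...
    QFeatureFunction Q = new QFeatureFunction(net, MountainCarEnv.ACTION_SET);
    // ... and explore with an epsilon-greedy policy rather than the purely greedy one
    EpsilonGreedyPolicy pi = new EpsilonGreedyPolicy(Q, MountainCarEnv.ACTION_SET, 0.1);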