from .learner_base import LearnerBase

# TODO: make sure that stable_baselines (which provides PPO2) is installed.
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.policies import MlpPolicy

import numpy as np


class PPO2Agent(LearnerBase):
    def __init__(self,
                 input_shape,
                 nb_actions,
                 env,
                 policy=None,
                 tensorboard=False,
                 log_path="./logs",
                 **kwargs):
        """The constructor which sets the properties of the class.

        Args:
            input_shape: shape of the observation space, e.g. (10,)
            nb_actions: number of values in the action space
            env: environment on which the agent learns
            policy: stable_baselines policy class; defaults to MlpPolicy
            tensorboard: whether to enable tensorboard logging
            log_path: directory for tensorboard logs (default "./logs")
            **kwargs: other optional keyword arguments with defaults defined in property_defaults
        """
        super(PPO2Agent, self).__init__(input_shape, nb_actions, **kwargs)

        if policy is None:
            policy = self.get_default_policy()

        self.log_path = log_path

        # PPO2 requires a vectorized environment for parallel training
        self.env = DummyVecEnv([lambda: env])
        self.agent_model = self.create_agent(policy, tensorboard)

    def get_default_policy(self):
        """Creates the default policy.

        Returns:     stable_baselines Policy object. default is MlpPolicy
        """
        return MlpPolicy

    def create_agent(self, policy, tensorboard):
        """Creates a PPO agent.

        Returns:     stable_baselines PPO2 object
        """
        if tensorboard:
            return PPO2(
                policy, self.env, verbose=1, tensorboard_log=self.log_path)
        else:
            return PPO2(policy, self.env, verbose=1)

    def fit(self,
            env=None,
            nb_steps=1000000,
            visualize=False,
            nb_max_episode_steps=200):

        # The PPO2 callback is only called once per episode (not per step), so the whole episode cannot be rendered here.
        # To render each step, add self.env.render() in the Runner class method run() in stable_baselines ppo2.py.
        callback = self.__render_env_while_learning if visualize else None
        self.agent_model.learn(total_timesteps=nb_steps, callback=callback)

    @staticmethod
    def __render_env_while_learning(_locals, _globals):
        """Callback passed to learn(); renders the training environment."""
        _locals['self'].env.render()

    def save_weights(self, file_name="test_weights.h5f", overwrite=True):
        self.agent_model.save(file_name)

    def test_model(self,
                   env=None,
                   nb_episodes=50,
                   visualize=True,
                   nb_max_episode_steps=200):

        episode_rewards = [0.0]
        obs = self.env.reset()
        current_episode = 1
        current_step = 0
        while current_episode <= nb_episodes:
            # _states are only useful when using LSTM policies
            action, _states = self.agent_model.predict(obs)

            # action, rewards and dones are arrays because we are using a vectorized env
            obs, rewards, dones, info = self.env.step(action)
            current_step += 1

            if visualize:
                self.env.render()

            # Stats
            episode_rewards[-1] += rewards[0]
            if dones[0] or current_step > nb_max_episode_steps:
                obs = self.env.reset()
                print("Episode ", current_episode, "reward: ",
                      episode_rewards[-1])
                episode_rewards.append(0.0)
                current_episode += 1
                current_step = 0

        # Compute mean reward for the last 100 episodes
        mean_100ep_reward = round(np.mean(episode_rewards[-100:]), 1)
        print("Mean reward over last 100 episodes:", mean_100ep_reward)

    def load_weights(self, file_name="test_weights.h5f"):
        self.agent_model = PPO2.load(file_name)

    def forward(self, observation):
        # predict() returns a tuple of (action, recurrent states)
        return self.agent_model.predict(observation)

    def set_environment(self, env):
        self.env = DummyVecEnv([lambda: env])
        self.agent_model.set_env(self.env)
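

# A minimal usage sketch, assuming a standard Gym environment such as
# CartPole-v1 is available; the environment name and step counts below are
# illustrative only and not part of this module.
if __name__ == "__main__":
    import gym

    demo_env = gym.make("CartPole-v1")
    agent = PPO2Agent(
        input_shape=demo_env.observation_space.shape,
        nb_actions=demo_env.action_space.n,
        env=demo_env)
    agent.fit(nb_steps=10000)  # short training run, for illustration only
    agent.test_model(nb_episodes=5, visualize=False)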