from .learner_base import LearnerBase

# TODO: make sure that the package for PPO2 is installed.
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.policies import MlpPolicy
import numpy as np


class PPO2Agent(LearnerBase):

    def __init__(self, input_shape, nb_actions, env, policy=None,
                 tensorboard=False, log_path="./logs", **kwargs):
        """The constructor which sets the properties of the class.

        Args:
            input_shape: shape of the observation space, e.g. (10,)
            nb_actions: number of values in the action space
            env: environment on which the agent learns
            policy: stable_baselines policy class; defaults to MlpPolicy
            tensorboard: whether to integrate TensorBoard logging
            log_path: directory for TensorBoard logs
            **kwargs: other optional key-value arguments with defaults
                defined in property_defaults
        """
        super(PPO2Agent, self).__init__(input_shape, nb_actions, **kwargs)

        if policy is None:
            policy = self.get_default_policy()

        self.log_path = log_path
        # PPO2 requires a vectorized environment for parallel training.
        self.env = DummyVecEnv([lambda: env])
        self.agent_model = self.create_agent(policy, tensorboard)

    def get_default_policy(self):
        """Returns the default policy.

        Returns:
            stable_baselines policy class; defaults to MlpPolicy
        """
        return MlpPolicy

    def create_agent(self, policy, tensorboard):
        """Creates a PPO2 agent.

        Returns:
            stable_baselines PPO2 object
        """
        if tensorboard:
            return PPO2(policy, self.env, verbose=1, tensorboard_log=self.log_path)
        else:
            return PPO2(policy, self.env, verbose=1)

    def fit(self, env=None, nb_steps=1000000, visualize=False, nb_max_episode_steps=200):
        # The PPO2 callback is only called once per episode (not per step), so the
        # whole episode cannot be rendered from here. To render each step, add
        # self.env.render() to the Runner.run() method in stable_baselines ppo2.py.
        callback = self.__render_env_while_learning if visualize else None
        self.agent_model.learn(total_timesteps=nb_steps, callback=callback)

    @staticmethod
    def __render_env_while_learning(_locals, _globals):
        _locals['self'].env.render()

    def save_weights(self, file_name="test_weights.h5f", overwrite=True):
        # Note: overwrite is accepted for interface compatibility but is not
        # used; PPO2.save writes to file_name directly.
        self.agent_model.save(file_name)

    def test_model(self, env=None, nb_episodes=50, visualize=True, nb_max_episode_steps=200):
        episode_rewards = [0.0]
        obs = self.env.reset()
        current_episode = 1
        current_step = 0

        while current_episode <= nb_episodes:
            # _states is only useful when using LSTM policies.
            action, _states = self.agent_model.predict(obs)
            # action, rewards and dones are arrays because the env is vectorized.
            obs, rewards, dones, info = self.env.step(action)
            current_step += 1

            if visualize:
                self.env.render()

            # Stats
            episode_rewards[-1] += rewards[0]
            if dones[0] or current_step > nb_max_episode_steps:
                obs = self.env.reset()
                print("Episode ", current_episode, "reward: ", episode_rewards[-1])
                episode_rewards.append(0.0)
                current_episode += 1
                current_step = 0

        # Compute the mean reward over the last 100 episodes.
        mean_100ep_reward = round(np.mean(episode_rewards[-100:]), 1)
        print("Mean reward over last 100 episodes:", mean_100ep_reward)

    def load_weights(self, file_name="test_weights.h5f"):
        self.agent_model = PPO2.load(file_name)

    def forward(self, observation):
        # predict() returns a (action, state) tuple; state is None for
        # non-recurrent policies.
        return self.agent_model.predict(observation)

    def set_environment(self, env):
        self.env = DummyVecEnv([lambda: env])
        self.agent_model.set_env(self.env)
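
# ---------------------------------------------------------------------------
# Usage sketch (not part of the class): a minimal example of how PPO2Agent
# might be driven, assuming a Gym environment such as CartPole-v1 and that
# LearnerBase accepts the constructor arguments shown above. The environment
# name, step counts, and file name below are illustrative assumptions, not a
# definitive interface.
#
#   import gym
#
#   env = gym.make("CartPole-v1")
#   agent = PPO2Agent(input_shape=env.observation_space.shape,
#                     nb_actions=env.action_space.n,
#                     env=env,
#                     tensorboard=False)
#   agent.fit(nb_steps=10000)                        # train for 10k timesteps
#   agent.save_weights("cartpole_ppo2_weights.h5f")  # persist the learned model
#   agent.test_model(nb_episodes=5, visualize=False)
# ---------------------------------------------------------------------------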