from .learner_base import LearnerBase

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, Concatenate
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
from rl.agents import DDPGAgent, DQNAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess
from rl.policy import BoltzmannQPolicy, MaxBoltzmannQPolicy
from rl.callbacks import ModelIntervalCheckpoint
import numpy as np


class DDPGLearner(LearnerBase):
    def __init__(self,
                 input_shape=(48, ),
                 nb_actions=2,
                 actor=None,
                 critic=None,
                 critic_action_input=None,
                 memory=None,
                 random_process=None,
                 **kwargs):
        """The constructor which sets the properties of the class.

        Args:
            input_shape: shape of the observation space, e.g. (10, )
            nb_actions: number of values in the action space
            actor: Keras Model of the actor which takes an observation as
                input and outputs actions. Uses the default actor if not given
            critic: Keras Model of the critic which takes the concatenation of
                observation and action as input and outputs a single value.
                Uses the default critic if not given
            critic_action_input: Keras Input used to create the action input
                of the critic model. The default critic and action input are
                used if not given
            memory: KerasRL Memory. Uses the default SequentialMemory if not
                given
            random_process: KerasRL random process. Uses the default
                OrnsteinUhlenbeckProcess if not given
            **kwargs: other optional key-value arguments with defaults defined
                in property_defaults
        """
        super(DDPGLearner, self).__init__(input_shape, nb_actions, **kwargs)

        property_defaults = {
            "mem_size": 100000,  # size of memory
            "mem_window_length": 1,  # window length of memory
            "oup_theta": 0.15,  # OrnsteinUhlenbeckProcess theta
            "oup_mu": 0,  # OrnsteinUhlenbeckProcess mu
            "oup_sigma": 1,  # OrnsteinUhlenbeckProcess sigma
            "oup_sigma_min": 0.5,  # OrnsteinUhlenbeckProcess sigma min
            "oup_annealing_steps": 500000,  # OrnsteinUhlenbeckProcess n-step annealing
            "nb_steps_warmup_critic": 100,  # steps for critic to warm up
            "nb_steps_warmup_actor": 100,  # steps for actor to warm up
            "target_model_update": 1e-3  # target model update frequency
        }

        for (prop, default) in property_defaults.items():
            setattr(self, prop, kwargs.get(prop, default))

        if actor is None:
            actor = self.get_default_actor_model()
        if critic is None or critic_action_input is None:
            critic, critic_action_input = self.get_default_critic_model()
        if memory is None:
            memory = self.get_default_memory()
        if random_process is None:
            random_process = self.get_default_randomprocess()

        # TODO: Add output scaling
        self.agent_model = self.create_agent(actor, critic,
                                             critic_action_input, memory,
                                             random_process)

    def get_default_actor_model(self):
        """Creates the default actor model.

        Returns:
            Keras Model object of the actor
        """
        actor = Sequential()
        actor.add(Flatten(input_shape=(1, ) + self.input_shape))
        actor.add(Dense(64, use_bias=False))
        actor.add(Activation('relu'))
        actor.add(Dense(64, use_bias=False))
        actor.add(Activation('relu'))
        actor.add(Dense(self.nb_actions, use_bias=True))
        actor.add(Activation('tanh'))
        # print(actor.summary())

        return actor

    def get_default_critic_model(self):
        """Creates the default critic model.

        Returns:
            Keras Model object of the critic and the Keras Input used as its
            action input
        """
        action_input = Input(shape=(self.nb_actions, ), name='action_input')
        observation_input = Input(
            shape=(1, ) + self.input_shape, name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = Concatenate()([action_input, flattened_observation])
        x = Dense(64, use_bias=False)(x)
        x = Activation('relu')(x)
        x = Dense(64, use_bias=False)(x)
        x = Activation('relu')(x)
        x = Dense(64, use_bias=False)(x)
        x = Activation('relu')(x)
        x = Dense(1, use_bias=True)(x)
        # x = Activation('linear')(x)
        critic = Model(inputs=[action_input, observation_input], outputs=x)
        # print(critic.summary())

        return critic, action_input

    def get_default_randomprocess(self):
        """Creates the default random process model.

        Returns:
            KerasRL OrnsteinUhlenbeckProcess object
        """
        random_process = OrnsteinUhlenbeckProcess(
            size=self.nb_actions,
            theta=self.oup_theta,
            mu=self.oup_mu,
            sigma=self.oup_sigma,
            sigma_min=self.oup_sigma_min,
            n_steps_annealing=self.oup_annealing_steps)

        return random_process

    def get_default_memory(self):
        """Creates the default memory model.

        Returns:
            KerasRL SequentialMemory object
        """
        memory = SequentialMemory(
            limit=self.mem_size, window_length=self.mem_window_length)

        return memory

    def create_agent(self, actor, critic, critic_action_input, memory,
                     random_process):
        """Creates a KerasRL DDPGAgent with the given components.

        Args:
            actor: Keras Model of the actor which takes an observation as
                input and outputs actions
            critic: Keras Model of the critic which takes the concatenation of
                observation and action as input and outputs a single value
            critic_action_input: Keras Input used to create the action input
                of the critic model
            memory: KerasRL Memory
            random_process: KerasRL random process

        Returns:
            KerasRL DDPGAgent object
        """
        agent = DDPGAgent(
            nb_actions=self.nb_actions,
            actor=actor,
            critic=critic,
            critic_action_input=critic_action_input,
            memory=memory,
            nb_steps_warmup_critic=self.nb_steps_warmup_critic,
            nb_steps_warmup_actor=self.nb_steps_warmup_actor,
            random_process=random_process,
            gamma=self.gamma,
            target_model_update=self.target_model_update)

        # TODO: give params like lr_actor and lr_critic to set different
        # learning rates for the actor and the critic.
        agent.compile(
            [
                Adam(lr=self.lr * 1e-2, clipnorm=1.),
                Adam(lr=self.lr, clipnorm=1.)
            ],
            metrics=['mae'])

        return agent

    def train(self,
              env,
              nb_steps=1000000,
              visualize=False,
              verbose=1,
              log_interval=10000,
              nb_max_episode_steps=200,
              model_checkpoints=False,
              checkpoint_interval=100000,
              tensorboard=False):
        callbacks = []
        if model_checkpoints:
            callbacks += [
                ModelIntervalCheckpoint(
                    './checkpoints/checkpoint_weights.h5f',
                    interval=checkpoint_interval)
            ]
        if tensorboard:
            callbacks += [TensorBoard(log_dir='./logs')]

        self.agent_model.fit(
            env,
            nb_steps=nb_steps,
            visualize=visualize,
            verbose=verbose,
            log_interval=log_interval,
            nb_max_episode_steps=nb_max_episode_steps,
            callbacks=callbacks)

    def save_model(self, file_name="test_weights.h5f", overwrite=True):
        self.agent_model.save_weights(file_name, overwrite=overwrite)

    def test_model(self,
                   env,
                   nb_episodes=50,
                   visualize=True,
                   nb_max_episode_steps=200):
        self.agent_model.test(
            env,
            nb_episodes=nb_episodes,
            visualize=visualize,
            nb_max_episode_steps=nb_max_episode_steps)

    def load_model(self, file_name="test_weights.h5f"):
        self.agent_model.load_weights(file_name)

    def predict(self, observation):
        return self.agent_model.forward(observation)
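

# NOTE: the following is an illustrative sketch, not part of the original
# module. It shows how DDPGLearner might be driven end to end on a Gym-style
# continuous-control environment. The environment name, step counts and file
# name are assumptions, and it relies on LearnerBase supplying defaults for
# properties such as lr and gamma used by create_agent().
def _example_train_ddpg():
    import gym  # assumed to be available alongside keras-rl

    env = gym.make("Pendulum-v0")
    learner = DDPGLearner(
        input_shape=env.observation_space.shape,
        nb_actions=env.action_space.shape[0])

    # Short training run followed by checkpointing and a quick evaluation.
    learner.train(env, nb_steps=50000, nb_max_episode_steps=200)
    learner.save_model("ddpg_pendulum_weights.h5f")
    learner.test_model(env, nb_episodes=5, visualize=False)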


class DQNLearner(LearnerBase):
    def __init__(self,
                 input_shape=(48, ),
                 nb_actions=5,
                 low_level_policies=None,
                 model=None,
                 policy=None,
                 memory=None,
                 **kwargs):
        """The constructor which sets the properties of the class.

        Args:
            input_shape: shape of the observation space, e.g. (10, )
            nb_actions: number of values in the action space
            low_level_policies: dict of low-level policies keyed by option
                alias, used to mask options whose initiation condition does
                not hold
            model: Keras Model which takes an observation as input and outputs
                Q-values over the discrete actions. Uses the default model if
                not given
            policy: KerasRL Policy. Uses the default MaxBoltzmannQPolicy if
                not given
            memory: KerasRL Memory. Uses the default SequentialMemory if not
                given
            **kwargs: other optional key-value arguments with defaults defined
                in property_defaults
        """
        super(DQNLearner, self).__init__(input_shape, nb_actions, **kwargs)

        property_defaults = {
            "mem_size": 100000,  # size of memory
            "mem_window_length": 1,  # window length of memory
            "target_model_update": 1e-3,  # target model update frequency
            "nb_steps_warmup": 100,  # steps for model to warm up
        }

        for (prop, default) in property_defaults.items():
            setattr(self, prop, kwargs.get(prop, default))

        if model is None:
            model = self.get_default_model()
        if policy is None:
            policy = self.get_default_policy()
        if memory is None:
            memory = self.get_default_memory()
        self.low_level_policies = low_level_policies

        self.agent_model = self.create_agent(model, policy, memory)

    def get_default_model(self):
        """Creates the default model.

        Returns:
            Keras Model object of the Q-network
        """
        model = Sequential()
        model.add(Flatten(input_shape=(1, ) + self.input_shape))
        model.add(Dense(32))
        model.add(Activation('relu'))
        model.add(Dense(32))
        model.add(Activation('relu'))
        model.add(Dense(self.nb_actions))
        model.add(Activation('linear'))
        # print(model.summary())

        return model

    def get_default_policy(self):
        return MaxBoltzmannQPolicy(eps=0.3)

    def get_default_memory(self):
        """Creates the default memory model.

        Returns:
            KerasRL SequentialMemory object
        """
        memory = SequentialMemory(
            limit=self.mem_size, window_length=self.mem_window_length)

        return memory

    def create_agent(self, model, policy, memory):
        """Creates a KerasRL DQNAgentOverOptions with the given components.

        Args:
            model: Keras Model which takes an observation as input and outputs
                Q-values over the discrete actions
            policy: KerasRL Policy
            memory: KerasRL Memory

        Returns:
            KerasRL DQNAgentOverOptions object
        """
        agent = DQNAgentOverOptions(
            model=model,
            low_level_policies=self.low_level_policies,
            nb_actions=self.nb_actions,
            memory=memory,
            nb_steps_warmup=self.nb_steps_warmup,
            target_model_update=self.target_model_update,
            policy=policy,
            enable_dueling_network=True)

        agent.compile(Adam(lr=self.lr), metrics=['mae'])

        return agent

    def train(self,
              env,
              nb_steps=1000000,
              visualize=False,
              nb_max_episode_steps=200,
              tensorboard=False,
              model_checkpoints=False,
              checkpoint_interval=10000):
        callbacks = []
        if model_checkpoints:
            callbacks += [
                ModelIntervalCheckpoint(
                    './checkpoints/checkpoint_weights.h5f',
                    interval=checkpoint_interval)
            ]
        if tensorboard:
            callbacks += [TensorBoard(log_dir='./logs')]

        self.agent_model.fit(
            env,
            nb_steps=nb_steps,
            visualize=visualize,
            verbose=1,
            nb_max_episode_steps=nb_max_episode_steps,
            callbacks=callbacks)

    def save_model(self, file_name="test_weights.h5f", overwrite=True):
        self.agent_model.save_weights(file_name, overwrite=overwrite)

    # TODO: very environment specific. Make it general.
    def test_model(self,
                   env,
                   nb_episodes=5,
                   visualize=True,
                   nb_max_episode_steps=400,
                   success_reward_threshold=100):
        print("Testing for {} episodes".format(nb_episodes))
        success_count = 0
        termination_reason_counter = {}
        for n in range(nb_episodes):
            env.reset()
            terminal = False
            step = 0
            episode_reward = 0
            while not terminal and step <= nb_max_episode_steps:
                if visualize:
                    env.render()
                features, R, terminal, info = env.execute_controller_policy()
                step += 1
                episode_reward += R
                if terminal:
                    if 'episode_termination_reason' in info:
                        termination_reason = info['episode_termination_reason']
                        if termination_reason in termination_reason_counter:
                            termination_reason_counter[termination_reason] += 1
                        else:
                            termination_reason_counter[termination_reason] = 1
                    env.reset()

            if episode_reward >= success_reward_threshold:
                success_count += 1

            print("Episode {}: steps:{}, reward:{}".format(
                n + 1, step, episode_reward))

        print("\nPolicy succeeded {} times!".format(success_count))
        print("Failures due to:")
        print(termination_reason_counter)

        return [success_count, termination_reason_counter]

    def load_model(self, file_name="test_weights.h5f"):
        self.agent_model.load_weights(file_name)

    def predict(self, observation):
        return self.agent_model.forward(observation)

    def get_q_value(self, observation, action):
        return self.agent_model.get_modified_q_values(observation)[action]

    def get_q_value_using_option_alias(self, observation, option_alias):
        action_num = self.agent_model.low_level_policy_aliases.index(
            option_alias)
        return self.agent_model.get_modified_q_values(observation)[action_num]

    def get_softq_value_using_option_alias(self, observation, option_alias):
        action_num = self.agent_model.low_level_policy_aliases.index(
            option_alias)
        q_values = self.agent_model.get_modified_q_values(observation)
        max_q_value = np.abs(np.max(q_values))
        q_values = [np.exp(q_value / max_q_value) for q_value in q_values]
        relevant = q_values[action_num] / np.sum(q_values)

        return relevant
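

# NOTE: the following is an illustrative sketch, not part of the original
# module. It reproduces, on a toy Q-vector, the normalisation performed by
# DQNLearner.get_softq_value_using_option_alias: Q-values are scaled by the
# magnitude of the largest entry and passed through a softmax, so the returned
# value is a weight in (0, 1) for the queried option. The function name and
# the toy numbers are hypothetical.
def _example_soft_q_weight():
    q_values = np.array([2.0, -1.0, 0.5])   # toy Q-values over three options
    max_q_value = np.abs(np.max(q_values))  # scale factor, here 2.0
    exp_q = np.exp(q_values / max_q_value)  # exponentiate the scaled Q-values
    weights = exp_q / np.sum(exp_q)         # softmax weights, sum to 1

    return weights[0]  # soft-Q weight of the first option (~0.59 here)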


class DQNAgentOverOptions(DQNAgent):
    def __init__(self,
                 model,
                 low_level_policies,
                 policy=None,
                 test_policy=None,
                 enable_double_dqn=True,
                 enable_dueling_network=False,
                 dueling_type='avg',
                 *args,
                 **kwargs):
        super(DQNAgentOverOptions, self).__init__(
            model, policy, test_policy, enable_double_dqn,
            enable_dueling_network, dueling_type, *args, **kwargs)

        self.low_level_policies = low_level_policies
        if low_level_policies is not None:
            self.low_level_policy_aliases = list(
                self.low_level_policies.keys())

    def __get_invalid_node_indices(self):
        """Returns a list of option indices that are invalid according to
        their initiation conditions."""
        invalid_node_indices = list()
        for index, option_alias in enumerate(self.low_level_policy_aliases):
            self.low_level_policies[option_alias].reset_maneuver()
            if not self.low_level_policies[option_alias].initiation_condition:
                invalid_node_indices.append(index)

        return invalid_node_indices

    def forward(self, observation):
        q_values = self.get_modified_q_values(observation)
        if self.training:
            action = self.policy.select_action(q_values=q_values)
        else:
            action = self.test_policy.select_action(q_values=q_values)

        # Book-keeping.
        self.recent_observation = observation
        self.recent_action = action

        return action

    def get_modified_q_values(self, observation):
        state = self.memory.get_recent_state(observation)
        q_values = self.compute_q_values(state)

        # Mask out options that cannot be initiated in the current state.
        if self.low_level_policies is not None:
            invalid_node_indices = self.__get_invalid_node_indices()
            for node_index in invalid_node_indices:
                q_values[node_index] = -np.inf

        return q_values
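

# NOTE: the following is an illustrative sketch, not part of the original
# module. It mirrors the masking done in
# DQNAgentOverOptions.get_modified_q_values: options whose initiation
# condition does not hold receive a Q-value of -inf, so the (Max)Boltzmann
# policy never selects them. The _DummyOption class and the option aliases
# are hypothetical stand-ins for real low-level policies.
class _DummyOption(object):
    def __init__(self, applicable):
        self.initiation_condition = applicable

    def reset_maneuver(self):
        pass


def _example_option_masking():
    low_level_policies = {
        "keep_lane": _DummyOption(True),
        "change_lane": _DummyOption(False)
    }
    q_values = np.array([1.0, 2.0])

    for index, option_alias in enumerate(low_level_policies.keys()):
        low_level_policies[option_alias].reset_maneuver()
        if not low_level_policies[option_alias].initiation_condition:
            q_values[index] = -np.inf

    return q_values  # array([1., -inf]); "change_lane" can no longer be picked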