From 4a9327bd0e9dcbc71d0bb65cd79ca2f762f5bbc9 Mon Sep 17 00:00:00 2001 From: Jaeyoung Lee <jaeyoung.lee@uwaterloo.ca> Date: Thu, 24 Jan 2019 19:42:04 -0500 Subject: [PATCH] Improve and Bug-fix DQNLearner and environments. - Added RestrictedEpsGreedyPolicy and RestrictedGreedyPolicy and use them as policy and test_policy in DQNLearner. Now, the agent never chooses the action corresponding to -inf Q-value if there is at least one action with finite Q-value (if not, it chooses any action randomly, which is necessary for compatibility with keras-rl -- see the comments in select_action). - Now, generate_scenario in SimpleIntersectionEnv generates veh_ahead_scenario even when randomize_special_scenario = 1. - In EpisodicEnvBase, the terminal reward is by default determined by the minimum one; - Small change of initiation_condition of EpisodicEnvBase (simplified); --- backends/kerasrl_learner.py | 99 +++++++++++++++++-- env/env_base.py | 9 +- .../simple_intersection_env.py | 2 +- options/simple_intersection/maneuver_base.py | 3 +- 4 files changed, 98 insertions(+), 15 deletions(-) diff --git a/backends/kerasrl_learner.py b/backends/kerasrl_learner.py index a24c4da..3d79734 100644 --- a/backends/kerasrl_learner.py +++ b/backends/kerasrl_learner.py @@ -8,7 +8,7 @@ from keras.callbacks import TensorBoard from rl.agents import DDPGAgent, DQNAgent from rl.memory import SequentialMemory from rl.random import OrnsteinUhlenbeckProcess -from rl.policy import BoltzmannQPolicy, MaxBoltzmannQPolicy +from rl.policy import GreedyQPolicy, EpsGreedyQPolicy, MaxBoltzmannQPolicy from rl.callbacks import ModelIntervalCheckpoint @@ -229,6 +229,7 @@ class DQNLearner(LearnerBase): model=None, policy=None, memory=None, + test_policy=None, **kwargs): """The constructor which sets the properties of the class. 
@@ -236,8 +237,8 @@ class DQNLearner(LearnerBase): input_shape: Shape of observation space, e.g (10,); nb_actions: number of values in action space; model: Keras Model of actor which takes observation as input and outputs actions. Uses default if not given - policy: KerasRL Policy. Uses default SequentialMemory if not given - memory: KerasRL Memory. Uses default BoltzmannQPolicy if not given + policy: KerasRL Policy. Uses default MaxBoltzmannQPolicy if not given + memory: KerasRL Memory. Uses default SequentialMemory if not given **kwargs: other optional key-value arguments with defaults defined in property_defaults """ super(DQNLearner, self).__init__(input_shape, nb_actions, **kwargs) @@ -255,12 +256,14 @@ class DQNLearner(LearnerBase): model = self.get_default_model() if policy is None: policy = self.get_default_policy() + if test_policy is None: + test_policy = self.get_default_test_policy() if memory is None: memory = self.get_default_memory() self.low_level_policies = low_level_policies - self.agent_model = self.create_agent(model, policy, memory) + self.agent_model = self.create_agent(model, policy, memory, test_policy) def get_default_model(self): """Creates the default model. @@ -269,7 +272,6 @@ class DQNLearner(LearnerBase): """ model = Sequential() model.add(Flatten(input_shape=(1, ) + self.input_shape)) - #model.add(Dense(64)) model.add(Dense(64)) model.add(Activation('relu')) model.add(Dense(64)) @@ -283,7 +285,10 @@ class DQNLearner(LearnerBase): return model def get_default_policy(self): - return MaxBoltzmannQPolicy(eps=0.3) + return RestrictedEpsGreedyQPolicy(0.3) + + def get_default_test_policy(self): + return RestrictedGreedyQPolicy() def get_default_memory(self): """Creates the default memory model. 
@@ -294,7 +299,7 @@ class DQNLearner(LearnerBase):
             limit=self.mem_size, window_length=self.mem_window_length)
         return memory
 
-    def create_agent(self, model, policy, memory):
+    def create_agent(self, model, policy, memory, test_policy):
         """Creates a KerasRL DDPGAgent with given components.
 
         Args:
@@ -413,6 +418,86 @@ class DQNLearner(LearnerBase):
 
         return relevant
 
+class RestrictedEpsGreedyQPolicy(EpsGreedyQPolicy):
+    """Implement the epsilon greedy policy
+
+    Restricted Eps Greedy policy.
+    This policy ensures that it never chooses the action whose value is -inf
+
+    """
+
+    def __init__(self, eps=.1):
+        super(RestrictedEpsGreedyQPolicy, self).__init__(eps)
+
+    def select_action(self, q_values):
+        """Return the selected action
+
+        # Arguments
+            q_values (np.ndarray): List of the estimations of Q for each action
+
+        # Returns
+            Selected action
+        """
+        assert q_values.ndim == 1
+        nb_actions = q_values.shape[0]
+        index = list()
+
+        for i in range(0, nb_actions):
+            if q_values[i] != -np.inf:
+                index.append(i)
+
+        # every q_value is -np.inf (this sometimes inevitably happens within the fit and test functions
+        # of kerasrl at the terminal stage as they force to call forward in Kerasrl-learner which calls this function).
+        # In this case, we choose an action randomly.
+        if len(index) < 1:
+            action = np.random.random_integers(0, nb_actions - 1)
+
+        elif np.random.uniform() <= self.eps:
+            action = index[np.random.random_integers(0, len(index) - 1)]
+
+        else:
+            action = np.argmax(q_values)
+
+        return action
+
+
+class RestrictedGreedyQPolicy(GreedyQPolicy):
+    """Implement the greedy policy
+
+    Restricted Greedy policy.
+    This policy ensures that it never chooses the action whose value is -inf
+
+    """
+
+    def select_action(self, q_values):
+        """Return the selected action
+
+        # Arguments
+            q_values (np.ndarray): List of the estimations of Q for each action
+
+        # Returns
+            Selected action
+        """
+        assert q_values.ndim == 1
+        nb_actions = q_values.shape[0]
+        restricted_q_values = list()
+
+        for i in range(0, nb_actions):
+            if q_values[i] != -np.inf:
+                restricted_q_values.append(q_values[i])
+
+        # every q_value is -np.inf (this sometimes inevitably happens within the fit and test functions
+        # of kerasrl at the terminal stage as they force to call forward in Kerasrl-learner which calls this function).
+        # In this case, we choose an action randomly.
+        if len(restricted_q_values) < 1:
+            action = np.random.random_integers(0, nb_actions - 1)
+
+        else:
+            action = np.argmax(restricted_q_values)
+
+        return action
+
+
 class DQNAgentOverOptions(DQNAgent):
     def __init__(self,
                  model,
diff --git a/env/env_base.py b/env/env_base.py
index 3786c34..4d28722 100644
--- a/env/env_base.py
+++ b/env/env_base.py
@@ -25,7 +25,8 @@ class EpisodicEnvBase(GymCompliantEnvBase):
     # three types possible ('min', 'max', or 'sum');
     # See _reward_superposition below.
 
-    terminal_reward_type = 'max'
+    # TODO: consider the case where every terminal reward is None. Make this class have a default terminal value (not None) and use it in this case.
+    terminal_reward_type = 'min'
 
     #: If true, the maneuver terminates when the goal has been achieved.
_terminate_in_goal = False @@ -140,13 +141,11 @@ class EpisodicEnvBase(GymCompliantEnvBase): def _reset_model_checker(self, AP): - self.__mc_AP = int(AP) - if self._LTL_preconditions_enable: for LTL_precondition in self._LTL_preconditions: LTL_precondition.reset_property() - if LTL_precondition.enabled: - LTL_precondition.check_incremental(self.__mc_AP) + + self._incremental_model_checking(AP) def _set_mc_AP(self, AP): self.__mc_AP = int(AP) diff --git a/env/simple_intersection/simple_intersection_env.py b/env/simple_intersection/simple_intersection_env.py index 3855aea..32f50ef 100644 --- a/env/simple_intersection/simple_intersection_env.py +++ b/env/simple_intersection/simple_intersection_env.py @@ -271,7 +271,7 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase): # stopped_car_scenario = bool(np.random.randint(0, 1)) TODO: this scenario may not work n_others_stopped_in_stop_region = np.random.randint( 0, min(3, n_others - stopped_car_scenario)) - veh_ahead_scenario = bool(np.random.randint(0, 1)) + veh_ahead_scenario = bool(np.random.randint(0, 1)) or veh_ahead_scenario if n_others_stopped_in_stop_region > min( n_others - stopped_car_scenario, 3): diff --git a/options/simple_intersection/maneuver_base.py b/options/simple_intersection/maneuver_base.py index 275e654..fbbc3ed 100644 --- a/options/simple_intersection/maneuver_base.py +++ b/options/simple_intersection/maneuver_base.py @@ -332,8 +332,7 @@ class ManeuverBase(EpisodicEnvBase): Returns True if the condition is satisfied, and False otherwise. """ - return not (self.env.termination_condition or self.violation_happened) and \ - self.extra_initiation_condition + return not self.termination_condition and self.extra_initiation_condition @property def extra_initiation_condition(self): -- GitLab