From 4a9327bd0e9dcbc71d0bb65cd79ca2f762f5bbc9 Mon Sep 17 00:00:00 2001
From: Jaeyoung Lee <jaeyoung.lee@uwaterloo.ca>
Date: Thu, 24 Jan 2019 19:42:04 -0500
Subject: [PATCH] Improve and bug-fix DQNLearner and environments

- Added RestrictedEpsGreedyQPolicy and RestrictedGreedyQPolicy and used them
  as the policy and test_policy of DQNLearner. The agent now never chooses an
  action whose Q-value is -inf as long as at least one action has a finite
  Q-value; if every Q-value is -inf, it chooses an action uniformly at random,
  which is necessary for compatibility with keras-rl -- see the comments in
  select_action.

- generate_scenario in SimpleIntersectionEnv now generates veh_ahead_scenario
  even when randomize_special_scenario = 1.

- In EpisodicEnvBase, the terminal reward is now determined by the minimum of
  the candidate terminal rewards by default (terminal_reward_type = 'min').

- Simplified the initiation_condition of ManeuverBase.
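
For illustration, a minimal sketch of the intended selection behavior (the
q_values array below is made up for this example and is not part of the
patch):

    import numpy as np

    q_values = np.array([-np.inf, 0.2, -np.inf, 0.7])
    # Indices of actions whose Q-value is finite.
    index = [i for i, q in enumerate(q_values) if q != -np.inf]
    # Greedy choice: a -inf entry can never win the argmax.
    assert int(np.argmax(q_values)) in index
    # Exploratory choice: sampling is restricted to the finite-valued actions.
    assert index[np.random.randint(0, len(index))] in index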
---
 backends/kerasrl_learner.py                   | 99 +++++++++++++++++--
 env/env_base.py                               |  9 +-
 .../simple_intersection_env.py                |  2 +-
 options/simple_intersection/maneuver_base.py  |  3 +-
 4 files changed, 98 insertions(+), 15 deletions(-)

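Note (illustrative only, not part of the patch): in RestrictedGreedyQPolicy,
taking np.argmax over the full Q-value array yields a valid action index
whenever at least one entry is finite, since -inf can never be the maximum;
only the case where every entry is -inf needs the random fallback:

    import numpy as np

    q = np.array([-np.inf, 1.5, -np.inf, 0.3])
    assert np.argmax(q) == 1         # a -inf entry never wins the argmax
    assert not np.isneginf(q).all()  # the all -inf case needs the fallback
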
diff --git a/backends/kerasrl_learner.py b/backends/kerasrl_learner.py
index a24c4da..3d79734 100644
--- a/backends/kerasrl_learner.py
+++ b/backends/kerasrl_learner.py
@@ -8,7 +8,7 @@ from keras.callbacks import TensorBoard
 from rl.agents import DDPGAgent, DQNAgent
 from rl.memory import SequentialMemory
 from rl.random import OrnsteinUhlenbeckProcess
-from rl.policy import BoltzmannQPolicy, MaxBoltzmannQPolicy
+from rl.policy import GreedyQPolicy, EpsGreedyQPolicy, MaxBoltzmannQPolicy
 
 from rl.callbacks import ModelIntervalCheckpoint
 
@@ -229,6 +229,7 @@ class DQNLearner(LearnerBase):
                  model=None,
                  policy=None,
                  memory=None,
+                 test_policy=None,
                  **kwargs):
         """The constructor which sets the properties of the class.
 
@@ -236,8 +237,8 @@ class DQNLearner(LearnerBase):
             input_shape: Shape of observation space, e.g (10,);
             nb_actions: number of values in action space;
             model: Keras Model of actor which takes observation as input and outputs actions. Uses default if not given
-            policy: KerasRL Policy. Uses default SequentialMemory if not given
-            memory: KerasRL Memory. Uses default BoltzmannQPolicy if not given
+            policy: KerasRL Policy. Uses default RestrictedEpsGreedyQPolicy if not given
+            memory: KerasRL Memory. Uses default SequentialMemory if not given
             **kwargs: other optional key-value arguments with defaults defined in property_defaults
         """
         super(DQNLearner, self).__init__(input_shape, nb_actions, **kwargs)
@@ -255,12 +256,14 @@ class DQNLearner(LearnerBase):
             model = self.get_default_model()
         if policy is None:
             policy = self.get_default_policy()
+        if test_policy is None:
+            test_policy = self.get_default_test_policy()
         if memory is None:
             memory = self.get_default_memory()
 
         self.low_level_policies = low_level_policies
 
-        self.agent_model = self.create_agent(model, policy, memory)
+        self.agent_model = self.create_agent(model, policy, memory, test_policy)
 
     def get_default_model(self):
         """Creates the default model.
@@ -269,7 +272,6 @@ class DQNLearner(LearnerBase):
         """
         model = Sequential()
         model.add(Flatten(input_shape=(1, ) + self.input_shape))
-        #model.add(Dense(64))
         model.add(Dense(64))
         model.add(Activation('relu'))
         model.add(Dense(64))
@@ -283,7 +285,10 @@ class DQNLearner(LearnerBase):
         return model
 
     def get_default_policy(self):
-        return MaxBoltzmannQPolicy(eps=0.3)
+        return RestrictedEpsGreedyQPolicy(0.3)
+
+    def get_default_test_policy(self):
+        return RestrictedGreedyQPolicy()
 
     def get_default_memory(self):
         """Creates the default memory model.
@@ -294,7 +299,7 @@ class DQNLearner(LearnerBase):
             limit=self.mem_size, window_length=self.mem_window_length)
         return memory
 
-    def create_agent(self, model, policy, memory):
+    def create_agent(self, model, policy, memory, test_policy):
         """Creates a KerasRL DDPGAgent with given components.
 
         Args:
@@ -413,6 +418,86 @@ class DQNLearner(LearnerBase):
         return relevant
 
 
+class RestrictedEpsGreedyQPolicy(EpsGreedyQPolicy):
+    """Implement the epsilon greedy policy
+
+    Restricted Eps Greedy policy.
+    This policy ensures that it never chooses the action whose value is -inf
+
+    """
+
+    def __init__(self, eps=.1):
+        super(RestrictedEpsGreedyQPolicy, self).__init__(eps)
+
+    def select_action(self, q_values):
+        """Return the selected action
+
+        # Arguments
+            q_values (np.ndarray): List of the estimations of Q for each action
+
+        # Returns
+            Selection action
+        """
+        assert q_values.ndim == 1
+        nb_actions = q_values.shape[0]
+        index = list()
+
+        for i in range(0, nb_actions):
+            if q_values[i] != -np.inf:
+                index.append(i)
+
+        # If every Q-value is -np.inf, choose an action randomly. This sometimes
+        # inevitably happens within keras-rl's fit and test functions at the
+        # terminal stage, as they force a call to forward, which calls this method.
+        if len(index) < 1:
+            action = np.random.randint(0, nb_actions)
+
+        elif np.random.uniform() <= self.eps:
+            action = index[np.random.randint(0, len(index))]
+
+        else:
+            action = np.argmax(q_values)
+
+        return action
+
+
+class RestrictedGreedyQPolicy(GreedyQPolicy):
+    """Implement the epsilon greedy policy
+
+    Restricted Greedy policy.
+    This policy ensures that it never chooses the action whose value is -inf
+
+    """
+
+    def select_action(self, q_values):
+        """Return the selected action
+
+        # Arguments
+            q_values (np.ndarray): List of the estimations of Q for each action
+
+        # Returns
+            Selection action
+        """
+        assert q_values.ndim == 1
+        nb_actions = q_values.shape[0]
+        restricted_q_values = list()
+
+        for i in range(0, nb_actions):
+            if q_values[i] != -np.inf:
+                restricted_q_values.append(q_values[i])
+
+        # If every Q-value is -np.inf, choose an action randomly (see
+        # RestrictedEpsGreedyQPolicy.select_action for when this happens).
+        if len(restricted_q_values) < 1:
+            action = np.random.randint(0, nb_actions)
+
+        else:
+            # -inf never wins the argmax here, so this is a valid action index.
+            action = np.argmax(q_values)
+
+        return action
+
+
 class DQNAgentOverOptions(DQNAgent):
     def __init__(self,
                  model,
diff --git a/env/env_base.py b/env/env_base.py
index 3786c34..4d28722 100644
--- a/env/env_base.py
+++ b/env/env_base.py
@@ -25,7 +25,8 @@ class EpisodicEnvBase(GymCompliantEnvBase):
 
     # three types possible ('min', 'max', or 'sum');
     # See _reward_superposition below.
-    terminal_reward_type = 'max'
+    # TODO: consider the case where every terminal reward is None; give this class a default (non-None) terminal value and use it in that case.
+    terminal_reward_type = 'min'
 
     #: If true, the maneuver terminates when the goal has been achieved.
     _terminate_in_goal = False
@@ -140,13 +141,11 @@ class EpisodicEnvBase(GymCompliantEnvBase):
 
     def _reset_model_checker(self, AP):
 
-        self.__mc_AP = int(AP)
-
         if self._LTL_preconditions_enable:
             for LTL_precondition in self._LTL_preconditions:
                 LTL_precondition.reset_property()
-                if LTL_precondition.enabled:
-                    LTL_precondition.check_incremental(self.__mc_AP)
+
+            self._incremental_model_checking(AP)
 
     def _set_mc_AP(self, AP):
         self.__mc_AP = int(AP)
diff --git a/env/simple_intersection/simple_intersection_env.py b/env/simple_intersection/simple_intersection_env.py
index 3855aea..32f50ef 100644
--- a/env/simple_intersection/simple_intersection_env.py
+++ b/env/simple_intersection/simple_intersection_env.py
@@ -271,7 +271,7 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase):
             # stopped_car_scenario = bool(np.random.randint(0, 1)) TODO: this scenario may not work
             n_others_stopped_in_stop_region = np.random.randint(
                 0, min(3, n_others - stopped_car_scenario))
-            veh_ahead_scenario = bool(np.random.randint(0, 1))
+            veh_ahead_scenario = bool(np.random.randint(0, 2)) or veh_ahead_scenario
 
         if n_others_stopped_in_stop_region > min(
                 n_others - stopped_car_scenario, 3):
diff --git a/options/simple_intersection/maneuver_base.py b/options/simple_intersection/maneuver_base.py
index 275e654..fbbc3ed 100644
--- a/options/simple_intersection/maneuver_base.py
+++ b/options/simple_intersection/maneuver_base.py
@@ -332,8 +332,7 @@ class ManeuverBase(EpisodicEnvBase):
         Returns True if the condition is satisfied, and False otherwise.
         """
 
-        return not (self.env.termination_condition or self.violation_happened) and \
-               self.extra_initiation_condition
+        return not self.termination_condition and self.extra_initiation_condition
 
     @property
     def extra_initiation_condition(self):
-- 
GitLab