From 7dc47bce50616f94dad25d86a9e1828c034ca2ba Mon Sep 17 00:00:00 2001
From: Jaeyoung Lee <jaeyoung.lee@uwaterloo.ca>
Date: Fri, 25 Jan 2019 00:02:00 -0500
Subject: [PATCH] Fix the bug of sometimes having -100000 or 10000 rewards

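Summary of the changes:

* ControllerBase: the high-level step returns the terminal flag from
  executing the current option instead of self.env.termination_condition.
* DQNLearner: pass test_policy through to the DQN agent, expose verbose
  and log_interval in the training call, and make RestrictedGreedyQPolicy
  fall back to action 0 when every Q-value is -inf.
* EpisodicEnvBase: the termination condition no longer depends on
  _LTL_preconditions_enable.
* Cost weights: SimpleIntersectionEnv now defines _cost_weights in the
  1e-3 range, ManeuverBase's _cost_weights are adjusted to match, and the
  high-level override in options_loader.py is removed;
  _extra_action_weights_flag is off by default and enabled per maneuver
  together with _enable_low_level_training_properties.
* Maneuvers: replace the "immediatestop" option with "changelane" in
  config.json, rename ImmediateStop to Halt, update the LTL preconditions
  of KeepLane, Stop, Wait, ChangeLane and Follow, and set ChangeLane's
  high_level_extra_reward to -50.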
---
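Note for reviewers: the RestrictedGreedyQPolicy hunk below replaces the
random fallback with a deterministic one. A minimal standalone sketch of
the new selection logic, for illustration only (the patch itself changes
the select_action override):

    import numpy as np

    def select_action(q_values):
        """Greedy choice over a 1-D array of Q-values in which invalid
        options have been masked with -np.inf."""
        assert q_values.ndim == 1
        if np.max(q_values) == -np.inf:
            # every option is masked out (this can happen at the terminal
            # stage of keras-rl's fit/test); fall back to option 0
            return 0
        return int(np.argmax(q_values))
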
 backends/controller_base.py                   |   8 +-
 backends/kerasrl_learner.py                   |  23 ++--
 config.json                                   |   2 +-
 env/env_base.py                               |   2 +-
 env/simple_intersection/road_geokinemetry.py  |   2 +-
 .../simple_intersection_env.py                |   4 +-
 options/options_loader.py                     |   5 -
 options/simple_intersection/maneuver_base.py  |   4 +-
 options/simple_intersection/maneuvers.py      | 114 +++++++++---------
 9 files changed, 75 insertions(+), 89 deletions(-)

diff --git a/backends/controller_base.py b/backends/controller_base.py
index 1617036..ad31856 100644
--- a/backends/controller_base.py
+++ b/backends/controller_base.py
@@ -65,7 +65,7 @@ class ControllerBase(PolicyBase):
         total_reward += self.current_node.high_level_extra_reward
 
         # TODO for info
-        return observation, total_reward, self.env.termination_condition, info
+        return observation, total_reward, terminal, info
 
     # TODO: Looks generic. Move to an intermediate class/highlevel manager so that base class can be clean
     ''' Executes one step of current node. Sets node_terminal_state_reached flag if node termination condition
@@ -76,9 +76,7 @@ class ControllerBase(PolicyBase):
 
     def low_level_step_current_node(self):
 
-        u_ego = self.current_node.low_level_policy(
-            self.current_node.get_reduced_features_tuple())
+        u_ego = self.current_node.low_level_policy(self.current_node.get_reduced_features_tuple())
         feature, R, terminal, info = self.current_node.step(u_ego)
         self.node_terminal_state_reached = terminal
-        return self.env.get_features_tuple(
-        ), R, self.env.termination_condition, info
+        return self.env.get_features_tuple(), R, self.env.termination_condition, info
diff --git a/backends/kerasrl_learner.py b/backends/kerasrl_learner.py
index 3d79734..405a9ba 100644
--- a/backends/kerasrl_learner.py
+++ b/backends/kerasrl_learner.py
@@ -317,6 +317,7 @@ class DQNLearner(LearnerBase):
             nb_steps_warmup=self.nb_steps_warmup,
             target_model_update=self.target_model_update,
             policy=policy,
+            test_policy=test_policy,
             enable_dueling_network=True)
 
         agent.compile(Adam(lr=self.lr), metrics=['mae'])
@@ -327,6 +328,8 @@ class DQNLearner(LearnerBase):
               env,
               nb_steps=1000000,
               visualize=False,
+              verbose=1,
+              log_interval=10000,
               nb_max_episode_steps=200,
               tensorboard=False,
               model_checkpoints=False,
@@ -346,7 +349,8 @@ class DQNLearner(LearnerBase):
             env,
             nb_steps=nb_steps,
             visualize=visualize,
-            verbose=1,
+            verbose=verbose,
+            log_interval=log_interval,
             nb_max_episode_steps=nb_max_episode_steps,
             callbacks=callbacks)
 
@@ -479,21 +483,13 @@ class RestrictedGreedyQPolicy(GreedyQPolicy):
             Selection action
         """
         assert q_values.ndim == 1
-        nb_actions = q_values.shape[0]
-        restricted_q_values = list()
-
-        for i in range(0, nb_actions):
-            if q_values[i] != -np.inf:
-                restricted_q_values.append(q_values[i])
 
-        # every q_value is -np.inf (this sometimes inevitably happens within the fit and test functions
-        # of kerasrl at the terminal stage as they force to call forward in Kerasrl-learner which calls this function.
-        # In this case, we choose a policy randomly.
-        if len(restricted_q_values) < 1:
-            action = np.random.random_integers(0, nb_actions - 1)
+        if np.max(q_values) == -np.inf:
+            # every q_value is -np.inf; fall back to action 0
+            action = 0
 
         else:
-            action = np.argmax(restricted_q_values)
+            action = np.argmax(q_values)
 
         return action
 
@@ -521,6 +517,7 @@ class DQNAgentOverOptions(DQNAgent):
     def __get_invalid_node_indices(self):
         """Returns a list of option indices that are invalid according to
         initiation conditions."""
+
         invalid_node_indices = list()
         for index, option_alias in enumerate(self.low_level_policy_aliases):
             # TODO: Locate reset_maneuver to another place as this is a "get" function.
diff --git a/config.json b/config.json
index 5e40c77..b8d9b1c 100644
--- a/config.json
+++ b/config.json
@@ -3,7 +3,7 @@
 		"wait": "Wait",
 		"follow": "Follow",
 		"stop": "Stop",
-		"immediatestop": "ImmediateStop",
+		"changelane": "ChangeLane",
 		"keeplane": "KeepLane"
 	},
 
diff --git a/env/env_base.py b/env/env_base.py
index 4d28722..038d59a 100644
--- a/env/env_base.py
+++ b/env/env_base.py
@@ -157,7 +157,7 @@ class EpisodicEnvBase(GymCompliantEnvBase):
         if self._terminate_in_goal and self.goal_achieved:
             return True
 
-        return self.violation_happened and self._LTL_preconditions_enable
+        return self.violation_happened
 
     @property
     def goal_achieved(self):
diff --git a/env/simple_intersection/road_geokinemetry.py b/env/simple_intersection/road_geokinemetry.py
index 63190be..602b3e5 100644
--- a/env/simple_intersection/road_geokinemetry.py
+++ b/env/simple_intersection/road_geokinemetry.py
@@ -93,7 +93,7 @@ vlanes = Route(
     [-vwidth - 5.0 - intersection_voffset, -vwidth - intersection_voffset], 35,
     [-4.0, 4.0])
 
-intersection_width = vlanes.n_lanes * vlanes.width
+intersection_width = vlanes.n_lanes * vlanes.width
 intersection_height = hlanes.n_lanes * hlanes.width
 
 intersection_width_w_offset = intersection_width + 2 * intersection_hoffset
diff --git a/env/simple_intersection/simple_intersection_env.py b/env/simple_intersection/simple_intersection_env.py
index 32f50ef..a2faa92 100644
--- a/env/simple_intersection/simple_intersection_env.py
+++ b/env/simple_intersection/simple_intersection_env.py
@@ -51,7 +51,9 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase):
 
     #: The weight vector to calculate the cost. In the maneuver, cost_weights
     # can be set to a specific value which may be different than the default.
-    cost_weights = (1.0, 0.25, 0.1, 1.0, 100.0, 0.1, 0.25, 0.1)
+    # TODO: check _cost_weights both here and in ManeuverBase. The _cost_weights should be substituted here, but sometimes it is not.
+    _cost_weights = (10.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3,
+                     100.0 * 1e-3, 0.1 * 1e-3, 0.05 * 1e-3, 0.1 * 1e-3)
 
     #TODO: Move this to constants
     # The empirical min and max of each term in the cost vector, which is used to normalize the values
diff --git a/options/options_loader.py b/options/options_loader.py
index bce1a6c..f4f7386 100644
--- a/options/options_loader.py
+++ b/options/options_loader.py
@@ -180,11 +180,6 @@ class OptionsGraph:
                 print("\n Warning: the trained low-level policy of \"" + key +
                       "\" does not exists; the manual policy will be used.\n")
 
-            # setting the cost weights for high-level policy training.
-            # TODO: this shouldn't be initialized here, but within ManeuverBase class (e.g. make some flag indicating high-level training and set the weights)...
-            maneuver._cost_weights = (100.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3,
-                                     100.0 * 1e-3, 0.1 * 1e-3, 0.25 * 1e-3, 0.1 * 1e-3)
-
             if self.config["method"] == "mcts":
                 maneuver.timeout = np.inf
 
diff --git a/options/simple_intersection/maneuver_base.py b/options/simple_intersection/maneuver_base.py
index fbbc3ed..82bae5e 100644
--- a/options/simple_intersection/maneuver_base.py
+++ b/options/simple_intersection/maneuver_base.py
@@ -30,7 +30,7 @@ class ManeuverBase(EpisodicEnvBase):
     # as a negative reward, so a cost will be summed up to the reward
     # with subtraction.
     _cost_weights = (10.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3,
-                     100.0 * 1e-3, 0.1 * 1e-3, 0.25 * 1e-3, 0.1 * 1e-3)
+                     100.0 * 1e-3, 0.1 * 1e-3, 0.05 * 1e-3, 0.1 * 1e-3)
 
     _extra_r_terminal = None
     _extra_r_on_timeout = None
@@ -38,7 +38,7 @@ class ManeuverBase(EpisodicEnvBase):
     #: the flag being False when _cost_weights is used without
     # modification; If True, then the action parts of _cost_weights
     # are increased for some edge cases (see the step method).
-    _extra_action_weights_flag = True
+    _extra_action_weights_flag = False
 
     #: the extra weights on the actions added to _cost_weights
     # for some edge cases when _extra_action_weights_flag = True.
diff --git a/options/simple_intersection/maneuvers.py b/options/simple_intersection/maneuvers.py
index 570b4ac..67c8318 100644
--- a/options/simple_intersection/maneuvers.py
+++ b/options/simple_intersection/maneuvers.py
@@ -15,11 +15,10 @@ class KeepLane(ManeuverBase):
         self._target_lane = self.env.ego.APs['lane']
 
     def _init_LTL_preconditions(self):
-        self._LTL_preconditions.append(LTLProperty("G ( not veh_ahead )", 0))
 
-        #self._LTL_preconditions.append(
-        #    LTLProperty("G ( not stopped_now )", 100,
-        #                self._enable_low_level_training_properties))
+        self._LTL_preconditions.append(
+            LTLProperty("G ( not stopped_now )", 100,
+                        self._enable_low_level_training_properties))
 
         self._LTL_preconditions.append(
             LTLProperty(
@@ -37,6 +36,7 @@ class KeepLane(ManeuverBase):
         self.env._terminate_in_goal = False
         self.env._reward_in_goal = None
         self._enable_low_level_training_properties = True
+        self._extra_action_weights_flag = True
 
     def generate_validation_scenario(self):
         self.generate_scenario(
@@ -65,7 +65,7 @@ class KeepLane(ManeuverBase):
         return False
 
 
-class ImmediateStop(ManeuverBase):
+class Halt(ManeuverBase):
 
     _terminate_in_goal = True
     _reward_in_goal = None
@@ -102,6 +102,7 @@ class ImmediateStop(ManeuverBase):
         self.env._reward_in_goal = None
         self._reward_in_goal = 200
         self._enable_low_level_training_properties = True
+        self._extra_action_weights_flag = True
 
     def generate_validation_scenario(self):
         self._ego_pos_range = (rd.hlanes.start_pos, rd.hlanes.end_pos)
@@ -147,14 +148,11 @@ class Stop(ManeuverBase):
         self._target_lane = self.env.ego.APs['lane']
 
     def _init_LTL_preconditions(self):
-        #self._LTL_preconditions.append(
-        #    LTLProperty("G ( not has_stopped_in_stop_region )",
-        #                self._penalty(self._reward_in_goal)))
-
         self._LTL_preconditions.append(
-            LTLProperty("G ( not has_stopped_in_stop_region )", -150,
-                        not self._enable_low_level_training_properties))
+            LTLProperty("G ( not has_stopped_in_stop_region )",
+                        self._penalty(self._reward_in_goal), not self._enable_low_level_training_properties))
 
+        # before_intersection rather than "before_but_close_to_stop_region or in_stop_region"?
         self._LTL_preconditions.append(
             LTLProperty(
                 "G ( (before_but_close_to_stop_region or in_stop_region) U has_stopped_in_stop_region )",
@@ -191,6 +189,7 @@ class Stop(ManeuverBase):
         self._reward_in_goal = 200
         self._penalty_in_violation = 150
         self._enable_low_level_training_properties = True
+        self._extra_action_weights_flag = True
 
     def _low_level_manual_policy(self):
         return self.env.aggressive_driving_policy(EGO_INDEX)
@@ -234,27 +233,21 @@ class Wait(ManeuverBase):
     def _init_LTL_preconditions(self):
 
         self._LTL_preconditions.append(
+            LTLProperty("G ( (in_stop_region and stopped_now) and not (highest_priority and intersection_is_clear))",
+                        None, not self._enable_low_level_training_properties))  # not available in low-level training...
 
-        LTLProperty(
-            "G ( (in_stop_region and stopped_now) and not (highest_priority and intersection_is_clear))", 0,
-            not self._enable_low_level_training_properties))  # not available in low-level training...
-
-        #LTLProperty(
-        #    "G ( (in_stop_region and stopped_now) U (highest_priority and intersection_is_clear))", 0,
-        #    not self._enable_low_level_training_properties))  # not available in low-level training...
-
-        #self._LTL_preconditions.append(
-        #    LTLProperty("G ( not (in_intersection and highest_priority) )",
-        #                self._penalty(self._reward_in_goal)))
+        self._LTL_preconditions.append(
+            LTLProperty("G ( not (in_intersection and highest_priority) )",
+                        self._penalty(self._reward_in_goal)))
 
-        #self._LTL_preconditions.append(
-        #    LTLProperty(
-        #        "G ( in_stop_region U (highest_priority and intersection_is_clear) )", 150, self._enable_low_level_training_properties))
+        self._LTL_preconditions.append(
+            LTLProperty(
+                "G ( in_stop_region U (highest_priority and intersection_is_clear) )", 150, self._enable_low_level_training_properties))
 
-        #self._LTL_preconditions.append(
-        #    LTLProperty(
-        #        "G ( (lane and target_lane) or (not lane and not target_lane) )",
-        #        100, self._enable_low_level_training_properties))
+        self._LTL_preconditions.append(
+            LTLProperty(
+                "G ( (lane and target_lane) or (not lane and not target_lane) )",
+                100, self._enable_low_level_training_properties))
 
     def _init_param(self):
         self._v_ref = 0 #if self._enable_low_level_training_properties else rd.speed_limit
@@ -324,7 +317,7 @@ class ChangeLane(ManeuverBase):
 
     _violation_penalty_in_low_level_training = None
 
-    # high_level_extra_reward = -1000000
+    high_level_extra_reward = -50
 
     def _init_param(self):
         self._v_ref = rd.speed_limit
@@ -332,13 +325,17 @@ class ChangeLane(ManeuverBase):
 
     def _init_LTL_preconditions(self):
         self._LTL_preconditions.append(
-            LTLProperty("G ( on_route and not over_speed_limit )",
-                        self._violation_penalty_in_low_level_training,
-                        self._enable_low_level_training_properties))
+            LTLProperty("G ( on_route and not over_speed_limit )",
+                        self._violation_penalty_in_low_level_training,
+                        self._enable_low_level_training_properties))
 
         self._LTL_preconditions.append(
-            LTLProperty("G ( not stopped_now )",
-                        100, self._enable_low_level_training_properties))
+            LTLProperty("G ( not stopped_now )",
+                        100, self._enable_low_level_training_properties))
+
+        self._LTL_preconditions.append(
+            LTLProperty("G ( not in_intersection and not in_stop_region )",
+                        None, not self._enable_low_level_training_properties))  # activated only for the high-level case.
 
     @property
     def goal_achieved(self):
@@ -346,9 +343,9 @@ class ChangeLane(ManeuverBase):
         APs = self.env.ego.APs
         on_other_lane = APs['lane'] == self._target_lane
         achieved_y_displacement = np.sign(ego.y) * \
-                                  (ego.y - rd.hlanes.centres[APs['target_lane']]) >= - self.min_y_distance
+                                   (ego.y - rd.hlanes.centres[APs['target_lane']]) >= - self.min_y_distance
         return on_other_lane and APs['on_route'] and \
-               achieved_y_displacement and APs['parallel_to_lane']
+            achieved_y_displacement and APs['parallel_to_lane']
 
     def _low_level_manual_policy(self):
         return self.env.aggressive_driving_policy(EGO_INDEX)
@@ -366,6 +363,7 @@ class ChangeLane(ManeuverBase):
         self._reward_in_goal = 200
         self._violation_penalty_in_low_level_training = 150
         self._enable_low_level_training_properties = True
+        self._extra_action_weights_flag = True
 
     def generate_validation_scenario(self):
         self.generate_scenario(
@@ -396,20 +394,17 @@ class Follow(ManeuverBase):
             LTLProperty("G ( veh_ahead )", self._penalty_for_out_of_range))
 
         self._LTL_preconditions.append(
-            LTLProperty(
-                "G ( (lane and target_lane) or (not lane and not target_lane) )",
-                self._penalty_for_change_lane))
+            LTLProperty(
+                "G ( (lane and target_lane) or (not lane and not target_lane) )",
+                self._penalty_for_change_lane))
 
-        #self._LTL_preconditions.append(
+        # self._LTL_preconditions.append(
         #    LTLProperty("G ( not stopped_now U veh_ahead_stopped_now)", 200,
         #                self._enable_low_level_training_properties))
 
         self._LTL_preconditions.append(
-            LTLProperty("G ( not veh_ahead_too_close )", self._penalty_for_out_of_range,
-                        self._enable_low_level_training_properties))
-
-        self._LTL_preconditions.append(
-            LTLProperty("G ( not in_stop_region)", 0, not self._enable_low_level_training_properties))
+            LTLProperty("G ( not veh_ahead_too_close )", self._penalty_for_out_of_range,
+                        self._enable_low_level_training_properties))
 
     def generate_learning_scenario(self):
         self.generate_scenario(
@@ -422,6 +417,7 @@ class Follow(ManeuverBase):
         self._penalty_for_out_of_range = 200
         self._penalty_for_change_lane = 170
         self._enable_low_level_training_properties = True
+        self._extra_action_weights_flag = True
 
     def generate_validation_scenario(self):
         self.generate_learning_scenario()
@@ -434,12 +430,12 @@ class Follow(ManeuverBase):
 
     def _set_v_ref(self):
         #if self._enable_low_level_training_properties:
-            self._target_veh_i, _ = self.env.get_V2V_distance()
+        self._target_veh_i, _ = self.env.get_V2V_distance()
 
-            if self._target_veh_i is not None:
-                self._v_ref = self.env.vehs[self._target_veh_i].v
-            else:
-                self._v_ref = 0
+        if self._target_veh_i is not None:
+            self._v_ref = self.env.vehs[self._target_veh_i].v
+        else:
+            self._v_ref = 0
         #else:
         #    self._v_ref = rd.speed_limit
 
@@ -448,19 +444,17 @@ class Follow(ManeuverBase):
 
     @property
     def extra_termination_condition(self):
-        APs = self.env.ego.APs
+        # APs = self.env.ego.APs
 
         if self._target_veh_i is None:
             return False
 
-        elif not self._enable_low_level_training_properties:  # activated only for the high-level training.
-            if (APs['in_stop_region'] or APs['before_but_close_to_stop_region']) \
-                and (self.env.vehs[self._target_veh_i].APs['in_intersection'] or
-                     self.env.vehs[self._target_veh_i].x > 0):
-                return True
-            else:
-                return False
-
+        #elif not self._enable_low_level_training_properties:  # activated only for the high-level training.
+        # if (APs['in_stop_region'] or APs['before_but_close_to_stop_region']) \
+        #     and (self.env.vehs[self._target_veh_i].APs['in_intersection'] or
+        #          self.env.vehs[self._target_veh_i].x > 0):
+        #     return True
+        # else:
         return False
 
     def _features_dim_reduction(self, features_tuple):
-- 
GitLab