Commit 7dc47bce authored by Jae Young Lee

Fix the bug of sometimes having -100000 or 10000 rewards.

parent 4a9327bd
......@@ -65,7 +65,7 @@ class ControllerBase(PolicyBase):
total_reward += self.current_node.high_level_extra_reward
# TODO for info
- return observation, total_reward, self.env.termination_condition, info
+ return observation, total_reward, terminal, info
# TODO: Looks generic. Move to an intermediate class/highlevel manager so that base class can be clean
''' Executes one step of current node. Sets node_terminal_state_reached flag if node termination condition
......@@ -76,9 +76,7 @@ class ControllerBase(PolicyBase):
def low_level_step_current_node(self):
- u_ego = self.current_node.low_level_policy(
-     self.current_node.get_reduced_features_tuple())
+ u_ego = self.current_node.low_level_policy(self.current_node.get_reduced_features_tuple())
feature, R, terminal, info = self.current_node.step(u_ego)
self.node_terminal_state_reached = terminal
- return self.env.get_features_tuple(
-     ), R, self.env.termination_condition, info
+ return self.env.get_features_tuple(), R, self.env.termination_condition, info
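For reference, the post-change method assembled from the added lines above reads as follows (indentation is assumed, since the diff view strips it):

def low_level_step_current_node(self):
    # Run the current node's low-level policy on its reduced feature tuple.
    u_ego = self.current_node.low_level_policy(self.current_node.get_reduced_features_tuple())
    feature, R, terminal, info = self.current_node.step(u_ego)
    self.node_terminal_state_reached = terminal
    # Return the env-level features and the env-level termination condition, not the node's.
    return self.env.get_features_tuple(), R, self.env.termination_condition, info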
......@@ -317,6 +317,7 @@ class DQNLearner(LearnerBase):
nb_steps_warmup=self.nb_steps_warmup,
target_model_update=self.target_model_update,
policy=policy,
+ test_policy=test_policy,
enable_dueling_network=True)
agent.compile(Adam(lr=self.lr), metrics=['mae'])
......@@ -327,6 +328,8 @@ class DQNLearner(LearnerBase):
env,
nb_steps=1000000,
visualize=False,
+ verbose=1,
+ log_interval=10000,
nb_max_episode_steps=200,
tensorboard=False,
model_checkpoints=False,
......@@ -346,7 +349,8 @@ class DQNLearner(LearnerBase):
env,
nb_steps=nb_steps,
visualize=visualize,
- verbose=1,
+ verbose=verbose,
+ log_interval=log_interval,
nb_max_episode_steps=nb_max_episode_steps,
callbacks=callbacks)
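The new test_policy, verbose, and log_interval arguments are forwarded to keras-rl. A minimal standalone sketch of that keras-rl API (model, nb_actions, and env below are placeholders, not objects from this repository):

from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy, GreedyQPolicy

# policy drives exploration during fit(); test_policy is used by test().
agent = DQNAgent(model=model, nb_actions=nb_actions,
                 memory=SequentialMemory(limit=50000, window_length=1),
                 nb_steps_warmup=1000, target_model_update=1e-2,
                 policy=EpsGreedyQPolicy(), test_policy=GreedyQPolicy(),
                 enable_dueling_network=True)
agent.compile(Adam(lr=1e-3), metrics=['mae'])

# verbose=1 prints one training summary per log_interval steps; both are now configurable.
agent.fit(env, nb_steps=1000000, visualize=False, verbose=1,
          log_interval=10000, nb_max_episode_steps=200)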
......@@ -479,21 +483,13 @@ class RestrictedGreedyQPolicy(GreedyQPolicy):
Selection action
"""
assert q_values.ndim == 1
nb_actions = q_values.shape[0]
- restricted_q_values = list()
- for i in range(0, nb_actions):
-     if q_values[i] != -np.inf:
-         restricted_q_values.append(q_values[i])
- # every q_value is -np.inf (this sometimes inevitably happens within the fit and test functions
- # of kerasrl at the terminal stage as they force to call forward in Kerasrl-learner which calls this function.
- # In this case, we choose a policy randomly.
- if len(restricted_q_values) < 1:
-     action = np.random.random_integers(0, nb_actions - 1)
+ if np.max(q_values) == - np.inf:
+     # every q_value is -np.inf, we choose action = 0
+     action = 0
else:
-     action = np.argmax(restricted_q_values)
+     action = np.argmax(q_values)
return action
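Assembled from the added lines, the simplified selection rule is roughly the following (a sketch of the method body only; the enclosing class and docstring are unchanged):

import numpy as np

def select_action(self, q_values):
    """Pick the greedy action, falling back to action 0 when every Q-value
    is -np.inf (keras-rl still calls forward at terminal steps)."""
    assert q_values.ndim == 1
    if np.max(q_values) == -np.inf:
        # All actions are masked out; return a fixed action instead of a random one.
        action = 0
    else:
        action = np.argmax(q_values)
    return action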
......@@ -521,6 +517,7 @@ class DQNAgentOverOptions(DQNAgent):
def __get_invalid_node_indices(self):
"""Returns a list of option indices that are invalid according to
initiation conditions."""
invalid_node_indices = list()
for index, option_alias in enumerate(self.low_level_policy_aliases):
# TODO: Locate reset_maneuver to another place as this is a "get" function.
......
......@@ -3,7 +3,7 @@
"wait": "Wait",
"follow": "Follow",
"stop": "Stop",
"immediatestop": "ImmediateStop",
"changelane": "ChangeLane",
"keeplane": "KeepLane"
},
......
......@@ -157,7 +157,7 @@ class EpisodicEnvBase(GymCompliantEnvBase):
if self._terminate_in_goal and self.goal_achieved:
return True
- return self.violation_happened and self._LTL_preconditions_enable
+ return self.violation_happened
@property
def goal_achieved(self):
......
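After this change the violation check no longer depends on _LTL_preconditions_enable. A sketch of the resulting condition, assuming the hunk belongs to the env's termination property (the property name is inferred from the controller code above and is not shown in this hunk):

@property
def termination_condition(self):
    # Terminate on reaching the goal (if configured to) or on any LTL violation,
    # regardless of whether LTL preconditions are currently enabled.
    if self._terminate_in_goal and self.goal_achieved:
        return True
    return self.violation_happened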
......@@ -93,7 +93,7 @@ vlanes = Route(
[-vwidth - 5.0 - intersection_voffset, -vwidth - intersection_voffset], 35,
[-4.0, 4.0])
- intersection_width = vlanes.n_lanes * vlanes.width
+ intersection_width = vlanes.n_lanes * vlanes.width
intersection_height = hlanes.n_lanes * hlanes.width
intersection_width_w_offset = intersection_width + 2 * intersection_hoffset
......
......@@ -51,7 +51,9 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase):
#: The weight vector to calculate the cost. In the maneuver, cost_weights
# can be set to a specific value which may be different than the default.
- cost_weights = (1.0, 0.25, 0.1, 1.0, 100.0, 0.1, 0.25, 0.1)
+ # TODO: check _cost_weights in both here and ManeuverBase. The _cost_weights has to be substituted to here, but it doesn't sometimes.
+ _cost_weights = (10.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3,
+                  100.0 * 1e-3, 0.1 * 1e-3, 0.05 * 1e-3, 0.1 * 1e-3)
#TODO: Move this to constants
# The empirical min and max of each term in the cost vector, which is used to normalize the values
......
......@@ -180,11 +180,6 @@ class OptionsGraph:
print("\n Warning: the trained low-level policy of \"" + key +
"\" does not exists; the manual policy will be used.\n")
- # setting the cost weights for high-level policy training.
- # TODO: this shouldn't be initialized here, but within ManeuverBase class (e.g. make some flag indicating high-level training and set the weights)...
- maneuver._cost_weights = (100.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3,
-                           100.0 * 1e-3, 0.1 * 1e-3, 0.25 * 1e-3, 0.1 * 1e-3)
if self.config["method"] == "mcts":
maneuver.timeout = np.inf
......
......@@ -30,7 +30,7 @@ class ManeuverBase(EpisodicEnvBase):
# as a negative reward, so a cost will be summed up to the reward
# with subtraction.
_cost_weights = (10.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3,
-                  100.0 * 1e-3, 0.1 * 1e-3, 0.25 * 1e-3, 0.1 * 1e-3)
+                  100.0 * 1e-3, 0.1 * 1e-3, 0.05 * 1e-3, 0.1 * 1e-3)
_extra_r_terminal = None
_extra_r_on_timeout = None
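The comment above says each cost term enters the reward negatively. A minimal sketch of that weighted-cost pattern, assuming a per-step cost vector of the same length as _cost_weights (the shaped_reward helper and cost_terms name are illustrative, not this repository's code):

import numpy as np

_cost_weights = (10.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3,
                 100.0 * 1e-3, 0.1 * 1e-3, 0.05 * 1e-3, 0.1 * 1e-3)

def shaped_reward(base_reward, cost_terms, weights=_cost_weights):
    # Weighted costs are subtracted from the raw reward; the 1e-3 scale keeps
    # them small relative to terminal rewards such as +200.
    return base_reward - float(np.dot(weights, cost_terms))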
......@@ -38,7 +38,7 @@ class ManeuverBase(EpisodicEnvBase):
#: the flag being False when _cost_weights is used without
# modification; If True, then the action parts of _cost_weights
# are increased for some edge cases (see the step method).
- _extra_action_weights_flag = True
+ _extra_action_weights_flag = False
#: the extra weights on the actions added to _cost_weights
# for some edge cases when _extra_action_weights_flag = True.
......
......@@ -15,11 +15,10 @@ class KeepLane(ManeuverBase):
self._target_lane = self.env.ego.APs['lane']
def _init_LTL_preconditions(self):
self._LTL_preconditions.append(LTLProperty("G ( not veh_ahead )", 0))
- #self._LTL_preconditions.append(
- #    LTLProperty("G ( not stopped_now )", 100,
- #                self._enable_low_level_training_properties))
+ self._LTL_preconditions.append(
+     LTLProperty("G ( not stopped_now )", 100,
+                 self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty(
......@@ -37,6 +36,7 @@ class KeepLane(ManeuverBase):
self.env._terminate_in_goal = False
self.env._reward_in_goal = None
self._enable_low_level_training_properties = True
+ self._extra_action_weights_flag = True
def generate_validation_scenario(self):
self.generate_scenario(
......@@ -65,7 +65,7 @@ class KeepLane(ManeuverBase):
return False
- class ImmediateStop(ManeuverBase):
+ class Halt(ManeuverBase):
_terminate_in_goal = True
_reward_in_goal = None
......@@ -102,6 +102,7 @@ class ImmediateStop(ManeuverBase):
self.env._reward_in_goal = None
self._reward_in_goal = 200
self._enable_low_level_training_properties = True
+ self._extra_action_weights_flag = True
def generate_validation_scenario(self):
self._ego_pos_range = (rd.hlanes.start_pos, rd.hlanes.end_pos)
......@@ -147,14 +148,11 @@ class Stop(ManeuverBase):
self._target_lane = self.env.ego.APs['lane']
def _init_LTL_preconditions(self):
- #self._LTL_preconditions.append(
- #    LTLProperty("G ( not has_stopped_in_stop_region )",
- #                self._penalty(self._reward_in_goal)))
self._LTL_preconditions.append(
-     LTLProperty("G ( not has_stopped_in_stop_region )", -150,
-                 not self._enable_low_level_training_properties))
+     LTLProperty("G ( not has_stopped_in_stop_region )",
+                 self._penalty(self._reward_in_goal), not self._enable_low_level_training_properties))
# before_intersection rather than "before_but_close_to_stop_region or in_stop_region"?
self._LTL_preconditions.append(
LTLProperty(
"G ( (before_but_close_to_stop_region or in_stop_region) U has_stopped_in_stop_region )",
......@@ -191,6 +189,7 @@ class Stop(ManeuverBase):
self._reward_in_goal = 200
self._penalty_in_violation = 150
self._enable_low_level_training_properties = True
+ self._extra_action_weights_flag = True
def _low_level_manual_policy(self):
return self.env.aggressive_driving_policy(EGO_INDEX)
......@@ -234,27 +233,21 @@ class Wait(ManeuverBase):
def _init_LTL_preconditions(self):
self._LTL_preconditions.append(
LTLProperty("G ( (in_stop_region and stopped_now) and not (highest_priority and intersection_is_clear))",
None, not self._enable_low_level_training_properties)) # not available in low-level training...
LTLProperty(
"G ( (in_stop_region and stopped_now) and not (highest_priority and intersection_is_clear))", 0,
not self._enable_low_level_training_properties)) # not available in low-level training...
#LTLProperty(
# "G ( (in_stop_region and stopped_now) U (highest_priority and intersection_is_clear))", 0,
# not self._enable_low_level_training_properties)) # not available in low-level training...
- #self._LTL_preconditions.append(
- #    LTLProperty("G ( not (in_intersection and highest_priority) )",
- #                self._penalty(self._reward_in_goal)))
self._LTL_preconditions.append(
LTLProperty("G ( not (in_intersection and highest_priority) )",
self._penalty(self._reward_in_goal)))
- #self._LTL_preconditions.append(
- #    LTLProperty(
- #        "G ( in_stop_region U (highest_priority and intersection_is_clear) )", 150, self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty(
"G ( in_stop_region U (highest_priority and intersection_is_clear) )", 150, self._enable_low_level_training_properties))
- #self._LTL_preconditions.append(
- #    LTLProperty(
- #        "G ( (lane and target_lane) or (not lane and not target_lane) )",
- #        100, self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty(
"G ( (lane and target_lane) or (not lane and not target_lane) )",
100, self._enable_low_level_training_properties))
def _init_param(self):
self._v_ref = 0 #if self._enable_low_level_training_properties else rd.speed_limit
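Across these maneuvers, preconditions are registered as LTLProperty(formula, penalty, enabled), and the fix consistently replaces None or ad-hoc penalties with explicit numeric ones. A short sketch of that registration pattern as it reads after the change, assembled from the active lines above (illustrative only; indentation assumed):

def _init_LTL_preconditions(self):
    # The availability condition now carries penalty 0 instead of None.
    self._LTL_preconditions.append(
        LTLProperty(
            "G ( (in_stop_region and stopped_now) and not (highest_priority and intersection_is_clear))",
            0, not self._enable_low_level_training_properties))
    # Numeric penalty for a property active during low-level training.
    self._LTL_preconditions.append(
        LTLProperty(
            "G ( in_stop_region U (highest_priority and intersection_is_clear) )",
            150, self._enable_low_level_training_properties))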
......@@ -324,7 +317,7 @@ class ChangeLane(ManeuverBase):
_violation_penalty_in_low_level_training = None
- # high_level_extra_reward = -1000000
+ high_level_extra_reward = -50
def _init_param(self):
self._v_ref = rd.speed_limit
......@@ -332,13 +325,17 @@ class ChangeLane(ManeuverBase):
def _init_LTL_preconditions(self):
self._LTL_preconditions.append(
LTLProperty("G ( on_route and not over_speed_limit )",
self._violation_penalty_in_low_level_training,
self._enable_low_level_training_properties))
LTLProperty("G ( on_route and not over_speed_limit )",
self._violation_penalty_in_low_level_training,
self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty("G ( not stopped_now )",
100, self._enable_low_level_training_properties))
LTLProperty("G ( not stopped_now )",
100, self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty("G ( not in_intersection and not in_stop_region )",
None, not self._enable_low_level_training_properties)) # activated only for the high-level case.
@property
def goal_achieved(self):
......@@ -346,9 +343,9 @@ class ChangeLane(ManeuverBase):
APs = self.env.ego.APs
on_other_lane = APs['lane'] == self._target_lane
achieved_y_displacement = np.sign(ego.y) * \
-     (ego.y - rd.hlanes.centres[APs['target_lane']]) >= - self.min_y_distance
+     (ego.y - rd.hlanes.centres[APs['target_lane']]) >= - self.min_y_distance
return on_other_lane and APs['on_route'] and \
-     achieved_y_displacement and APs['parallel_to_lane']
+     achieved_y_displacement and APs['parallel_to_lane']
def _low_level_manual_policy(self):
return self.env.aggressive_driving_policy(EGO_INDEX)
......@@ -366,6 +363,7 @@ class ChangeLane(ManeuverBase):
self._reward_in_goal = 200
self._violation_penalty_in_low_level_training = 150
self._enable_low_level_training_properties = True
+ self._extra_action_weights_flag = True
def generate_validation_scenario(self):
self.generate_scenario(
......@@ -396,20 +394,17 @@ class Follow(ManeuverBase):
LTLProperty("G ( veh_ahead )", self._penalty_for_out_of_range))
self._LTL_preconditions.append(
-     LTLProperty(
-         "G ( (lane and target_lane) or (not lane and not target_lane) )",
-         self._penalty_for_change_lane))
+     LTLProperty(
+         "G ( (lane and target_lane) or (not lane and not target_lane) )",
+         self._penalty_for_change_lane))
- #self._LTL_preconditions.append(
+ # self._LTL_preconditions.append(
# LTLProperty("G ( not stopped_now U veh_ahead_stopped_now)", 200,
# self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty("G ( not veh_ahead_too_close )", self._penalty_for_out_of_range,
self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty("G ( not in_stop_region)", 0, not self._enable_low_level_training_properties))
LTLProperty("G ( not veh_ahead_too_close )", self._penalty_for_out_of_range,
self._enable_low_level_training_properties))
def generate_learning_scenario(self):
self.generate_scenario(
......@@ -422,6 +417,7 @@ class Follow(ManeuverBase):
self._penalty_for_out_of_range = 200
self._penalty_for_change_lane = 170
self._enable_low_level_training_properties = True
+ self._extra_action_weights_flag = True
def generate_validation_scenario(self):
self.generate_learning_scenario()
......@@ -434,12 +430,12 @@ class Follow(ManeuverBase):
def _set_v_ref(self):
#if self._enable_low_level_training_properties:
- self._target_veh_i, _ = self.env.get_V2V_distance()
+ self._target_veh_i, _ = self.env.get_V2V_distance()
- if self._target_veh_i is not None:
-     self._v_ref = self.env.vehs[self._target_veh_i].v
- else:
-     self._v_ref = 0
+ if self._target_veh_i is not None:
+     self._v_ref = self.env.vehs[self._target_veh_i].v
+ else:
+     self._v_ref = 0
#else:
# self._v_ref = rd.speed_limit
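After the re-indentation, the reference-speed update is no longer nested under the commented-out training check; assembled from the added lines above (indentation assumed):

def _set_v_ref(self):
    # Follow the detected lead vehicle's speed; otherwise hold a zero reference speed.
    self._target_veh_i, _ = self.env.get_V2V_distance()
    if self._target_veh_i is not None:
        self._v_ref = self.env.vehs[self._target_veh_i].v
    else:
        self._v_ref = 0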
......@@ -448,19 +444,17 @@ class Follow(ManeuverBase):
@property
def extra_termination_condition(self):
- APs = self.env.ego.APs
+ # APs = self.env.ego.APs
if self._target_veh_i is None:
return False
- elif not self._enable_low_level_training_properties: # activated only for the high-level training.
-     if (APs['in_stop_region'] or APs['before_but_close_to_stop_region']) \
-             and (self.env.vehs[self._target_veh_i].APs['in_intersection'] or
-                  self.env.vehs[self._target_veh_i].x > 0):
-         return True
-     else:
-         return False
+ #elif not self._enable_low_level_training_properties: # activated only for the high-level training.
+ #    if (APs['in_stop_region'] or APs['before_but_close_to_stop_region']) \
+ #        and (self.env.vehs[self._target_veh_i].APs['in_intersection'] or
+ #        self.env.vehs[self._target_veh_i].x > 0):
+ #        return True
+ #    else:
+ return False
def _features_dim_reduction(self, features_tuple):
......