From 6da696a294dd8709487777cfb269089ede61f95c Mon Sep 17 00:00:00 2001 From: Jaeyoung Lee <jaeyoung.lee@uwaterloo.ca> Date: Fri, 25 Jan 2019 00:02:00 -0500 Subject: [PATCH] Fix the bug of having -100000 or 10000 rewards sometime. --- backends/controller_base.py | 8 +- backends/kerasrl_learner.py | 33 ++--- .../immediatestop_weights_actor.h5f | Bin .../immediatestop_weights_critic.h5f | Bin config.json | 2 +- env/env_base.py | 2 +- env/simple_intersection/road_geokinemetry.py | 2 +- .../simple_intersection_env.py | 7 +- options/options_loader.py | 5 - options/simple_intersection/maneuver_base.py | 5 +- options/simple_intersection/maneuvers.py | 114 +++++++++--------- 11 files changed, 85 insertions(+), 93 deletions(-) rename backends/trained_policies/{immediatestop => halt}/immediatestop_weights_actor.h5f (100%) rename backends/trained_policies/{immediatestop => halt}/immediatestop_weights_critic.h5f (100%) diff --git a/backends/controller_base.py b/backends/controller_base.py index 1617036..ad31856 100644 --- a/backends/controller_base.py +++ b/backends/controller_base.py @@ -65,7 +65,7 @@ class ControllerBase(PolicyBase): total_reward += self.current_node.high_level_extra_reward # TODO for info - return observation, total_reward, self.env.termination_condition, info + return observation, total_reward, terminal, info # TODO: Looks generic. Move to an intermediate class/highlevel manager so that base class can be clean ''' Executes one step of current node. Sets node_terminal_state_reached flag if node termination condition @@ -76,9 +76,7 @@ class ControllerBase(PolicyBase): def low_level_step_current_node(self): - u_ego = self.current_node.low_level_policy( - self.current_node.get_reduced_features_tuple()) + u_ego = self.current_node.low_level_policy(self.current_node.get_reduced_features_tuple()) feature, R, terminal, info = self.current_node.step(u_ego) self.node_terminal_state_reached = terminal - return self.env.get_features_tuple( - ), R, self.env.termination_condition, info + return self.env.get_features_tuple(), R, self.env.termination_condition, info diff --git a/backends/kerasrl_learner.py b/backends/kerasrl_learner.py index 3d79734..0da901b 100644 --- a/backends/kerasrl_learner.py +++ b/backends/kerasrl_learner.py @@ -237,7 +237,7 @@ class DQNLearner(LearnerBase): input_shape: Shape of observation space, e.g (10,); nb_actions: number of values in action space; model: Keras Model of actor which takes observation as input and outputs actions. Uses default if not given - policy: KerasRL Policy. Uses default MaxBoltzmannQPolicy if not given + policy: KerasRL Policy. Uses default RestrictedEpsGreedyQPolicy if not given memory: KerasRL Memory. 
Uses default SequentialMemory if not given **kwargs: other optional key-value arguments with defaults defined in property_defaults """ @@ -317,6 +317,7 @@ class DQNLearner(LearnerBase): nb_steps_warmup=self.nb_steps_warmup, target_model_update=self.target_model_update, policy=policy, + test_policy=test_policy, enable_dueling_network=True) agent.compile(Adam(lr=self.lr), metrics=['mae']) @@ -327,6 +328,8 @@ class DQNLearner(LearnerBase): env, nb_steps=1000000, visualize=False, + verbose=1, + log_interval=10000, nb_max_episode_steps=200, tensorboard=False, model_checkpoints=False, @@ -346,7 +349,8 @@ class DQNLearner(LearnerBase): env, nb_steps=nb_steps, visualize=visualize, - verbose=1, + verbose=verbose, + log_interval=log_interval, nb_max_episode_steps=nb_max_episode_steps, callbacks=callbacks) @@ -448,9 +452,11 @@ class RestrictedEpsGreedyQPolicy(EpsGreedyQPolicy): # every q_value is -np.inf (this sometimes inevitably happens within the fit and test functions # of kerasrl at the terminal stage as they force to call forward in Kerasrl-learner which calls this function. - # In this case, we choose a policy randomly. + # TODO: exception process or some more process to choose action in this exceptional case. if len(index) < 1: - action = np.random.random_integers(0, nb_actions - 1) + # every q_value is -np.inf, we choose action = 0 + action = 0 + print("Warning: no action satisfies initiation condition, action = 0 is chosen by default.") elif np.random.uniform() <= self.eps: action = index[np.random.random_integers(0, len(index) - 1)] @@ -479,21 +485,15 @@ class RestrictedGreedyQPolicy(GreedyQPolicy): Selection action """ assert q_values.ndim == 1 - nb_actions = q_values.shape[0] - restricted_q_values = list() - - for i in range(0, nb_actions): - if q_values[i] != -np.inf: - restricted_q_values.append(q_values[i]) - # every q_value is -np.inf (this sometimes inevitably happens within the fit and test functions - # of kerasrl at the terminal stage as they force to call forward in Kerasrl-learner which calls this function. - # In this case, we choose a policy randomly. - if len(restricted_q_values) < 1: - action = np.random.random_integers(0, nb_actions - 1) + # TODO: exception process or some more process to choose action in this exceptional case. + if np.max(q_values) == - np.inf: + # every q_value is -np.inf, we choose action = 0 + action = 0 + print("Warning: no action satisfies initiation condition, action = 0 is chosen by default.") else: - action = np.argmax(restricted_q_values) + action = np.argmax(q_values) return action @@ -521,6 +521,7 @@ class DQNAgentOverOptions(DQNAgent): def __get_invalid_node_indices(self): """Returns a list of option indices that are invalid according to initiation conditions.""" + invalid_node_indices = list() for index, option_alias in enumerate(self.low_level_policy_aliases): # TODO: Locate reset_maneuver to another place as this is a "get" function. 
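For context on the policy change above, the selection rule can be summarized by the standalone sketch below (illustrative only; the function name select_restricted_action and the use of np.random.choice in place of the deprecated np.random.random_integers are not part of the patch). Actions whose Q-value has been masked to -np.inf are never sampled, and when every action is masked the policy now falls back to action 0 with a warning instead of drawing a random, possibly invalid option.

    import numpy as np

    def select_restricted_action(q_values, eps=0.1):
        """Epsilon-greedy choice over actions whose Q-value is not -inf.

        Mirrors the patched RestrictedEpsGreedyQPolicy behaviour: masked
        actions (q == -inf) are never selected; if all actions are masked,
        fall back to action 0 rather than a random index.
        """
        q_values = np.asarray(q_values, dtype=float)
        valid = np.flatnonzero(q_values != -np.inf)

        if valid.size == 0:
            # Every option's initiation condition failed.
            print("Warning: no action satisfies initiation condition, "
                  "action = 0 is chosen by default.")
            return 0

        if np.random.uniform() <= eps:
            return int(np.random.choice(valid))        # explore among valid actions
        return int(valid[np.argmax(q_values[valid])])  # exploit the best valid action

For example, q_values = [-np.inf, 2.0, -np.inf] yields action 1 on the greedy branch, while an all--inf vector yields action 0 together with the warning.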
diff --git a/backends/trained_policies/immediatestop/immediatestop_weights_actor.h5f b/backends/trained_policies/halt/immediatestop_weights_actor.h5f similarity index 100% rename from backends/trained_policies/immediatestop/immediatestop_weights_actor.h5f rename to backends/trained_policies/halt/immediatestop_weights_actor.h5f diff --git a/backends/trained_policies/immediatestop/immediatestop_weights_critic.h5f b/backends/trained_policies/halt/immediatestop_weights_critic.h5f similarity index 100% rename from backends/trained_policies/immediatestop/immediatestop_weights_critic.h5f rename to backends/trained_policies/halt/immediatestop_weights_critic.h5f diff --git a/config.json b/config.json index 5e40c77..b8d9b1c 100644 --- a/config.json +++ b/config.json @@ -3,7 +3,7 @@ "wait": "Wait", "follow": "Follow", "stop": "Stop", - "immediatestop": "ImmediateStop", + "changelane": "ChangeLane", "keeplane": "KeepLane" }, diff --git a/env/env_base.py b/env/env_base.py index 4d28722..038d59a 100644 --- a/env/env_base.py +++ b/env/env_base.py @@ -157,7 +157,7 @@ class EpisodicEnvBase(GymCompliantEnvBase): if self._terminate_in_goal and self.goal_achieved: return True - return self.violation_happened and self._LTL_preconditions_enable + return self.violation_happened @property def goal_achieved(self): diff --git a/env/simple_intersection/road_geokinemetry.py b/env/simple_intersection/road_geokinemetry.py index 63190be..602b3e5 100644 --- a/env/simple_intersection/road_geokinemetry.py +++ b/env/simple_intersection/road_geokinemetry.py @@ -93,7 +93,7 @@ vlanes = Route( [-vwidth - 5.0 - intersection_voffset, -vwidth - intersection_voffset], 35, [-4.0, 4.0]) -intersection_width = vlanes.n_lanes * vlanes.width +intersection_width = vlanes.n_lanes * vlanes.width intersection_height = hlanes.n_lanes * hlanes.width intersection_width_w_offset = intersection_width + 2 * intersection_hoffset diff --git a/env/simple_intersection/simple_intersection_env.py b/env/simple_intersection/simple_intersection_env.py index 32f50ef..bd34483 100644 --- a/env/simple_intersection/simple_intersection_env.py +++ b/env/simple_intersection/simple_intersection_env.py @@ -51,7 +51,10 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase): #: The weight vector to calculate the cost. In the maneuver, cost_weights # can be set to a specific value which may be different than the default. - cost_weights = (1.0, 0.25, 0.1, 1.0, 100.0, 0.1, 0.25, 0.1) + # TODO: check _cost_weights in both here and ManeuverBase. The _cost_weights has to be substituted to here, but it doesn't sometimes. + # TODO: set a functionality of setting _cost_weights for low and high level training separately. 
+ _cost_weights = (10.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3, + 100.0 * 1e-3, 0.1 * 1e-3, 0.05 * 1e-3, 0.1 * 1e-3) #TODO: Move this to constants # The empirical min and max of each term in the cost vector, which is used to normalize the values @@ -271,7 +274,7 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase): # stopped_car_scenario = bool(np.random.randint(0, 1)) TODO: this scenario may not work n_others_stopped_in_stop_region = np.random.randint( 0, min(3, n_others - stopped_car_scenario)) - veh_ahead_scenario = bool(np.random.randint(0, 1)) or veh_ahead_scenario + veh_ahead_scenario = bool(np.random.randint(0, 2)) or veh_ahead_scenario if n_others_stopped_in_stop_region > min( n_others - stopped_car_scenario, 3): diff --git a/options/options_loader.py b/options/options_loader.py index bce1a6c..f4f7386 100644 --- a/options/options_loader.py +++ b/options/options_loader.py @@ -180,11 +180,6 @@ class OptionsGraph: print("\n Warning: the trained low-level policy of \"" + key + "\" does not exists; the manual policy will be used.\n") - # setting the cost weights for high-level policy training. - # TODO: this shouldn't be initialized here, but within ManeuverBase class (e.g. make some flag indicating high-level training and set the weights)... - maneuver._cost_weights = (100.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3, - 100.0 * 1e-3, 0.1 * 1e-3, 0.25 * 1e-3, 0.1 * 1e-3) - if self.config["method"] == "mcts": maneuver.timeout = np.inf diff --git a/options/simple_intersection/maneuver_base.py b/options/simple_intersection/maneuver_base.py index fbbc3ed..01e210d 100644 --- a/options/simple_intersection/maneuver_base.py +++ b/options/simple_intersection/maneuver_base.py @@ -29,8 +29,9 @@ class ManeuverBase(EpisodicEnvBase): # _extra_action_weights_flag = True); note that a cost is defined # as a negative reward, so a cost will be summed up to the reward # with subtraction. + # TODO: remove or to provide additional functionality, keep _cost_weights in ManeuverBase here (see other TODOs in simple_intersection_env regarding _cost_weights). _cost_weights = (10.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3, - 100.0 * 1e-3, 0.1 * 1e-3, 0.25 * 1e-3, 0.1 * 1e-3) + 100.0 * 1e-3, 0.1 * 1e-3, 0.05 * 1e-3, 0.1 * 1e-3) _extra_r_terminal = None _extra_r_on_timeout = None @@ -38,7 +39,7 @@ class ManeuverBase(EpisodicEnvBase): #: the flag being False when _cost_weights is used without # modification; If True, then the action parts of _cost_weights # are increased for some edge cases (see the step method). - _extra_action_weights_flag = True + _extra_action_weights_flag = False #: the extra weights on the actions added to _cost_weights # for some edge cases when _extra_action_weights_flag = True. 
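The _cost_weights tuple above now matches the one introduced in simple_intersection_env.py, and the extra action weights are off by default (individual maneuvers re-enable them in generate_learning_scenario). As a rough illustration of how such weights enter the step reward, here is a minimal sketch assuming the cost is a plain weighted sum of normalized per-term costs that is subtracted from the reward; the helper name weighted_cost is hypothetical, and the real logic lives in ManeuverBase / SimpleIntersectionEnv.

    import numpy as np

    # Weight values introduced by this patch (shared by the env and the maneuvers).
    COST_WEIGHTS = (10.0e-3, 10.0e-3, 0.25e-3, 1.0e-3,
                    100.0e-3, 0.1e-3, 0.05e-3, 0.1e-3)

    def weighted_cost(cost_terms, weights=COST_WEIGHTS):
        """Weighted sum of one step's normalized cost terms.

        A cost is defined as a negative reward, so the caller subtracts the
        result, e.g. reward = step_reward - weighted_cost(cost_terms).
        """
        cost_terms = np.asarray(cost_terms, dtype=float)
        assert cost_terms.shape == (len(weights),)
        return float(np.dot(weights, cost_terms))

Keeping every weight at the 1e-3 scale, rather than the previous mix of scales, bounds the per-step cost contribution and appears to be part of how the patch avoids the reward spikes mentioned in the subject line.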
diff --git a/options/simple_intersection/maneuvers.py b/options/simple_intersection/maneuvers.py index 570b4ac..67c8318 100644 --- a/options/simple_intersection/maneuvers.py +++ b/options/simple_intersection/maneuvers.py @@ -15,11 +15,10 @@ class KeepLane(ManeuverBase): self._target_lane = self.env.ego.APs['lane'] def _init_LTL_preconditions(self): - self._LTL_preconditions.append(LTLProperty("G ( not veh_ahead )", 0)) - #self._LTL_preconditions.append( - # LTLProperty("G ( not stopped_now )", 100, - # self._enable_low_level_training_properties)) + self._LTL_preconditions.append( + LTLProperty("G ( not stopped_now )", 100, + self._enable_low_level_training_properties)) self._LTL_preconditions.append( LTLProperty( @@ -37,6 +36,7 @@ class KeepLane(ManeuverBase): self.env._terminate_in_goal = False self.env._reward_in_goal = None self._enable_low_level_training_properties = True + self._extra_action_weights_flag = True def generate_validation_scenario(self): self.generate_scenario( @@ -65,7 +65,7 @@ class KeepLane(ManeuverBase): return False -class ImmediateStop(ManeuverBase): +class Halt(ManeuverBase): _terminate_in_goal = True _reward_in_goal = None @@ -102,6 +102,7 @@ class ImmediateStop(ManeuverBase): self.env._reward_in_goal = None self._reward_in_goal = 200 self._enable_low_level_training_properties = True + self._extra_action_weights_flag = True def generate_validation_scenario(self): self._ego_pos_range = (rd.hlanes.start_pos, rd.hlanes.end_pos) @@ -147,14 +148,11 @@ class Stop(ManeuverBase): self._target_lane = self.env.ego.APs['lane'] def _init_LTL_preconditions(self): - #self._LTL_preconditions.append( - # LTLProperty("G ( not has_stopped_in_stop_region )", - # self._penalty(self._reward_in_goal))) - self._LTL_preconditions.append( - LTLProperty("G ( not has_stopped_in_stop_region )", -150, - not self._enable_low_level_training_properties)) + LTLProperty("G ( not has_stopped_in_stop_region )", + self._penalty(self._reward_in_goal), not self._enable_low_level_training_properties)) + # before_intersection rather than "before_but_close_to_stop_region or in_stop_region"? self._LTL_preconditions.append( LTLProperty( "G ( (before_but_close_to_stop_region or in_stop_region) U has_stopped_in_stop_region )", @@ -191,6 +189,7 @@ class Stop(ManeuverBase): self._reward_in_goal = 200 self._penalty_in_violation = 150 self._enable_low_level_training_properties = True + self._extra_action_weights_flag = True def _low_level_manual_policy(self): return self.env.aggressive_driving_policy(EGO_INDEX) @@ -234,27 +233,21 @@ class Wait(ManeuverBase): def _init_LTL_preconditions(self): self._LTL_preconditions.append( + LTLProperty("G ( (in_stop_region and stopped_now) and not (highest_priority and intersection_is_clear))", + None, not self._enable_low_level_training_properties)) # not available in low-level training... - LTLProperty( - "G ( (in_stop_region and stopped_now) and not (highest_priority and intersection_is_clear))", 0, - not self._enable_low_level_training_properties)) # not available in low-level training... - - #LTLProperty( - # "G ( (in_stop_region and stopped_now) U (highest_priority and intersection_is_clear))", 0, - # not self._enable_low_level_training_properties)) # not available in low-level training... 
- - #self._LTL_preconditions.append( - # LTLProperty("G ( not (in_intersection and highest_priority) )", - # self._penalty(self._reward_in_goal))) + self._LTL_preconditions.append( + LTLProperty("G ( not (in_intersection and highest_priority) )", + self._penalty(self._reward_in_goal))) - #self._LTL_preconditions.append( - # LTLProperty( - # "G ( in_stop_region U (highest_priority and intersection_is_clear) )", 150, self._enable_low_level_training_properties)) + self._LTL_preconditions.append( + LTLProperty( + "G ( in_stop_region U (highest_priority and intersection_is_clear) )", 150, self._enable_low_level_training_properties)) - #self._LTL_preconditions.append( - # LTLProperty( - # "G ( (lane and target_lane) or (not lane and not target_lane) )", - # 100, self._enable_low_level_training_properties)) + self._LTL_preconditions.append( + LTLProperty( + "G ( (lane and target_lane) or (not lane and not target_lane) )", + 100, self._enable_low_level_training_properties)) def _init_param(self): self._v_ref = 0 #if self._enable_low_level_training_properties else rd.speed_limit @@ -324,7 +317,7 @@ class ChangeLane(ManeuverBase): _violation_penalty_in_low_level_training = None - # high_level_extra_reward = -1000000 + high_level_extra_reward = -50 def _init_param(self): self._v_ref = rd.speed_limit @@ -332,13 +325,17 @@ class ChangeLane(ManeuverBase): def _init_LTL_preconditions(self): self._LTL_preconditions.append( - LTLProperty("G ( on_route and not over_speed_limit )", - self._violation_penalty_in_low_level_training, - self._enable_low_level_training_properties)) + LTLProperty("G ( on_route and not over_speed_limit )", + self._violation_penalty_in_low_level_training, + self._enable_low_level_training_properties)) self._LTL_preconditions.append( - LTLProperty("G ( not stopped_now )", - 100, self._enable_low_level_training_properties)) + LTLProperty("G ( not stopped_now )", + 100, self._enable_low_level_training_properties)) + + self._LTL_preconditions.append( + LTLProperty("G ( not in_intersection and not in_stop_region )", + None, not self._enable_low_level_training_properties)) # activated only for the high-level case. 
@property def goal_achieved(self): @@ -346,9 +343,9 @@ class ChangeLane(ManeuverBase): APs = self.env.ego.APs on_other_lane = APs['lane'] == self._target_lane achieved_y_displacement = np.sign(ego.y) * \ - (ego.y - rd.hlanes.centres[APs['target_lane']]) >= - self.min_y_distance + (ego.y - rd.hlanes.centres[APs['target_lane']]) >= - self.min_y_distance return on_other_lane and APs['on_route'] and \ - achieved_y_displacement and APs['parallel_to_lane'] + achieved_y_displacement and APs['parallel_to_lane'] def _low_level_manual_policy(self): return self.env.aggressive_driving_policy(EGO_INDEX) @@ -366,6 +363,7 @@ class ChangeLane(ManeuverBase): self._reward_in_goal = 200 self._violation_penalty_in_low_level_training = 150 self._enable_low_level_training_properties = True + self._extra_action_weights_flag = True def generate_validation_scenario(self): self.generate_scenario( @@ -396,20 +394,17 @@ class Follow(ManeuverBase): LTLProperty("G ( veh_ahead )", self._penalty_for_out_of_range)) self._LTL_preconditions.append( - LTLProperty( - "G ( (lane and target_lane) or (not lane and not target_lane) )", - self._penalty_for_change_lane)) + LTLProperty( + "G ( (lane and target_lane) or (not lane and not target_lane) )", + self._penalty_for_change_lane)) - #self._LTL_preconditions.append( + # self._LTL_preconditions.append( # LTLProperty("G ( not stopped_now U veh_ahead_stopped_now)", 200, # self._enable_low_level_training_properties)) self._LTL_preconditions.append( - LTLProperty("G ( not veh_ahead_too_close )", self._penalty_for_out_of_range, - self._enable_low_level_training_properties)) - - self._LTL_preconditions.append( - LTLProperty("G ( not in_stop_region)", 0, not self._enable_low_level_training_properties)) + LTLProperty("G ( not veh_ahead_too_close )", self._penalty_for_out_of_range, + self._enable_low_level_training_properties)) def generate_learning_scenario(self): self.generate_scenario( @@ -422,6 +417,7 @@ class Follow(ManeuverBase): self._penalty_for_out_of_range = 200 self._penalty_for_change_lane = 170 self._enable_low_level_training_properties = True + self._extra_action_weights_flag = True def generate_validation_scenario(self): self.generate_learning_scenario() @@ -434,12 +430,12 @@ class Follow(ManeuverBase): def _set_v_ref(self): #if self._enable_low_level_training_properties: - self._target_veh_i, _ = self.env.get_V2V_distance() + self._target_veh_i, _ = self.env.get_V2V_distance() - if self._target_veh_i is not None: - self._v_ref = self.env.vehs[self._target_veh_i].v - else: - self._v_ref = 0 + if self._target_veh_i is not None: + self._v_ref = self.env.vehs[self._target_veh_i].v + else: + self._v_ref = 0 #else: # self._v_ref = rd.speed_limit @@ -448,19 +444,17 @@ class Follow(ManeuverBase): @property def extra_termination_condition(self): - APs = self.env.ego.APs + # APs = self.env.ego.APs if self._target_veh_i is None: return False - elif not self._enable_low_level_training_properties: # activated only for the high-level training. - if (APs['in_stop_region'] or APs['before_but_close_to_stop_region']) \ - and (self.env.vehs[self._target_veh_i].APs['in_intersection'] or - self.env.vehs[self._target_veh_i].x > 0): - return True - else: - return False - + #elif not self._enable_low_level_training_properties: # activated only for the high-level training. 
+ # if (APs['in_stop_region'] or APs['before_but_close_to_stop_region']) \ + # and (self.env.vehs[self._target_veh_i].APs['in_intersection'] or + # self.env.vehs[self._target_veh_i].x > 0): + # return True + # else: return False def _features_dim_reduction(self, features_tuple): -- GitLab
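Taken together, the maneuver edits above mostly re-tune LTLProperty penalties: a property constructed with a numeric penalty subtracts that amount from the reward when violated, whereas a property whose penalty is None (as in the new Wait and ChangeLane preconditions) appears to only flag the violation, gating the option without injecting a large reward term. A minimal sketch of that distinction, assuming each evaluated property reduces to a (violated, penalty) pair; apply_ltl_outcomes is an illustrative helper, not a function from this repository.

    def apply_ltl_outcomes(base_reward, ltl_outcomes):
        """Combine a step reward with evaluated LTL precondition outcomes.

        ltl_outcomes: iterable of (violated, penalty) pairs, where penalty
        may be None.  A None penalty marks the violation (e.g. to make the
        option unavailable or to terminate) but contributes nothing to the
        reward, avoiding spurious reward spikes.
        """
        reward = base_reward
        violation = False
        for violated, penalty in ltl_outcomes:
            if violated:
                violation = True
                if penalty is not None:
                    reward -= penalty
        return reward, violation

    # Example: a violated property with penalty 150 costs 150 reward,
    # while a violated property with penalty None only sets the flag.
    print(apply_ltl_outcomes(200, [(True, 150), (True, None)]))  # -> (50, True)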