Skip to content
Snippets Groups Projects
Commit 7dc47bce authored by Jae Young Lee's avatar Jae Young Lee
Browse files

Fix the bug of having -100000 or 10000 rewards sometime.

parent 4a9327bd
No related branches found
No related tags found
No related merge requests found
...@@ -65,7 +65,7 @@ class ControllerBase(PolicyBase): ...@@ -65,7 +65,7 @@ class ControllerBase(PolicyBase):
total_reward += self.current_node.high_level_extra_reward total_reward += self.current_node.high_level_extra_reward
# TODO for info # TODO for info
return observation, total_reward, self.env.termination_condition, info return observation, total_reward, terminal, info
# TODO: Looks generic. Move to an intermediate class/highlevel manager so that base class can be clean # TODO: Looks generic. Move to an intermediate class/highlevel manager so that base class can be clean
''' Executes one step of current node. Sets node_terminal_state_reached flag if node termination condition ''' Executes one step of current node. Sets node_terminal_state_reached flag if node termination condition
...@@ -76,9 +76,7 @@ class ControllerBase(PolicyBase): ...@@ -76,9 +76,7 @@ class ControllerBase(PolicyBase):
def low_level_step_current_node(self): def low_level_step_current_node(self):
u_ego = self.current_node.low_level_policy( u_ego = self.current_node.low_level_policy(self.current_node.get_reduced_features_tuple())
self.current_node.get_reduced_features_tuple())
feature, R, terminal, info = self.current_node.step(u_ego) feature, R, terminal, info = self.current_node.step(u_ego)
self.node_terminal_state_reached = terminal self.node_terminal_state_reached = terminal
return self.env.get_features_tuple( return self.env.get_features_tuple(), R, self.env.termination_condition, info
), R, self.env.termination_condition, info
...@@ -317,6 +317,7 @@ class DQNLearner(LearnerBase): ...@@ -317,6 +317,7 @@ class DQNLearner(LearnerBase):
nb_steps_warmup=self.nb_steps_warmup, nb_steps_warmup=self.nb_steps_warmup,
target_model_update=self.target_model_update, target_model_update=self.target_model_update,
policy=policy, policy=policy,
test_policy=test_policy,
enable_dueling_network=True) enable_dueling_network=True)
agent.compile(Adam(lr=self.lr), metrics=['mae']) agent.compile(Adam(lr=self.lr), metrics=['mae'])
...@@ -327,6 +328,8 @@ class DQNLearner(LearnerBase): ...@@ -327,6 +328,8 @@ class DQNLearner(LearnerBase):
env, env,
nb_steps=1000000, nb_steps=1000000,
visualize=False, visualize=False,
verbose=1,
log_interval=10000,
nb_max_episode_steps=200, nb_max_episode_steps=200,
tensorboard=False, tensorboard=False,
model_checkpoints=False, model_checkpoints=False,
...@@ -346,7 +349,8 @@ class DQNLearner(LearnerBase): ...@@ -346,7 +349,8 @@ class DQNLearner(LearnerBase):
env, env,
nb_steps=nb_steps, nb_steps=nb_steps,
visualize=visualize, visualize=visualize,
verbose=1, verbose=verbose,
log_interval=log_interval,
nb_max_episode_steps=nb_max_episode_steps, nb_max_episode_steps=nb_max_episode_steps,
callbacks=callbacks) callbacks=callbacks)
...@@ -479,21 +483,13 @@ class RestrictedGreedyQPolicy(GreedyQPolicy): ...@@ -479,21 +483,13 @@ class RestrictedGreedyQPolicy(GreedyQPolicy):
Selection action Selection action
""" """
assert q_values.ndim == 1 assert q_values.ndim == 1
nb_actions = q_values.shape[0]
restricted_q_values = list()
for i in range(0, nb_actions):
if q_values[i] != -np.inf:
restricted_q_values.append(q_values[i])
# every q_value is -np.inf (this sometimes inevitably happens within the fit and test functions if np.max(q_values) == - np.inf:
# of kerasrl at the terminal stage as they force to call forward in Kerasrl-learner which calls this function. # every q_value is -np.inf, we choose action = 0
# In this case, we choose a policy randomly. action = 0
if len(restricted_q_values) < 1:
action = np.random.random_integers(0, nb_actions - 1)
else: else:
action = np.argmax(restricted_q_values) action = np.argmax(q_values)
return action return action
...@@ -521,6 +517,7 @@ class DQNAgentOverOptions(DQNAgent): ...@@ -521,6 +517,7 @@ class DQNAgentOverOptions(DQNAgent):
def __get_invalid_node_indices(self): def __get_invalid_node_indices(self):
"""Returns a list of option indices that are invalid according to """Returns a list of option indices that are invalid according to
initiation conditions.""" initiation conditions."""
invalid_node_indices = list() invalid_node_indices = list()
for index, option_alias in enumerate(self.low_level_policy_aliases): for index, option_alias in enumerate(self.low_level_policy_aliases):
# TODO: Locate reset_maneuver to another place as this is a "get" function. # TODO: Locate reset_maneuver to another place as this is a "get" function.
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
"wait": "Wait", "wait": "Wait",
"follow": "Follow", "follow": "Follow",
"stop": "Stop", "stop": "Stop",
"immediatestop": "ImmediateStop", "changelane": "ChangeLane",
"keeplane": "KeepLane" "keeplane": "KeepLane"
}, },
......
...@@ -157,7 +157,7 @@ class EpisodicEnvBase(GymCompliantEnvBase): ...@@ -157,7 +157,7 @@ class EpisodicEnvBase(GymCompliantEnvBase):
if self._terminate_in_goal and self.goal_achieved: if self._terminate_in_goal and self.goal_achieved:
return True return True
return self.violation_happened and self._LTL_preconditions_enable return self.violation_happened
@property @property
def goal_achieved(self): def goal_achieved(self):
......
...@@ -93,7 +93,7 @@ vlanes = Route( ...@@ -93,7 +93,7 @@ vlanes = Route(
[-vwidth - 5.0 - intersection_voffset, -vwidth - intersection_voffset], 35, [-vwidth - 5.0 - intersection_voffset, -vwidth - intersection_voffset], 35,
[-4.0, 4.0]) [-4.0, 4.0])
intersection_width = vlanes.n_lanes * vlanes.width intersection_width = vlanes.n_lanes * vlanes.width
intersection_height = hlanes.n_lanes * hlanes.width intersection_height = hlanes.n_lanes * hlanes.width
intersection_width_w_offset = intersection_width + 2 * intersection_hoffset intersection_width_w_offset = intersection_width + 2 * intersection_hoffset
......
...@@ -51,7 +51,9 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase): ...@@ -51,7 +51,9 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase):
#: The weight vector to calculate the cost. In the maneuver, cost_weights #: The weight vector to calculate the cost. In the maneuver, cost_weights
# can be set to a specific value which may be different than the default. # can be set to a specific value which may be different than the default.
cost_weights = (1.0, 0.25, 0.1, 1.0, 100.0, 0.1, 0.25, 0.1) # TODO: check _cost_weights in both here and ManeuverBase. The _cost_weights has to be substituted to here, but it doesn't sometimes.
_cost_weights = (10.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3,
100.0 * 1e-3, 0.1 * 1e-3, 0.05 * 1e-3, 0.1 * 1e-3)
#TODO: Move this to constants #TODO: Move this to constants
# The empirical min and max of each term in the cost vector, which is used to normalize the values # The empirical min and max of each term in the cost vector, which is used to normalize the values
......
...@@ -180,11 +180,6 @@ class OptionsGraph: ...@@ -180,11 +180,6 @@ class OptionsGraph:
print("\n Warning: the trained low-level policy of \"" + key + print("\n Warning: the trained low-level policy of \"" + key +
"\" does not exists; the manual policy will be used.\n") "\" does not exists; the manual policy will be used.\n")
# setting the cost weights for high-level policy training.
# TODO: this shouldn't be initialized here, but within ManeuverBase class (e.g. make some flag indicating high-level training and set the weights)...
maneuver._cost_weights = (100.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3,
100.0 * 1e-3, 0.1 * 1e-3, 0.25 * 1e-3, 0.1 * 1e-3)
if self.config["method"] == "mcts": if self.config["method"] == "mcts":
maneuver.timeout = np.inf maneuver.timeout = np.inf
......
...@@ -30,7 +30,7 @@ class ManeuverBase(EpisodicEnvBase): ...@@ -30,7 +30,7 @@ class ManeuverBase(EpisodicEnvBase):
# as a negative reward, so a cost will be summed up to the reward # as a negative reward, so a cost will be summed up to the reward
# with subtraction. # with subtraction.
_cost_weights = (10.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3, _cost_weights = (10.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3,
100.0 * 1e-3, 0.1 * 1e-3, 0.25 * 1e-3, 0.1 * 1e-3) 100.0 * 1e-3, 0.1 * 1e-3, 0.05 * 1e-3, 0.1 * 1e-3)
_extra_r_terminal = None _extra_r_terminal = None
_extra_r_on_timeout = None _extra_r_on_timeout = None
...@@ -38,7 +38,7 @@ class ManeuverBase(EpisodicEnvBase): ...@@ -38,7 +38,7 @@ class ManeuverBase(EpisodicEnvBase):
#: the flag being False when _cost_weights is used without #: the flag being False when _cost_weights is used without
# modification; If True, then the action parts of _cost_weights # modification; If True, then the action parts of _cost_weights
# are increased for some edge cases (see the step method). # are increased for some edge cases (see the step method).
_extra_action_weights_flag = True _extra_action_weights_flag = False
#: the extra weights on the actions added to _cost_weights #: the extra weights on the actions added to _cost_weights
# for some edge cases when _extra_action_weights_flag = True. # for some edge cases when _extra_action_weights_flag = True.
......
...@@ -15,11 +15,10 @@ class KeepLane(ManeuverBase): ...@@ -15,11 +15,10 @@ class KeepLane(ManeuverBase):
self._target_lane = self.env.ego.APs['lane'] self._target_lane = self.env.ego.APs['lane']
def _init_LTL_preconditions(self): def _init_LTL_preconditions(self):
self._LTL_preconditions.append(LTLProperty("G ( not veh_ahead )", 0))
#self._LTL_preconditions.append( self._LTL_preconditions.append(
# LTLProperty("G ( not stopped_now )", 100, LTLProperty("G ( not stopped_now )", 100,
# self._enable_low_level_training_properties)) self._enable_low_level_training_properties))
self._LTL_preconditions.append( self._LTL_preconditions.append(
LTLProperty( LTLProperty(
...@@ -37,6 +36,7 @@ class KeepLane(ManeuverBase): ...@@ -37,6 +36,7 @@ class KeepLane(ManeuverBase):
self.env._terminate_in_goal = False self.env._terminate_in_goal = False
self.env._reward_in_goal = None self.env._reward_in_goal = None
self._enable_low_level_training_properties = True self._enable_low_level_training_properties = True
self._extra_action_weights_flag = True
def generate_validation_scenario(self): def generate_validation_scenario(self):
self.generate_scenario( self.generate_scenario(
...@@ -65,7 +65,7 @@ class KeepLane(ManeuverBase): ...@@ -65,7 +65,7 @@ class KeepLane(ManeuverBase):
return False return False
class ImmediateStop(ManeuverBase): class Halt(ManeuverBase):
_terminate_in_goal = True _terminate_in_goal = True
_reward_in_goal = None _reward_in_goal = None
...@@ -102,6 +102,7 @@ class ImmediateStop(ManeuverBase): ...@@ -102,6 +102,7 @@ class ImmediateStop(ManeuverBase):
self.env._reward_in_goal = None self.env._reward_in_goal = None
self._reward_in_goal = 200 self._reward_in_goal = 200
self._enable_low_level_training_properties = True self._enable_low_level_training_properties = True
self._extra_action_weights_flag = True
def generate_validation_scenario(self): def generate_validation_scenario(self):
self._ego_pos_range = (rd.hlanes.start_pos, rd.hlanes.end_pos) self._ego_pos_range = (rd.hlanes.start_pos, rd.hlanes.end_pos)
...@@ -147,14 +148,11 @@ class Stop(ManeuverBase): ...@@ -147,14 +148,11 @@ class Stop(ManeuverBase):
self._target_lane = self.env.ego.APs['lane'] self._target_lane = self.env.ego.APs['lane']
def _init_LTL_preconditions(self): def _init_LTL_preconditions(self):
#self._LTL_preconditions.append(
# LTLProperty("G ( not has_stopped_in_stop_region )",
# self._penalty(self._reward_in_goal)))
self._LTL_preconditions.append( self._LTL_preconditions.append(
LTLProperty("G ( not has_stopped_in_stop_region )", -150, LTLProperty("G ( not has_stopped_in_stop_region )",
not self._enable_low_level_training_properties)) self._penalty(self._reward_in_goal), not self._enable_low_level_training_properties))
# before_intersection rather than "before_but_close_to_stop_region or in_stop_region"?
self._LTL_preconditions.append( self._LTL_preconditions.append(
LTLProperty( LTLProperty(
"G ( (before_but_close_to_stop_region or in_stop_region) U has_stopped_in_stop_region )", "G ( (before_but_close_to_stop_region or in_stop_region) U has_stopped_in_stop_region )",
...@@ -191,6 +189,7 @@ class Stop(ManeuverBase): ...@@ -191,6 +189,7 @@ class Stop(ManeuverBase):
self._reward_in_goal = 200 self._reward_in_goal = 200
self._penalty_in_violation = 150 self._penalty_in_violation = 150
self._enable_low_level_training_properties = True self._enable_low_level_training_properties = True
self._extra_action_weights_flag = True
def _low_level_manual_policy(self): def _low_level_manual_policy(self):
return self.env.aggressive_driving_policy(EGO_INDEX) return self.env.aggressive_driving_policy(EGO_INDEX)
...@@ -234,27 +233,21 @@ class Wait(ManeuverBase): ...@@ -234,27 +233,21 @@ class Wait(ManeuverBase):
def _init_LTL_preconditions(self): def _init_LTL_preconditions(self):
self._LTL_preconditions.append( self._LTL_preconditions.append(
LTLProperty("G ( (in_stop_region and stopped_now) and not (highest_priority and intersection_is_clear))",
None, not self._enable_low_level_training_properties)) # not available in low-level training...
LTLProperty( self._LTL_preconditions.append(
"G ( (in_stop_region and stopped_now) and not (highest_priority and intersection_is_clear))", 0, LTLProperty("G ( not (in_intersection and highest_priority) )",
not self._enable_low_level_training_properties)) # not available in low-level training... self._penalty(self._reward_in_goal)))
#LTLProperty(
# "G ( (in_stop_region and stopped_now) U (highest_priority and intersection_is_clear))", 0,
# not self._enable_low_level_training_properties)) # not available in low-level training...
#self._LTL_preconditions.append(
# LTLProperty("G ( not (in_intersection and highest_priority) )",
# self._penalty(self._reward_in_goal)))
#self._LTL_preconditions.append( self._LTL_preconditions.append(
# LTLProperty( LTLProperty(
# "G ( in_stop_region U (highest_priority and intersection_is_clear) )", 150, self._enable_low_level_training_properties)) "G ( in_stop_region U (highest_priority and intersection_is_clear) )", 150, self._enable_low_level_training_properties))
#self._LTL_preconditions.append( self._LTL_preconditions.append(
# LTLProperty( LTLProperty(
# "G ( (lane and target_lane) or (not lane and not target_lane) )", "G ( (lane and target_lane) or (not lane and not target_lane) )",
# 100, self._enable_low_level_training_properties)) 100, self._enable_low_level_training_properties))
def _init_param(self): def _init_param(self):
self._v_ref = 0 #if self._enable_low_level_training_properties else rd.speed_limit self._v_ref = 0 #if self._enable_low_level_training_properties else rd.speed_limit
...@@ -324,7 +317,7 @@ class ChangeLane(ManeuverBase): ...@@ -324,7 +317,7 @@ class ChangeLane(ManeuverBase):
_violation_penalty_in_low_level_training = None _violation_penalty_in_low_level_training = None
# high_level_extra_reward = -1000000 high_level_extra_reward = -50
def _init_param(self): def _init_param(self):
self._v_ref = rd.speed_limit self._v_ref = rd.speed_limit
...@@ -332,13 +325,17 @@ class ChangeLane(ManeuverBase): ...@@ -332,13 +325,17 @@ class ChangeLane(ManeuverBase):
def _init_LTL_preconditions(self): def _init_LTL_preconditions(self):
self._LTL_preconditions.append( self._LTL_preconditions.append(
LTLProperty("G ( on_route and not over_speed_limit )", LTLProperty("G ( on_route and not over_speed_limit )",
self._violation_penalty_in_low_level_training, self._violation_penalty_in_low_level_training,
self._enable_low_level_training_properties)) self._enable_low_level_training_properties))
self._LTL_preconditions.append( self._LTL_preconditions.append(
LTLProperty("G ( not stopped_now )", LTLProperty("G ( not stopped_now )",
100, self._enable_low_level_training_properties)) 100, self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty("G ( not in_intersection and not in_stop_region )",
None, not self._enable_low_level_training_properties)) # activated only for the high-level case.
@property @property
def goal_achieved(self): def goal_achieved(self):
...@@ -346,9 +343,9 @@ class ChangeLane(ManeuverBase): ...@@ -346,9 +343,9 @@ class ChangeLane(ManeuverBase):
APs = self.env.ego.APs APs = self.env.ego.APs
on_other_lane = APs['lane'] == self._target_lane on_other_lane = APs['lane'] == self._target_lane
achieved_y_displacement = np.sign(ego.y) * \ achieved_y_displacement = np.sign(ego.y) * \
(ego.y - rd.hlanes.centres[APs['target_lane']]) >= - self.min_y_distance (ego.y - rd.hlanes.centres[APs['target_lane']]) >= - self.min_y_distance
return on_other_lane and APs['on_route'] and \ return on_other_lane and APs['on_route'] and \
achieved_y_displacement and APs['parallel_to_lane'] achieved_y_displacement and APs['parallel_to_lane']
def _low_level_manual_policy(self): def _low_level_manual_policy(self):
return self.env.aggressive_driving_policy(EGO_INDEX) return self.env.aggressive_driving_policy(EGO_INDEX)
...@@ -366,6 +363,7 @@ class ChangeLane(ManeuverBase): ...@@ -366,6 +363,7 @@ class ChangeLane(ManeuverBase):
self._reward_in_goal = 200 self._reward_in_goal = 200
self._violation_penalty_in_low_level_training = 150 self._violation_penalty_in_low_level_training = 150
self._enable_low_level_training_properties = True self._enable_low_level_training_properties = True
self._extra_action_weights_flag = True
def generate_validation_scenario(self): def generate_validation_scenario(self):
self.generate_scenario( self.generate_scenario(
...@@ -396,20 +394,17 @@ class Follow(ManeuverBase): ...@@ -396,20 +394,17 @@ class Follow(ManeuverBase):
LTLProperty("G ( veh_ahead )", self._penalty_for_out_of_range)) LTLProperty("G ( veh_ahead )", self._penalty_for_out_of_range))
self._LTL_preconditions.append( self._LTL_preconditions.append(
LTLProperty( LTLProperty(
"G ( (lane and target_lane) or (not lane and not target_lane) )", "G ( (lane and target_lane) or (not lane and not target_lane) )",
self._penalty_for_change_lane)) self._penalty_for_change_lane))
#self._LTL_preconditions.append( # self._LTL_preconditions.append(
# LTLProperty("G ( not stopped_now U veh_ahead_stopped_now)", 200, # LTLProperty("G ( not stopped_now U veh_ahead_stopped_now)", 200,
# self._enable_low_level_training_properties)) # self._enable_low_level_training_properties))
self._LTL_preconditions.append( self._LTL_preconditions.append(
LTLProperty("G ( not veh_ahead_too_close )", self._penalty_for_out_of_range, LTLProperty("G ( not veh_ahead_too_close )", self._penalty_for_out_of_range,
self._enable_low_level_training_properties)) self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty("G ( not in_stop_region)", 0, not self._enable_low_level_training_properties))
def generate_learning_scenario(self): def generate_learning_scenario(self):
self.generate_scenario( self.generate_scenario(
...@@ -422,6 +417,7 @@ class Follow(ManeuverBase): ...@@ -422,6 +417,7 @@ class Follow(ManeuverBase):
self._penalty_for_out_of_range = 200 self._penalty_for_out_of_range = 200
self._penalty_for_change_lane = 170 self._penalty_for_change_lane = 170
self._enable_low_level_training_properties = True self._enable_low_level_training_properties = True
self._extra_action_weights_flag = True
def generate_validation_scenario(self): def generate_validation_scenario(self):
self.generate_learning_scenario() self.generate_learning_scenario()
...@@ -434,12 +430,12 @@ class Follow(ManeuverBase): ...@@ -434,12 +430,12 @@ class Follow(ManeuverBase):
def _set_v_ref(self): def _set_v_ref(self):
#if self._enable_low_level_training_properties: #if self._enable_low_level_training_properties:
self._target_veh_i, _ = self.env.get_V2V_distance() self._target_veh_i, _ = self.env.get_V2V_distance()
if self._target_veh_i is not None: if self._target_veh_i is not None:
self._v_ref = self.env.vehs[self._target_veh_i].v self._v_ref = self.env.vehs[self._target_veh_i].v
else: else:
self._v_ref = 0 self._v_ref = 0
#else: #else:
# self._v_ref = rd.speed_limit # self._v_ref = rd.speed_limit
...@@ -448,19 +444,17 @@ class Follow(ManeuverBase): ...@@ -448,19 +444,17 @@ class Follow(ManeuverBase):
@property @property
def extra_termination_condition(self): def extra_termination_condition(self):
APs = self.env.ego.APs # APs = self.env.ego.APs
if self._target_veh_i is None: if self._target_veh_i is None:
return False return False
elif not self._enable_low_level_training_properties: # activated only for the high-level training. #elif not self._enable_low_level_training_properties: # activated only for the high-level training.
if (APs['in_stop_region'] or APs['before_but_close_to_stop_region']) \ # if (APs['in_stop_region'] or APs['before_but_close_to_stop_region']) \
and (self.env.vehs[self._target_veh_i].APs['in_intersection'] or # and (self.env.vehs[self._target_veh_i].APs['in_intersection'] or
self.env.vehs[self._target_veh_i].x > 0): # self.env.vehs[self._target_veh_i].x > 0):
return True # return True
else: # else:
return False
return False return False
def _features_dim_reduction(self, features_tuple): def _features_dim_reduction(self, features_tuple):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment