Commit 7dc47bce authored by Jae Young Lee

Fix the bug of sometimes having -100000 or 10000 rewards.

parent 4a9327bd
......@@ -65,7 +65,7 @@ class ControllerBase(PolicyBase):
total_reward += self.current_node.high_level_extra_reward
# TODO for info
- return observation, total_reward, self.env.termination_condition, info
+ return observation, total_reward, terminal, info
# TODO: Looks generic. Move to an intermediate class/highlevel manager so that base class can be clean
''' Executes one step of current node. Sets node_terminal_state_reached flag if node termination condition
......@@ -76,9 +76,7 @@ class ControllerBase(PolicyBase):
def low_level_step_current_node(self):
- u_ego = self.current_node.low_level_policy(
-     self.current_node.get_reduced_features_tuple())
+ u_ego = self.current_node.low_level_policy(self.current_node.get_reduced_features_tuple())
feature, R, terminal, info = self.current_node.step(u_ego)
self.node_terminal_state_reached = terminal
- return self.env.get_features_tuple(
-     ), R, self.env.termination_condition, info
+ return self.env.get_features_tuple(), R, self.env.termination_condition, info
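For reference, the post-change method assembled from the added lines above reads as follows (indentation is assumed, since the diff view strips it):

def low_level_step_current_node(self):
    # Run the current node's low-level policy on its reduced feature tuple.
    u_ego = self.current_node.low_level_policy(self.current_node.get_reduced_features_tuple())
    feature, R, terminal, info = self.current_node.step(u_ego)
    self.node_terminal_state_reached = terminal
    # Return the env-level features and the env-level termination condition, not the node's.
    return self.env.get_features_tuple(), R, self.env.termination_condition, info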
......@@ -317,6 +317,7 @@ class DQNLearner(LearnerBase):
nb_steps_warmup=self.nb_steps_warmup,
target_model_update=self.target_model_update,
policy=policy,
+ test_policy=test_policy,
enable_dueling_network=True)
agent.compile(Adam(lr=self.lr), metrics=['mae'])
......@@ -327,6 +328,8 @@ class DQNLearner(LearnerBase):
env,
nb_steps=1000000,
visualize=False,
+ verbose=1,
+ log_interval=10000,
nb_max_episode_steps=200,
tensorboard=False,
model_checkpoints=False,
......@@ -346,7 +349,8 @@ class DQNLearner(LearnerBase):
env,
nb_steps=nb_steps,
visualize=visualize,
- verbose=1,
+ verbose=verbose,
+ log_interval=log_interval,
nb_max_episode_steps=nb_max_episode_steps,
callbacks=callbacks)
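The new test_policy, verbose, and log_interval arguments are forwarded to keras-rl. A minimal standalone sketch of that keras-rl API (model, nb_actions, and env below are placeholders, not objects from this repository):

from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy, GreedyQPolicy

# policy drives exploration during fit(); test_policy is used by test().
agent = DQNAgent(model=model, nb_actions=nb_actions,
                 memory=SequentialMemory(limit=50000, window_length=1),
                 nb_steps_warmup=1000, target_model_update=1e-2,
                 policy=EpsGreedyQPolicy(), test_policy=GreedyQPolicy(),
                 enable_dueling_network=True)
agent.compile(Adam(lr=1e-3), metrics=['mae'])

# verbose=1 prints one training summary per log_interval steps; both are now configurable.
agent.fit(env, nb_steps=1000000, visualize=False, verbose=1,
          log_interval=10000, nb_max_episode_steps=200)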
......@@ -479,21 +483,13 @@ class RestrictedGreedyQPolicy(GreedyQPolicy):
Selection action
"""
assert q_values.ndim == 1
nb_actions = q_values.shape[0]
- restricted_q_values = list()
- for i in range(0, nb_actions):
-     if q_values[i] != -np.inf:
-         restricted_q_values.append(q_values[i])
- # every q_value is -np.inf (this sometimes inevitably happens within the fit and test functions
- # of kerasrl at the terminal stage as they force to call forward in Kerasrl-learner which calls this function.
- # In this case, we choose a policy randomly.
- if len(restricted_q_values) < 1:
-     action = np.random.random_integers(0, nb_actions - 1)
+ if np.max(q_values) == - np.inf:
+     # every q_value is -np.inf, we choose action = 0
+     action = 0
else:
-     action = np.argmax(restricted_q_values)
+     action = np.argmax(q_values)
return action
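Assembled from the added lines, the simplified selection rule is roughly the following (a sketch of the method body only; the enclosing class and docstring are unchanged):

import numpy as np

def select_action(self, q_values):
    """Pick the greedy action, falling back to action 0 when every Q-value
    is -np.inf (keras-rl still calls forward at terminal steps)."""
    assert q_values.ndim == 1
    if np.max(q_values) == -np.inf:
        # All actions are masked out; return a fixed action instead of a random one.
        action = 0
    else:
        action = np.argmax(q_values)
    return action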
......@@ -521,6 +517,7 @@ class DQNAgentOverOptions(DQNAgent):
def __get_invalid_node_indices(self):
"""Returns a list of option indices that are invalid according to
initiation conditions."""
invalid_node_indices = list()
for index, option_alias in enumerate(self.low_level_policy_aliases):
# TODO: Locate reset_maneuver to another place as this is a "get" function.
......
......@@ -3,7 +3,7 @@
"wait": "Wait",
"follow": "Follow",
"stop": "Stop",
"immediatestop": "ImmediateStop",
"changelane": "ChangeLane",
"keeplane": "KeepLane"
},
......
......@@ -157,7 +157,7 @@ class EpisodicEnvBase(GymCompliantEnvBase):
if self._terminate_in_goal and self.goal_achieved:
return True
- return self.violation_happened and self._LTL_preconditions_enable
+ return self.violation_happened
@property
def goal_achieved(self):
......
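After this change the violation check no longer depends on _LTL_preconditions_enable. A sketch of the resulting condition, assuming the hunk belongs to the env's termination property (the property name is inferred from the controller code above and is not shown in this hunk):

@property
def termination_condition(self):
    # Terminate on reaching the goal (if configured to) or on any LTL violation,
    # regardless of whether LTL preconditions are currently enabled.
    if self._terminate_in_goal and self.goal_achieved:
        return True
    return self.violation_happened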
......@@ -93,7 +93,7 @@ vlanes = Route(
[-vwidth - 5.0 - intersection_voffset, -vwidth - intersection_voffset], 35,
[-4.0, 4.0])
- intersection_width = vlanes.n_lanes * vlanes.width
+ intersection_width = vlanes.n_lanes * vlanes.width
intersection_height = hlanes.n_lanes * hlanes.width
intersection_width_w_offset = intersection_width + 2 * intersection_hoffset
......
......@@ -51,7 +51,9 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase):
#: The weight vector to calculate the cost. In the maneuver, cost_weights
# can be set to a specific value which may be different than the default.
- cost_weights = (1.0, 0.25, 0.1, 1.0, 100.0, 0.1, 0.25, 0.1)
+ # TODO: check _cost_weights in both here and ManeuverBase. The _cost_weights has to be substituted to here, but it doesn't sometimes.
+ _cost_weights = (10.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3,
+                  100.0 * 1e-3, 0.1 * 1e-3, 0.05 * 1e-3, 0.1 * 1e-3)
#TODO: Move this to constants
# The empirical min and max of each term in the cost vector, which is used to normalize the values
......
......@@ -180,11 +180,6 @@ class OptionsGraph:
print("\n Warning: the trained low-level policy of \"" + key +
"\" does not exists; the manual policy will be used.\n")
- # setting the cost weights for high-level policy training.
- # TODO: this shouldn't be initialized here, but within ManeuverBase class (e.g. make some flag indicating high-level training and set the weights)...
- maneuver._cost_weights = (100.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3,
-                           100.0 * 1e-3, 0.1 * 1e-3, 0.25 * 1e-3, 0.1 * 1e-3)
if self.config["method"] == "mcts":
maneuver.timeout = np.inf
......
......@@ -30,7 +30,7 @@ class ManeuverBase(EpisodicEnvBase):
# as a negative reward, so a cost will be summed up to the reward
# with subtraction.
_cost_weights = (10.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3,
-                  100.0 * 1e-3, 0.1 * 1e-3, 0.25 * 1e-3, 0.1 * 1e-3)
+                  100.0 * 1e-3, 0.1 * 1e-3, 0.05 * 1e-3, 0.1 * 1e-3)
_extra_r_terminal = None
_extra_r_on_timeout = None
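The comment above says each cost term enters the reward negatively. A minimal sketch of that weighted-cost pattern, assuming a per-step cost vector of the same length as _cost_weights (the shaped_reward helper and cost_terms name are illustrative, not this repository's code):

import numpy as np

_cost_weights = (10.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3,
                 100.0 * 1e-3, 0.1 * 1e-3, 0.05 * 1e-3, 0.1 * 1e-3)

def shaped_reward(base_reward, cost_terms, weights=_cost_weights):
    # Weighted costs are subtracted from the raw reward; the 1e-3 scale keeps
    # them small relative to terminal rewards such as +200.
    return base_reward - float(np.dot(weights, cost_terms))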
......@@ -38,7 +38,7 @@ class ManeuverBase(EpisodicEnvBase):
#: the flag being False when _cost_weights is used without
# modification; If True, then the action parts of _cost_weights
# are increased for some edge cases (see the step method).
- _extra_action_weights_flag = True
+ _extra_action_weights_flag = False
#: the extra weights on the actions added to _cost_weights
# for some edge cases when _extra_action_weights_flag = True.
......
......@@ -15,11 +15,10 @@ class KeepLane(ManeuverBase):
self._target_lane = self.env.ego.APs['lane']
def _init_LTL_preconditions(self):
self._LTL_preconditions.append(LTLProperty("G ( not veh_ahead )", 0))
- #self._LTL_preconditions.append(
- #    LTLProperty("G ( not stopped_now )", 100,
- #                self._enable_low_level_training_properties))
+ self._LTL_preconditions.append(
+     LTLProperty("G ( not stopped_now )", 100,
+                 self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty(
......@@ -37,6 +36,7 @@ class KeepLane(ManeuverBase):
self.env._terminate_in_goal = False
self.env._reward_in_goal = None
self._enable_low_level_training_properties = True
+ self._extra_action_weights_flag = True
def generate_validation_scenario(self):
self.generate_scenario(
......@@ -65,7 +65,7 @@ class KeepLane(ManeuverBase):
return False
- class ImmediateStop(ManeuverBase):
+ class Halt(ManeuverBase):
_terminate_in_goal = True
_reward_in_goal = None
......@@ -102,6 +102,7 @@ class ImmediateStop(ManeuverBase):
self.env._reward_in_goal = None
self._reward_in_goal = 200
self._enable_low_level_training_properties = True
+ self._extra_action_weights_flag = True
def generate_validation_scenario(self):
self._ego_pos_range = (rd.hlanes.start_pos, rd.hlanes.end_pos)
......@@ -147,14 +148,11 @@ class Stop(ManeuverBase):
self._target_lane = self.env.ego.APs['lane']
def _init_LTL_preconditions(self):
- #self._LTL_preconditions.append(
- #    LTLProperty("G ( not has_stopped_in_stop_region )",
- #                self._penalty(self._reward_in_goal)))
self._LTL_preconditions.append(
-     LTLProperty("G ( not has_stopped_in_stop_region )", -150,
-                 not self._enable_low_level_training_properties))
+     LTLProperty("G ( not has_stopped_in_stop_region )",
+                 self._penalty(self._reward_in_goal), not self._enable_low_level_training_properties))
# before_intersection rather than "before_but_close_to_stop_region or in_stop_region"?
self._LTL_preconditions.append(
LTLProperty(
"G ( (before_but_close_to_stop_region or in_stop_region) U has_stopped_in_stop_region )",
......@@ -191,6 +189,7 @@ class Stop(ManeuverBase):
self._reward_in_goal = 200
self._penalty_in_violation = 150
self._enable_low_level_training_properties = True
+ self._extra_action_weights_flag = True
def _low_level_manual_policy(self):
return self.env.aggressive_driving_policy(EGO_INDEX)
......@@ -234,27 +233,21 @@ class Wait(ManeuverBase):
def _init_LTL_preconditions(self):
self._LTL_preconditions.append(
LTLProperty("G ( (in_stop_region and stopped_now) and not (highest_priority and intersection_is_clear))",
None, not self._enable_low_level_training_properties)) # not available in low-level training...
LTLProperty(
"G ( (in_stop_region and stopped_now) and not (highest_priority and intersection_is_clear))", 0,
not self._enable_low_level_training_properties)) # not available in low-level training...
#LTLProperty(
# "G ( (in_stop_region and stopped_now) U (highest_priority and intersection_is_clear))", 0,
# not self._enable_low_level_training_properties)) # not available in low-level training...
- #self._LTL_preconditions.append(
- #    LTLProperty("G ( not (in_intersection and highest_priority) )",
- #                self._penalty(self._reward_in_goal)))
self._LTL_preconditions.append(
LTLProperty("G ( not (in_intersection and highest_priority) )",
self._penalty(self._reward_in_goal)))
- #self._LTL_preconditions.append(
- #    LTLProperty(
- #        "G ( in_stop_region U (highest_priority and intersection_is_clear) )", 150, self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty(
"G ( in_stop_region U (highest_priority and intersection_is_clear) )", 150, self._enable_low_level_training_properties))
- #self._LTL_preconditions.append(
- #    LTLProperty(
- #        "G ( (lane and target_lane) or (not lane and not target_lane) )",
- #        100, self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty(
"G ( (lane and target_lane) or (not lane and not target_lane) )",
100, self._enable_low_level_training_properties))
def _init_param(self):
self._v_ref = 0 #if self._enable_low_level_training_properties else rd.speed_limit
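Across these maneuvers, preconditions are registered as LTLProperty(formula, penalty, enabled), and the fix consistently replaces None or ad-hoc penalties with explicit numeric ones. A short sketch of that registration pattern as it reads after the change, assembled from the active lines above (illustrative only; indentation assumed):

def _init_LTL_preconditions(self):
    # The availability condition now carries penalty 0 instead of None.
    self._LTL_preconditions.append(
        LTLProperty(
            "G ( (in_stop_region and stopped_now) and not (highest_priority and intersection_is_clear))",
            0, not self._enable_low_level_training_properties))
    # Numeric penalty for a property active during low-level training.
    self._LTL_preconditions.append(
        LTLProperty(
            "G ( in_stop_region U (highest_priority and intersection_is_clear) )",
            150, self._enable_low_level_training_properties))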
......@@ -324,7 +317,7 @@ class ChangeLane(ManeuverBase):
_violation_penalty_in_low_level_training = None
- # high_level_extra_reward = -1000000
+ high_level_extra_reward = -50
def _init_param(self):
self._v_ref = rd.speed_limit
......@@ -332,13 +325,17 @@ class ChangeLane(ManeuverBase):
def _init_LTL_preconditions(self):
self._LTL_preconditions.append(
LTLProperty("G ( on_route and not over_speed_limit )",
self._violation_penalty_in_low_level_training,
self._enable_low_level_training_properties))
LTLProperty("G ( on_route and not over_speed_limit )",
self._violation_penalty_in_low_level_training,
self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty("G ( not stopped_now )",
100, self._enable_low_level_training_properties))
LTLProperty("G ( not stopped_now )",
100, self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty("G ( not in_intersection and not in_stop_region )",
None, not self._enable_low_level_training_properties)) # activated only for the high-level case.
@property
def goal_achieved(self):
......@@ -346,9 +343,9 @@ class ChangeLane(ManeuverBase):
APs = self.env.ego.APs
on_other_lane = APs['lane'] == self._target_lane
achieved_y_displacement = np.sign(ego.y) * \
-     (ego.y - rd.hlanes.centres[APs['target_lane']]) >= - self.min_y_distance
+     (ego.y - rd.hlanes.centres[APs['target_lane']]) >= - self.min_y_distance
return on_other_lane and APs['on_route'] and \
-     achieved_y_displacement and APs['parallel_to_lane']
+     achieved_y_displacement and APs['parallel_to_lane']
def _low_level_manual_policy(self):
return self.env.aggressive_driving_policy(EGO_INDEX)
......@@ -366,6 +363,7 @@ class ChangeLane(ManeuverBase):
self._reward_in_goal = 200
self._violation_penalty_in_low_level_training = 150
self._enable_low_level_training_properties = True
+ self._extra_action_weights_flag = True
def generate_validation_scenario(self):
self.generate_scenario(
......@@ -396,20 +394,17 @@ class Follow(ManeuverBase):
LTLProperty("G ( veh_ahead )", self._penalty_for_out_of_range))
self._LTL_preconditions.append(
-     LTLProperty(
-         "G ( (lane and target_lane) or (not lane and not target_lane) )",
-         self._penalty_for_change_lane))
+     LTLProperty(
+         "G ( (lane and target_lane) or (not lane and not target_lane) )",
+         self._penalty_for_change_lane))
- #self._LTL_preconditions.append(
+ # self._LTL_preconditions.append(
# LTLProperty("G ( not stopped_now U veh_ahead_stopped_now)", 200,
# self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty("G ( not veh_ahead_too_close )", self._penalty_for_out_of_range,
self._enable_low_level_training_properties))
self._LTL_preconditions.append(
LTLProperty("G ( not in_stop_region)", 0, not self._enable_low_level_training_properties))
LTLProperty("G ( not veh_ahead_too_close )", self._penalty_for_out_of_range,
self._enable_low_level_training_properties))
def generate_learning_scenario(self):
self.generate_scenario(
......@@ -422,6 +417,7 @@ class Follow(ManeuverBase):
self._penalty_for_out_of_range = 200
self._penalty_for_change_lane = 170
self._enable_low_level_training_properties = True
+ self._extra_action_weights_flag = True
def generate_validation_scenario(self):
self.generate_learning_scenario()
......@@ -434,12 +430,12 @@ class Follow(ManeuverBase):
def _set_v_ref(self):
#if self._enable_low_level_training_properties:
- self._target_veh_i, _ = self.env.get_V2V_distance()
+ self._target_veh_i, _ = self.env.get_V2V_distance()
- if self._target_veh_i is not None:
-     self._v_ref = self.env.vehs[self._target_veh_i].v
- else:
-     self._v_ref = 0
+ if self._target_veh_i is not None:
+     self._v_ref = self.env.vehs[self._target_veh_i].v
+ else:
+     self._v_ref = 0
#else:
# self._v_ref = rd.speed_limit
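After the re-indentation, the reference-speed update is no longer nested under the commented-out training check; assembled from the added lines above (indentation assumed):

def _set_v_ref(self):
    # Follow the detected lead vehicle's speed; otherwise hold a zero reference speed.
    self._target_veh_i, _ = self.env.get_V2V_distance()
    if self._target_veh_i is not None:
        self._v_ref = self.env.vehs[self._target_veh_i].v
    else:
        self._v_ref = 0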
......@@ -448,19 +444,17 @@ class Follow(ManeuverBase):
@property
def extra_termination_condition(self):
- APs = self.env.ego.APs
+ # APs = self.env.ego.APs
if self._target_veh_i is None:
return False
- elif not self._enable_low_level_training_properties: # activated only for the high-level training.
-     if (APs['in_stop_region'] or APs['before_but_close_to_stop_region']) \
-             and (self.env.vehs[self._target_veh_i].APs['in_intersection'] or
-                  self.env.vehs[self._target_veh_i].x > 0):
-         return True
-     else:
-         return False
+ #elif not self._enable_low_level_training_properties: # activated only for the high-level training.
+ #    if (APs['in_stop_region'] or APs['before_but_close_to_stop_region']) \
+ #        and (self.env.vehs[self._target_veh_i].APs['in_intersection'] or
+ #        self.env.vehs[self._target_veh_i].x > 0):
+ #        return True
+ #    else:
+ return False
def _features_dim_reduction(self, features_tuple):
......