diff --git a/backends/controller_base.py b/backends/controller_base.py index 885f941c680612f6efbbdc94df26963e83bdefaf..ad3185664f1eab6d506a84117af20de75cbad57c 100644 --- a/backends/controller_base.py +++ b/backends/controller_base.py @@ -51,21 +51,21 @@ class ControllerBase(PolicyBase): Returns state at end of node execution, total reward, epsiode_termination_flag, info ''' - + # TODO: this is never called when you test high-level policy rather than train... def step_current_node(self, visualize_low_level_steps=False): total_reward = 0 self.node_terminal_state_reached = False while not self.node_terminal_state_reached: - observation, reward, terminal, info = self.low_level_step_current_node( - ) + observation, reward, terminal, info = self.low_level_step_current_node() if visualize_low_level_steps: self.env.render() + # TODO: make the total_reward discounted.... total_reward += reward total_reward += self.current_node.high_level_extra_reward # TODO for info - return observation, total_reward, self.env.termination_condition, info + return observation, total_reward, terminal, info # TODO: Looks generic. Move to an intermediate class/highlevel manager so that base class can be clean ''' Executes one step of current node. Sets node_terminal_state_reached flag if node termination condition @@ -76,9 +76,7 @@ class ControllerBase(PolicyBase): def low_level_step_current_node(self): - u_ego = self.current_node.low_level_policy( - self.current_node.get_reduced_features_tuple()) + u_ego = self.current_node.low_level_policy(self.current_node.get_reduced_features_tuple()) feature, R, terminal, info = self.current_node.step(u_ego) self.node_terminal_state_reached = terminal - return self.env.get_features_tuple( - ), R, self.env.termination_condition, info + return self.env.get_features_tuple(), R, self.env.termination_condition, info diff --git a/backends/kerasrl_learner.py b/backends/kerasrl_learner.py index a98dc928bec3d63e0a3975f85144e98671710b95..0da901bd6f80a145b2d8980110a85105017af0f4 100644 --- a/backends/kerasrl_learner.py +++ b/backends/kerasrl_learner.py @@ -8,7 +8,7 @@ from keras.callbacks import TensorBoard from rl.agents import DDPGAgent, DQNAgent from rl.memory import SequentialMemory from rl.random import OrnsteinUhlenbeckProcess -from rl.policy import BoltzmannQPolicy, MaxBoltzmannQPolicy +from rl.policy import GreedyQPolicy, EpsGreedyQPolicy, MaxBoltzmannQPolicy from rl.callbacks import ModelIntervalCheckpoint @@ -229,6 +229,7 @@ class DQNLearner(LearnerBase): model=None, policy=None, memory=None, + test_policy=None, **kwargs): """The constructor which sets the properties of the class. @@ -236,8 +237,8 @@ class DQNLearner(LearnerBase): input_shape: Shape of observation space, e.g (10,); nb_actions: number of values in action space; model: Keras Model of actor which takes observation as input and outputs actions. Uses default if not given - policy: KerasRL Policy. Uses default SequentialMemory if not given - memory: KerasRL Memory. Uses default BoltzmannQPolicy if not given + policy: KerasRL Policy. Uses default RestrictedEpsGreedyQPolicy if not given + memory: KerasRL Memory. 
Uses default SequentialMemory if not given **kwargs: other optional key-value arguments with defaults defined in property_defaults """ super(DQNLearner, self).__init__(input_shape, nb_actions, **kwargs) @@ -255,12 +256,14 @@ class DQNLearner(LearnerBase): model = self.get_default_model() if policy is None: policy = self.get_default_policy() + if test_policy is None: + test_policy = self.get_default_test_policy() if memory is None: memory = self.get_default_memory() self.low_level_policies = low_level_policies - self.agent_model = self.create_agent(model, policy, memory) + self.agent_model = self.create_agent(model, policy, memory, test_policy) def get_default_model(self): """Creates the default model. @@ -269,9 +272,11 @@ class DQNLearner(LearnerBase): """ model = Sequential() model.add(Flatten(input_shape=(1, ) + self.input_shape)) - model.add(Dense(32)) + model.add(Dense(64)) + model.add(Activation('relu')) + model.add(Dense(64)) model.add(Activation('relu')) - model.add(Dense(32)) + model.add(Dense(64)) model.add(Activation('relu')) model.add(Dense(self.nb_actions)) model.add(Activation('linear')) @@ -280,7 +285,10 @@ class DQNLearner(LearnerBase): return model def get_default_policy(self): - return MaxBoltzmannQPolicy(eps=0.3) + return RestrictedEpsGreedyQPolicy(0.3) + + def get_default_test_policy(self): + return RestrictedGreedyQPolicy() def get_default_memory(self): """Creates the default memory model. @@ -291,7 +299,7 @@ class DQNLearner(LearnerBase): limit=self.mem_size, window_length=self.mem_window_length) return memory - def create_agent(self, model, policy, memory): + def create_agent(self, model, policy, memory, test_policy): """Creates a KerasRL DDPGAgent with given components. Args: @@ -309,6 +317,7 @@ class DQNLearner(LearnerBase): nb_steps_warmup=self.nb_steps_warmup, target_model_update=self.target_model_update, policy=policy, + test_policy=test_policy, enable_dueling_network=True) agent.compile(Adam(lr=self.lr), metrics=['mae']) @@ -319,6 +328,8 @@ class DQNLearner(LearnerBase): env, nb_steps=1000000, visualize=False, + verbose=1, + log_interval=10000, nb_max_episode_steps=200, tensorboard=False, model_checkpoints=False, @@ -338,7 +349,8 @@ class DQNLearner(LearnerBase): env, nb_steps=nb_steps, visualize=visualize, - verbose=1, + verbose=verbose, + log_interval=log_interval, nb_max_episode_steps=nb_max_episode_steps, callbacks=callbacks) @@ -410,6 +422,82 @@ class DQNLearner(LearnerBase): return relevant +class RestrictedEpsGreedyQPolicy(EpsGreedyQPolicy): + """Implement the restricted epsilon-greedy policy + + Restricted Eps Greedy policy. + This policy ensures that it never chooses an action whose value is -inf. + + """ + + def __init__(self, eps=.1): + super(RestrictedEpsGreedyQPolicy, self).__init__(eps) + + def select_action(self, q_values): + """Return the selected action + + # Arguments + q_values (np.ndarray): List of the estimations of Q for each action + + # Returns + Selected action + """ + assert q_values.ndim == 1 + nb_actions = q_values.shape[0] + index = list() + + for i in range(0, nb_actions): + if q_values[i] != -np.inf: + index.append(i) + + # the case where every q_value is -np.inf sometimes inevitably happens within the fit and test functions + # of keras-rl at the terminal stage, as they force a call to forward in the Keras-RL learner, which in turn calls this function. + # TODO: add exception handling or a more principled way to choose the action in this exceptional case.
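The restricted epsilon-greedy rule introduced above can be read as the following standalone sketch. The function name restricted_eps_greedy and the plain NumPy inputs are illustrative only, and np.random.choice is used here in place of the deprecated np.random.random_integers call that appears in the patch:

import numpy as np

def restricted_eps_greedy(q_values, eps=0.3):
    """Epsilon-greedy selection restricted to actions whose Q-value is not -inf."""
    q_values = np.asarray(q_values, dtype=float)
    valid = np.flatnonzero(q_values != -np.inf)
    if valid.size == 0:
        # every Q-value is -inf: fall back to action 0, as the patch does
        return 0
    if np.random.uniform() <= eps:
        # explore uniformly over the valid actions only
        return int(np.random.choice(valid))
    # exploit: with at least one finite Q-value, the argmax is a valid action
    return int(np.argmax(q_values))

print(restricted_eps_greedy([-np.inf, 1.2, 0.7, -np.inf]))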
+ if len(index) < 1: + # every q_value is -np.inf, we choose action = 0 + action = 0 + print("Warning: no action satisfies initiation condition, action = 0 is chosen by default.") + + elif np.random.uniform() <= self.eps: + action = index[np.random.random_integers(0, len(index) - 1)] + + else: + action = np.argmax(q_values) + + return action + + +class RestrictedGreedyQPolicy(GreedyQPolicy): + """Implement the epsilon greedy policy + + Restricted Greedy policy. + This policy ensures that it never chooses the action whose value is -inf + + """ + + def select_action(self, q_values): + """Return the selected action + + # Arguments + q_values (np.ndarray): List of the estimations of Q for each action + + # Returns + Selection action + """ + assert q_values.ndim == 1 + + # TODO: exception process or some more process to choose action in this exceptional case. + if np.max(q_values) == - np.inf: + # every q_value is -np.inf, we choose action = 0 + action = 0 + print("Warning: no action satisfies initiation condition, action = 0 is chosen by default.") + + else: + action = np.argmax(q_values) + + return action + + class DQNAgentOverOptions(DQNAgent): def __init__(self, model, @@ -433,8 +521,10 @@ class DQNAgentOverOptions(DQNAgent): def __get_invalid_node_indices(self): """Returns a list of option indices that are invalid according to initiation conditions.""" + invalid_node_indices = list() for index, option_alias in enumerate(self.low_level_policy_aliases): + # TODO: Locate reset_maneuver to another place as this is a "get" function. self.low_level_policies[option_alias].reset_maneuver() if not self.low_level_policies[option_alias].initiation_condition: invalid_node_indices.append(index) diff --git a/backends/manual_policy.py b/backends/manual_policy.py index 6e59db13b580d3762507f6fe286431cbb7b02ceb..3876b1d97dd2754309689d9b229d6fd6cb7ab7a6 100644 --- a/backends/manual_policy.py +++ b/backends/manual_policy.py @@ -30,6 +30,7 @@ class ManualPolicy(ControllerBase): new_node = None if self.low_level_policies[self.current_node].termination_condition: for next_node in self.adj[self.current_node]: + self.low_level_policies[next_node].reset_maneuver() if self.low_level_policies[next_node].initiation_condition: new_node = next_node break # change current_node to the highest priority next node diff --git a/backends/trained_policies/0.1mil_weights/changelane_weights_actor.h5f b/backends/trained_policies/0.1mil_weights/changelane_weights_actor.h5f deleted file mode 100644 index 739298c51cfa43433ff21b4962cbba17170ee7a9..0000000000000000000000000000000000000000 Binary files a/backends/trained_policies/0.1mil_weights/changelane_weights_actor.h5f and /dev/null differ diff --git a/backends/trained_policies/0.1mil_weights/changelane_weights_critic.h5f b/backends/trained_policies/0.1mil_weights/changelane_weights_critic.h5f deleted file mode 100644 index 681b967fa6dbd6a2aaf46413891ca7f9229efeeb..0000000000000000000000000000000000000000 Binary files a/backends/trained_policies/0.1mil_weights/changelane_weights_critic.h5f and /dev/null differ diff --git a/backends/trained_policies/0.1mil_weights/follow_weights_actor.h5f b/backends/trained_policies/0.1mil_weights/follow_weights_actor.h5f deleted file mode 100644 index fa417a1dc40b3f5e953e687315f960f2b1ad3e92..0000000000000000000000000000000000000000 Binary files a/backends/trained_policies/0.1mil_weights/follow_weights_actor.h5f and /dev/null differ diff --git a/backends/trained_policies/0.1mil_weights/follow_weights_critic.h5f 
b/backends/trained_policies/0.1mil_weights/follow_weights_critic.h5f deleted file mode 100644 index 0105de348166a4cbaa29998174f5e89e82b3b86e..0000000000000000000000000000000000000000 Binary files a/backends/trained_policies/0.1mil_weights/follow_weights_critic.h5f and /dev/null differ diff --git a/backends/trained_policies/0.1mil_weights/keeplane_weights_actor.h5f b/backends/trained_policies/0.1mil_weights/keeplane_weights_actor.h5f deleted file mode 100644 index 639fdd5b1889a604f6a61b4ad11b18ee9f703ea1..0000000000000000000000000000000000000000 Binary files a/backends/trained_policies/0.1mil_weights/keeplane_weights_actor.h5f and /dev/null differ diff --git a/backends/trained_policies/0.1mil_weights/keeplane_weights_critic.h5f b/backends/trained_policies/0.1mil_weights/keeplane_weights_critic.h5f deleted file mode 100644 index bc791095159a0f5656381a1c2458bb5cab227c4c..0000000000000000000000000000000000000000 Binary files a/backends/trained_policies/0.1mil_weights/keeplane_weights_critic.h5f and /dev/null differ diff --git a/backends/trained_policies/0.1mil_weights/stop_weights_actor.h5f b/backends/trained_policies/0.1mil_weights/stop_weights_actor.h5f deleted file mode 100644 index ee1d2af63e8812fc52a977d1db63342ffb5b0148..0000000000000000000000000000000000000000 Binary files a/backends/trained_policies/0.1mil_weights/stop_weights_actor.h5f and /dev/null differ diff --git a/backends/trained_policies/0.1mil_weights/stop_weights_critic.h5f b/backends/trained_policies/0.1mil_weights/stop_weights_critic.h5f deleted file mode 100644 index ad2e374269254b3fe6151f484ba04b5b9c29af91..0000000000000000000000000000000000000000 Binary files a/backends/trained_policies/0.1mil_weights/stop_weights_critic.h5f and /dev/null differ diff --git a/backends/trained_policies/0.1mil_weights/wait_weights_actor.h5f b/backends/trained_policies/0.1mil_weights/wait_weights_actor.h5f deleted file mode 100644 index a861c76ca25171a7644cb2fe3f75772fe7c5d6ae..0000000000000000000000000000000000000000 Binary files a/backends/trained_policies/0.1mil_weights/wait_weights_actor.h5f and /dev/null differ diff --git a/backends/trained_policies/0.1mil_weights/wait_weights_critic.h5f b/backends/trained_policies/0.1mil_weights/wait_weights_critic.h5f deleted file mode 100644 index b20d0e3209408f46f31f2c46e6b6b51ce23b2b65..0000000000000000000000000000000000000000 Binary files a/backends/trained_policies/0.1mil_weights/wait_weights_critic.h5f and /dev/null differ diff --git a/backends/trained_policies/changelane/changelane_weights_actor.h5f b/backends/trained_policies/changelane/changelane_weights_actor.h5f index 0a7ae9e96102f399e64dc83c4cc77e9e3b7a44cd..3f6e76aa921b07e851ab4eac8045e74c0c28e848 100644 Binary files a/backends/trained_policies/changelane/changelane_weights_actor.h5f and b/backends/trained_policies/changelane/changelane_weights_actor.h5f differ diff --git a/backends/trained_policies/changelane/changelane_weights_critic.h5f b/backends/trained_policies/changelane/changelane_weights_critic.h5f index 40ca3a83088d8984002bc0a4546020174e9cd431..b53dfd8f69aa96544b986a2de8c506b20162358f 100644 Binary files a/backends/trained_policies/changelane/changelane_weights_critic.h5f and b/backends/trained_policies/changelane/changelane_weights_critic.h5f differ diff --git a/backends/trained_policies/follow/follow_weights_actor.h5f b/backends/trained_policies/follow/follow_weights_actor.h5f index bcb0af7c5012216d05eb966550600c28a3bc322c..0f4bf96684f16bf8fc8db62dd56a301f5a96b62f 100644 Binary files 
a/backends/trained_policies/follow/follow_weights_actor.h5f and b/backends/trained_policies/follow/follow_weights_actor.h5f differ diff --git a/backends/trained_policies/follow/follow_weights_critic.h5f b/backends/trained_policies/follow/follow_weights_critic.h5f index f6258b53c8589db6a04382245c31f20d7f0915dc..2f51be39f37e0956887e930d969caf9ea421e0c2 100644 Binary files a/backends/trained_policies/follow/follow_weights_critic.h5f and b/backends/trained_policies/follow/follow_weights_critic.h5f differ diff --git a/backends/trained_policies/halt/immediatestop_weights_actor.h5f b/backends/trained_policies/halt/immediatestop_weights_actor.h5f new file mode 100644 index 0000000000000000000000000000000000000000..3ade0ae38ec13eb1040c1a4dc8a91390d9ed6752 Binary files /dev/null and b/backends/trained_policies/halt/immediatestop_weights_actor.h5f differ diff --git a/backends/trained_policies/halt/immediatestop_weights_critic.h5f b/backends/trained_policies/halt/immediatestop_weights_critic.h5f new file mode 100644 index 0000000000000000000000000000000000000000..2b97e53475d094a51c29cc8ed82f5bdc47f9ea3f Binary files /dev/null and b/backends/trained_policies/halt/immediatestop_weights_critic.h5f differ diff --git a/backends/trained_policies/highlevel/highlevel_weights.h5f b/backends/trained_policies/highlevel/highlevel_weights.h5f index 6dcc7ba4718d82e329f833ff23a8c68595309c61..b3c5347197f637d893b80c7258a6504d4368c59b 100644 Binary files a/backends/trained_policies/highlevel/highlevel_weights.h5f and b/backends/trained_policies/highlevel/highlevel_weights.h5f differ diff --git a/backends/trained_policies/highlevel/highlevel_weights_772.h5f b/backends/trained_policies/highlevel/highlevel_weights_772.h5f deleted file mode 100644 index 7b986c74005a62b1dc11cbdd3022105ec5317d37..0000000000000000000000000000000000000000 Binary files a/backends/trained_policies/highlevel/highlevel_weights_772.h5f and /dev/null differ diff --git a/backends/trained_policies/keeplane/keeplane_weights_actor.h5f b/backends/trained_policies/keeplane/keeplane_weights_actor.h5f index d9224714ec7717274033cb3f778800e6cacb690f..8efaf8b74e3fc805bbb373c3e24d3584df7ccf0c 100644 Binary files a/backends/trained_policies/keeplane/keeplane_weights_actor.h5f and b/backends/trained_policies/keeplane/keeplane_weights_actor.h5f differ diff --git a/backends/trained_policies/keeplane/keeplane_weights_critic.h5f b/backends/trained_policies/keeplane/keeplane_weights_critic.h5f index e5b4bf77af956cdae29a45518d768738fe647b6c..d7a70181897d2632e8c43446ee15c6dde69f294c 100644 Binary files a/backends/trained_policies/keeplane/keeplane_weights_critic.h5f and b/backends/trained_policies/keeplane/keeplane_weights_critic.h5f differ diff --git a/backends/trained_policies/left/left_weights_actor.h5f b/backends/trained_policies/left/left_weights_actor.h5f new file mode 100644 index 0000000000000000000000000000000000000000..3f6e76aa921b07e851ab4eac8045e74c0c28e848 Binary files /dev/null and b/backends/trained_policies/left/left_weights_actor.h5f differ diff --git a/backends/trained_policies/left/left_weights_critic.h5f b/backends/trained_policies/left/left_weights_critic.h5f new file mode 100644 index 0000000000000000000000000000000000000000..b53dfd8f69aa96544b986a2de8c506b20162358f Binary files /dev/null and b/backends/trained_policies/left/left_weights_critic.h5f differ diff --git a/backends/trained_policies/right/right_weights_actor.h5f b/backends/trained_policies/right/right_weights_actor.h5f new file mode 100644 index 
0000000000000000000000000000000000000000..3f6e76aa921b07e851ab4eac8045e74c0c28e848 Binary files /dev/null and b/backends/trained_policies/right/right_weights_actor.h5f differ diff --git a/backends/trained_policies/right/right_weights_critic.h5f b/backends/trained_policies/right/right_weights_critic.h5f new file mode 100644 index 0000000000000000000000000000000000000000..b53dfd8f69aa96544b986a2de8c506b20162358f Binary files /dev/null and b/backends/trained_policies/right/right_weights_critic.h5f differ diff --git a/backends/trained_policies/stop/stop_weights_actor.h5f b/backends/trained_policies/stop/stop_weights_actor.h5f index 7ef9b12a24464c9c9c4d0ca0bd625b6d1875efb9..1f89ca2104f5801eb243b0898c3f050f91761f77 100644 Binary files a/backends/trained_policies/stop/stop_weights_actor.h5f and b/backends/trained_policies/stop/stop_weights_actor.h5f differ diff --git a/backends/trained_policies/stop/stop_weights_critic.h5f b/backends/trained_policies/stop/stop_weights_critic.h5f index 41a357a5e68aac591dea24edb47f5339960c0c90..12d7d202761bfafba4f6e9d564c154926bbc72a8 100644 Binary files a/backends/trained_policies/stop/stop_weights_critic.h5f and b/backends/trained_policies/stop/stop_weights_critic.h5f differ diff --git a/backends/trained_policies/wait/wait_weights_actor.h5f b/backends/trained_policies/wait/wait_weights_actor.h5f deleted file mode 100644 index 00703d9cc1931bc31083f9a77575598a030f92c3..0000000000000000000000000000000000000000 Binary files a/backends/trained_policies/wait/wait_weights_actor.h5f and /dev/null differ diff --git a/backends/trained_policies/wait/wait_weights_critic.h5f b/backends/trained_policies/wait/wait_weights_critic.h5f deleted file mode 100644 index 330e90c08e129f18a05e2a54882f8bb9fffc1957..0000000000000000000000000000000000000000 Binary files a/backends/trained_policies/wait/wait_weights_critic.h5f and /dev/null differ diff --git a/env/env_base.py b/env/env_base.py index 4a8b9a50d65d9113ec2d1c5f555ffd88d2e44050..038d59a6815566325a0c7716923d9425090897af 100644 --- a/env/env_base.py +++ b/env/env_base.py @@ -25,7 +25,8 @@ class EpisodicEnvBase(GymCompliantEnvBase): # three types possible ('min', 'max', or 'sum'); # See _reward_superposition below. - terminal_reward_type = 'max' + # TODO: consider the case, where every terminal reward is None. Make this class have a default terminal value (not None) and use it in this case. + terminal_reward_type = 'min' #: If true, the maneuver terminates when the goal has been achieved. 
_terminate_in_goal = False @@ -140,13 +141,11 @@ class EpisodicEnvBase(GymCompliantEnvBase): def _reset_model_checker(self, AP): - self.__mc_AP = int(AP) - if self._LTL_preconditions_enable: for LTL_precondition in self._LTL_preconditions: LTL_precondition.reset_property() - if LTL_precondition.enabled: - LTL_precondition.check_incremental(self.__mc_AP) + + self._incremental_model_checking(AP) def _set_mc_AP(self, AP): self.__mc_AP = int(AP) @@ -158,7 +157,7 @@ class EpisodicEnvBase(GymCompliantEnvBase): if self._terminate_in_goal and self.goal_achieved: return True - return self.violation_happened and self._LTL_preconditions_enable + return self.violation_happened @property def goal_achieved(self): @@ -176,8 +175,8 @@ class EpisodicEnvBase(GymCompliantEnvBase): if not self._LTL_preconditions_enable: return False - for LTL_precondition in self._LTL_preconditions: - if LTL_precondition.result == Parser.FALSE: + for LTL in self._LTL_preconditions: + if LTL.enabled and (LTL.result == Parser.FALSE): return True return False diff --git a/env/simple_intersection/features.py b/env/simple_intersection/features.py index 2b70e51fa7357378e061ea599566ae0fea1dc5b2..2c422c6a2ffa97d16d47dd118306e8d289db2789 100644 --- a/env/simple_intersection/features.py +++ b/env/simple_intersection/features.py @@ -174,6 +174,6 @@ class Features(object): # Add buffer features to make a fixed length feature vector for i in range(MAX_NUM_VEHICLES - len(self.other_vehs)): - feature += (0.0, 0.0, 0.0, 0.0, -1) + feature += (0.0, 0.0, 0.0, 0.0, -1.0) return feature diff --git a/env/simple_intersection/road_geokinemetry.py b/env/simple_intersection/road_geokinemetry.py index 63190be6e9982cc9d14bc532bdbda0a7a666e3d0..602b3e5f9c800f211ad4ed20ec1ab9b952e8ff6f 100644 --- a/env/simple_intersection/road_geokinemetry.py +++ b/env/simple_intersection/road_geokinemetry.py @@ -93,7 +93,7 @@ vlanes = Route( [-vwidth - 5.0 - intersection_voffset, -vwidth - intersection_voffset], 35, [-4.0, 4.0]) -intersection_width = vlanes.n_lanes * vlanes.width +intersection_width = vlanes.n_lanes * vlanes.width intersection_height = hlanes.n_lanes * hlanes.width intersection_width_w_offset = intersection_width + 2 * intersection_hoffset diff --git a/env/simple_intersection/simple_intersection_env.py b/env/simple_intersection/simple_intersection_env.py index 1d61b91ad2a9300a4cbf657bbaf1a8c80dd9347e..bd344831411192ebbc4a58fe9e6c71a5360a17ca 100644 --- a/env/simple_intersection/simple_intersection_env.py +++ b/env/simple_intersection/simple_intersection_env.py @@ -51,7 +51,10 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase): #: The weight vector to calculate the cost. In the maneuver, cost_weights # can be set to a specific value which may be different than the default. - cost_weights = (1.0, 0.25, 0.1, 1.0, 100.0, 0.1, 0.25, 0.1) + # TODO: check _cost_weights in both here and ManeuverBase. The _cost_weights has to be substituted to here, but it doesn't sometimes. + # TODO: set a functionality of setting _cost_weights for low and high level training separately. 
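The patch replaces the old cost_weights with a rescaled _cost_weights vector just below (the same eight values also appear in ManeuverBase further down). As the ManeuverBase comments note, a cost is a negative reward that is subtracted from the reward; a minimal illustrative sketch, assuming the eight per-term costs arrive as an array, with the helper name reward_minus_cost not taken from the repo:

import numpy as np

# the rescaled weight vector introduced by the patch (shared with ManeuverBase)
COST_WEIGHTS = (10.0e-3, 10.0e-3, 0.25e-3, 1.0e-3, 100.0e-3, 0.1e-3, 0.05e-3, 0.1e-3)

def reward_minus_cost(base_reward, cost_terms, weights=COST_WEIGHTS):
    # a cost is a negative reward, so the weighted cost is subtracted from the reward
    return base_reward - float(np.dot(weights, cost_terms))

print(reward_minus_cost(1.0, np.ones(8)))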
+ _cost_weights = (10.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3, + 100.0 * 1e-3, 0.1 * 1e-3, 0.05 * 1e-3, 0.1 * 1e-3) #TODO: Move this to constants # The empirical min and max of each term in the cost vector, which is used to normalize the values @@ -271,7 +274,7 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase): # stopped_car_scenario = bool(np.random.randint(0, 1)) TODO: this scenario may not work n_others_stopped_in_stop_region = np.random.randint( 0, min(3, n_others - stopped_car_scenario)) - veh_ahead_scenario = bool(np.random.randint(0, 1)) + veh_ahead_scenario = bool(np.random.randint(0, 2)) or veh_ahead_scenario if n_others_stopped_in_stop_region > min( n_others - stopped_car_scenario, 3): @@ -1156,12 +1159,13 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase): Returns True if the environment has terminated """ - model_checks_violated = (self._LTL_preconditions_enable and \ - self.current_model_checking_result()) + model_checks_violated = self._LTL_preconditions_enable and \ + self.current_model_checking_result() reached_goal = self._terminate_in_goal and self.goal_achieved self._check_collisions() self._check_ego_theta_out_of_range() terminated = self.termination_condition + return model_checks_violated or reached_goal or terminated @property @@ -1181,7 +1185,7 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase): return (self.ego.x >= rd.hlanes.end_pos) and \ not self.collision_happened and \ - not self.ego.APs['over_speed_limit'] + (self.ego.v <= 1.1*rd.speed_limit) def reset(self): """Gym compliant reset function. @@ -1229,7 +1233,6 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase): self.window.dispatch_events() # Text information about ego vehicle's states - # Right now, we are only training one option (Stop) info = "Ego Attributes:" + get_APs( self, EGO_INDEX, 'in_stop_region', 'has_entered_stop_region', 'has_stopped_in_stop_region', diff --git a/high_level_policy_main.py b/high_level_policy_main.py index 3c5a0f3c787d7937618af5a76816372e5688a2d2..8204c7e2a3d641780a17d71723ee4bde7f6dec57 100644 --- a/high_level_policy_main.py +++ b/high_level_policy_main.py @@ -11,7 +11,7 @@ def high_level_policy_training(nb_steps=25000, load_weights=False, training=True, testing=True, - nb_episodes_for_test=10, + nb_episodes_for_test=20, max_nb_steps=100, visualize=False, tensorboard=False, @@ -63,8 +63,7 @@ def high_level_policy_training(nb_steps=25000, agent.save_model(save_path) if testing: - options.set_controller_policy(agent.predict) - agent.test_model(options, nb_episodes=nb_episodes_for_test) + high_level_policy_testing(nb_episodes_for_test=nb_episodes_for_test) return agent @@ -228,7 +227,6 @@ if __name__ == "__main__": load_weights=args.load_weights, save_path=args.save_file, tensorboard=args.tensorboard, - nb_episodes_for_test=20, visualize=args.visualize) if args.test: diff --git a/options/options_loader.py b/options/options_loader.py index f58554f80af5c0d85909776ab4251ddfe75b4cfb..f4f73869c67699775007b67c4175b724f57b3cbc 100644 --- a/options/options_loader.py +++ b/options/options_loader.py @@ -1,4 +1,5 @@ import json +import os # for the use of os.path.isfile from .simple_intersection.maneuvers import * from .simple_intersection.mcts_maneuvers import * from backends import RLController, DDPGLearner, MCTSLearner, OnlineMCTSController, ManualPolicy @@ -155,19 +156,29 @@ class OptionsGraph: # TODO: error handling def load_trained_low_level_policies(self): for key, maneuver in self.maneuvers.items(): - agent = DDPGLearner( - 
input_shape=(maneuver.get_reduced_feature_length(), ), - nb_actions=2, - gamma=0.99, - nb_steps_warmup_critic=200, - nb_steps_warmup_actor=200, - lr=1e-3) - agent.load_model("backends/trained_policies/" + key + "/" + key + - "_weights.h5f") - maneuver.set_low_level_trained_policy(agent.predict) - maneuver._cost_weights = (20.0 * 1e-3, 1.0 * 1e-3, 0.25 * 1e-3, - 1.0 * 1e-3, 100.0 * 1e-3, 0.1 * 1e-3, - 0.25 * 1e-3, 0.1 * 1e-3) + trained_policy_path = "backends/trained_policies/" + key + "/" + critic_file_exists = os.path.isfile(trained_policy_path + key + "_weights_critic.h5f") + actor_file_exists = os.path.isfile(trained_policy_path + key + "_weights_actor.h5f") + + if actor_file_exists and critic_file_exists: + agent = DDPGLearner( + input_shape=(maneuver.get_reduced_feature_length(),), + nb_actions=2, + gamma=0.99, + nb_steps_warmup_critic=200, + nb_steps_warmup_actor=200, + lr=1e-3) + agent.load_model(trained_policy_path + key + "_weights.h5f") + maneuver.set_low_level_trained_policy(agent.predict) + + elif not critic_file_exists and actor_file_exists: + print("\n Warning: unable to load the low-level policy of \"" + key + + "\". The critic weights file has to be located in the same " + + "directory as the actor weights file; the manual policy will be used instead.\n") + + else: + print("\n Warning: the trained low-level policy of \"" + key + + "\" does not exist; the manual policy will be used.\n") if self.config["method"] == "mcts": maneuver.timeout = np.inf diff --git a/options/simple_intersection/maneuver_base.py b/options/simple_intersection/maneuver_base.py index 78f7a63a67dd19ce93ea0f08e5ae0c9749616f3f..01e210db99ad439efea33561ba53cef298bf8193 100644 --- a/options/simple_intersection/maneuver_base.py +++ b/options/simple_intersection/maneuver_base.py @@ -20,7 +20,7 @@ class ManeuverBase(EpisodicEnvBase): learning_mode = 'training' #: timeout (i.e., time horizon for termination) - # By default, the time-out horizon is 1 as in Paxton et. al (2017). + # By default, the time-out horizon is 1. timeout = 1 #: the option specific weight vector for cost of driving, which is @@ -29,8 +29,9 @@ class ManeuverBase(EpisodicEnvBase): # _extra_action_weights_flag = True); note that a cost is defined # as a negative reward, so a cost will be summed up to the reward # with subtraction. - _cost_weights = (1.0 * 1e-3, 1.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3, - 100.0 * 1e-3, 0.1 * 1e-3, 0.25 * 1e-3, 0.1 * 1e-3) + # TODO: either remove _cost_weights from ManeuverBase or provide additional functionality for keeping it here (see other TODOs in simple_intersection_env regarding _cost_weights). + _cost_weights = (10.0 * 1e-3, 10.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3, + 100.0 * 1e-3, 0.1 * 1e-3, 0.05 * 1e-3, 0.1 * 1e-3) _extra_r_terminal = None _extra_r_on_timeout = None @@ -38,7 +39,7 @@ class ManeuverBase(EpisodicEnvBase): #: the flag being False when _cost_weights is used without # modification; If True, then the action parts of _cost_weights # are increased for some edge cases (see the step method). - _extra_action_weights_flag = True + _extra_action_weights_flag = False #: the extra weights on the actions added to _cost_weights # for some edge cases when _extra_action_weights_flag = True. @@ -153,8 +154,7 @@ class ManeuverBase(EpisodicEnvBase): # in this case, no additional reward by Default # (i.e., self._extra_r_terminal = None by default).
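The terminal reward below is merged through the superposition described earlier in env_base.py, where one of three rules ('min', 'max', or 'sum') applies and the patch switches terminal_reward_type to 'min'. A minimal sketch of such a rule, assuming None stands for "no terminal reward assigned"; the actual EpisodicEnvBase implementation may handle more cases:

def superpose_terminal_reward(current, new, mode='min'):
    """Combine terminal rewards; None means no terminal reward has been assigned."""
    if new is None:
        return current
    if current is None:
        return new
    if mode == 'min':
        return min(current, new)
    if mode == 'max':
        return max(current, new)
    if mode == 'sum':
        return current + new
    raise ValueError("terminal_reward_type must be 'min', 'max', or 'sum'")

print(superpose_terminal_reward(200, -100, mode='min'))   # -100
print(superpose_terminal_reward(None, -100, mode='min'))  # -100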
self._terminal_reward_superposition(self._extra_r_terminal) - info[ - 'maneuver_termination_reason'] = 'extra_termination_condition' + info['maneuver_termination_reason'] = 'extra_termination_condition' if self.timeout_happened: if self._give_reward_on_timeout: # in this case, no additional reward by Default @@ -292,9 +292,8 @@ class ManeuverBase(EpisodicEnvBase): raise NotImplemented(self.__class__.__name__ + ".generate_learning_scenario is not implemented.") - def generate_validation_scenario( - self - ): # Override this method in the subclass if some customization is needed. + # Override this method in the subclass if some customization is needed. + def generate_validation_scenario(self): self.generate_learning_scenario() self._enable_low_level_training_properties = False @@ -334,8 +333,7 @@ class ManeuverBase(EpisodicEnvBase): Returns True if the condition is satisfied, and False otherwise. """ - return not (self.env.termination_condition or self.violation_happened) and \ - self.extra_initiation_condition + return not self.termination_condition and self.extra_initiation_condition @property def extra_initiation_condition(self): diff --git a/options/simple_intersection/maneuvers.py b/options/simple_intersection/maneuvers.py index 7b6ccf4ac243ff0be013a8164a3263a45ae895d1..67c8318d6c57801566d7693a60ceb2212dcbe64e 100644 --- a/options/simple_intersection/maneuvers.py +++ b/options/simple_intersection/maneuvers.py @@ -15,31 +15,125 @@ class KeepLane(ManeuverBase): self._target_lane = self.env.ego.APs['lane'] def _init_LTL_preconditions(self): - self._LTL_preconditions.append(LTLProperty("G ( not veh_ahead )", 0)) + self._LTL_preconditions.append( - LTLProperty("G ( not stopped_now )", 200, + LTLProperty("G ( not stopped_now )", 100, self._enable_low_level_training_properties)) + self._LTL_preconditions.append( LTLProperty( "G ( (lane and target_lane) or (not lane and not target_lane) )", - 200, self._enable_low_level_training_properties)) + 100, self._enable_low_level_training_properties)) def generate_learning_scenario(self): self.generate_scenario( enable_LTL_preconditions=False, - ego_pos_range=(rd.hlanes.start_pos, rd.hlanes.end_pos), + ego_pos_range=(rd.intersection_width_w_offset, rd.hlanes.end_pos), ego_perturb_lim=(rd.hlanes.width / 4, np.pi / 6), + v_max_multiplier=0.75, ego_heading_towards_lane_centre=True) # the goal reward and termination is led by the SimpleIntersectionEnv - self.env._terminate_in_goal = True - self.env._reward_in_goal = 200 + self.env._terminate_in_goal = False + self.env._reward_in_goal = None self._enable_low_level_training_properties = True + self._extra_action_weights_flag = True + + def generate_validation_scenario(self): + self.generate_scenario( + enable_LTL_preconditions=False, + ego_pos_range=(rd.hlanes.start_pos, 0), + ego_perturb_lim=(rd.hlanes.width / 4, np.pi / 6), + ego_heading_towards_lane_centre=True) + # the goal reward and termination is led by the SimpleIntersectionEnv + self.env._terminate_in_goal = False + self.env._reward_in_goal = None @staticmethod def _features_dim_reduction(features_tuple): - return extract_ego_features(features_tuple, 'pos_near_stop_region', - 'v', 'v_ref', 'e_y', 'psi', 'theta', 'acc', - 'psi_dot') + return extract_ego_features(features_tuple, 'v', 'v_ref', 'e_y', 'psi', 'v tan(psi/L)', 'theta', 'lane', 'acc', 'psi_dot') + + @property + def extra_termination_condition(self): + if self._enable_low_level_training_properties: # activated only for the low-level training. 
+ if (self.env.ego.v < self._v_ref / 5) and self.env.ego.acc < 0: + self._extra_r_terminal = -100 + return True + else: + self._extra_r_terminal = None + return False + + return False + + +class Halt(ManeuverBase): + + _terminate_in_goal = True + _reward_in_goal = None + + _penalty_in_violation = None + _ego_pos_range = (rd.intersection_width_w_offset, rd.hlanes.end_pos) + + def _init_param(self): + self._v_ref = 0 if self._enable_low_level_training_properties else rd.speed_limit + self._target_lane = self.env.ego.APs['lane'] + + def _init_LTL_preconditions(self): + + self._LTL_preconditions.append( + LTLProperty( + "G ( (veh_ahead and before_but_close_to_stop_region) U highest_priority )", + None, not self._enable_low_level_training_properties)) + + self._LTL_preconditions.append( + LTLProperty("G ( not stopped_now )", self._penalty(self._reward_in_goal), + not self._enable_low_level_training_properties)) + + self._LTL_preconditions.append( + LTLProperty( + "G ( (lane and target_lane) or (not lane and not target_lane) )", + 100, self._enable_low_level_training_properties)) + + def generate_learning_scenario(self): + self.generate_scenario( + ego_pos_range=self._ego_pos_range, + ego_perturb_lim=(rd.hlanes.width / 4, np.pi / 6), + ego_heading_towards_lane_centre=True) + self.env._terminate_in_goal = False + self.env._reward_in_goal = None + self._reward_in_goal = 200 + self._enable_low_level_training_properties = True + self._extra_action_weights_flag = True + + def generate_validation_scenario(self): + self._ego_pos_range = (rd.hlanes.start_pos, rd.hlanes.end_pos) + self.generate_learning_scenario() + + def _low_level_manual_policy(self): + return self.env.aggressive_driving_policy(EGO_INDEX) + + @staticmethod + def _features_dim_reduction(features_tuple): + return extract_ego_features(features_tuple, 'v', 'v_ref', 'e_y', 'psi', + 'v tan(psi/L)', 'theta', 'lane', 'acc', 'psi_dot') + + @property + def extra_termination_condition(self): + if self._enable_low_level_training_properties: # activated only for the low-level training. + if self.env.ego.APs['stopped_now']: + if self._reward_in_goal is not None: + self._extra_r_terminal = self._reward_in_goal + self._extra_r_terminal *= np.exp(- pow(self.env.ego.theta, 2) + - pow(self.env.ego.y - rd.hlanes.centres[self._target_lane], 2) + - 0.25 * pow(self.env.ego.psi, 2)) + else: + self._extra_r_terminal = None + return True + + else: + self._extra_r_terminal = None + return False + + return False class Stop(ManeuverBase): @@ -47,6 +141,8 @@ class Stop(ManeuverBase): _terminate_in_goal = True _reward_in_goal = None + _penalty_in_violation = None + def _init_param(self): self._set_v_ref() self._target_lane = self.env.ego.APs['lane'] @@ -54,34 +150,33 @@ class Stop(ManeuverBase): def _init_LTL_preconditions(self): self._LTL_preconditions.append( LTLProperty("G ( not has_stopped_in_stop_region )", - self._penalty(self._reward_in_goal))) + self._penalty(self._reward_in_goal), not self._enable_low_level_training_properties)) + # before_intersection rather than "before_but_close_to_stop_region or in_stop_region"? 
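Halt (above) and Stop (below) scale their goal reward by an exponential factor of the ego's heading, lateral offset from the target-lane centre, and steering angle. The same shaping written as a standalone function, with the state passed in as plain numbers and parameter names chosen for illustration:

import numpy as np

def shaped_goal_reward(reward_in_goal, theta, y, lane_centre, psi):
    # full reward only when the ego is aligned (theta, psi near 0) and centred on the target lane
    return reward_in_goal * np.exp(-theta ** 2 - (y - lane_centre) ** 2 - 0.25 * psi ** 2)

print(shaped_goal_reward(200.0, theta=0.0, y=0.0, lane_centre=0.0, psi=0.0))  # 200.0
print(shaped_goal_reward(200.0, theta=0.2, y=0.5, lane_centre=0.0, psi=0.1))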
self._LTL_preconditions.append( LTLProperty( "G ( (before_but_close_to_stop_region or in_stop_region) U has_stopped_in_stop_region )", - 0)) + self._penalty_in_violation)) self._LTL_preconditions.append( - LTLProperty("G ( not stopped_now U in_stop_region )", 200, + LTLProperty("G ( not stopped_now U in_stop_region )", 100, self._enable_low_level_training_properties)) self._LTL_preconditions.append( LTLProperty( "G ( (lane and target_lane) or (not lane and not target_lane) )", - 200, self._enable_low_level_training_properties)) + 100, self._enable_low_level_training_properties)) def _update_param(self): self._set_v_ref() def _set_v_ref(self): self._v_ref = rd.speed_limit + #if self._enable_low_level_training_properties: x = self.env.ego.x - if x <= rd.hlanes.near_stop_region: - self._v_ref = rd.speed_limit - elif x <= rd.hlanes.stop_region_centre: - self._v_ref = -(rd.speed_limit / abs(rd.hlanes.near_stop_region) - ) * (x - rd.hlanes.stop_region_centre) - else: + if rd.hlanes.near_stop_region < x <= rd.hlanes.stop_region_centre: + self._v_ref = -(rd.speed_limit / abs(rd.hlanes.near_stop_region)) * (x - rd.hlanes.stop_region_centre) + elif x > rd.hlanes.stop_region_centre: self._v_ref = 0 def generate_learning_scenario(self): @@ -89,9 +184,12 @@ class Stop(ManeuverBase): ego_pos_range=(rd.hlanes.near_stop_region, -rd.intersection_width_w_offset / 2), ego_perturb_lim=(rd.hlanes.width / 4, np.pi / 6), + v_max_multiplier=0.75, ego_heading_towards_lane_centre=True) self._reward_in_goal = 200 + self._penalty_in_violation = 150 self._enable_low_level_training_properties = True + self._extra_action_weights_flag = True def _low_level_manual_policy(self): return self.env.aggressive_driving_policy(EGO_INDEX) @@ -99,79 +197,115 @@ class Stop(ManeuverBase): @staticmethod def _features_dim_reduction(features_tuple): return extract_ego_features(features_tuple, 'pos_near_stop_region', - 'v', 'v_ref', 'e_y', 'psi', 'theta', 'acc', - 'psi_dot', 'not_in_stop_region') + 'v', 'v_ref', 'e_y', 'psi', 'v tan(psi/L)', 'theta', 'lane', 'acc', + 'psi_dot', 'pos_stop_region', 'not_in_stop_region') + + @property + def extra_termination_condition(self): + if self._enable_low_level_training_properties: # activated only for the low-level training. + if self.env.ego.APs['has_stopped_in_stop_region']: + if self._reward_in_goal is not None: + self._extra_r_terminal = self._reward_in_goal + self._extra_r_terminal *= np.exp(- pow(self.env.ego.theta, 2) + - pow(self.env.ego.y - rd.hlanes.centres[self._target_lane], 2) + - 0.25 * pow(self.env.ego.psi, 2)) + else: + self._extra_r_terminal = None + return True + + elif (rd.speed_limit / 5 < self._v_ref) and \ + (self.env.ego.v < self._v_ref / 2) and self.env.ego.acc < 0: + self._extra_r_terminal = -100 + return True + + else: + self._extra_r_terminal = None + return False + + return False class Wait(ManeuverBase): - _terminate_in_goal = True _reward_in_goal = None + _terminate_in_goal = True def _init_LTL_preconditions(self): + self._LTL_preconditions.append( - LTLProperty( - "G ( (in_stop_region and stopped_now) U highest_priority )", - 0)) + LTLProperty("G ( (in_stop_region and stopped_now) and not (highest_priority and intersection_is_clear))", + None, not self._enable_low_level_training_properties)) # not available in low-level training... 
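The patched Stop._set_v_ref above keeps the speed limit before the stop region, then ramps the reference velocity linearly down to zero at the stop-region centre. Restated as a pure function, with the road constants (rd.speed_limit, rd.hlanes.near_stop_region, rd.hlanes.stop_region_centre) passed as arguments; the sample values below are made up:

def stop_v_ref(x, speed_limit, near_stop_region, stop_region_centre):
    """Piecewise reference velocity used while approaching the stop region."""
    if x <= near_stop_region:
        # far from the stop region: track the speed limit
        return speed_limit
    if x <= stop_region_centre:
        # linear ramp down to zero at the stop-region centre (mirrors the patched formula)
        return -(speed_limit / abs(near_stop_region)) * (x - stop_region_centre)
    # past the centre: reference velocity is zero
    return 0.0

print(stop_v_ref(-18.0, speed_limit=11.0, near_stop_region=-20.0, stop_region_centre=-15.0))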
self._LTL_preconditions.append( LTLProperty("G ( not (in_intersection and highest_priority) )", self._penalty(self._reward_in_goal))) + self._LTL_preconditions.append( + LTLProperty( + "G ( in_stop_region U (highest_priority and intersection_is_clear) )", 150, self._enable_low_level_training_properties)) + + self._LTL_preconditions.append( + LTLProperty( + "G ( (lane and target_lane) or (not lane and not target_lane) )", + 100, self._enable_low_level_training_properties)) + def _init_param(self): - ego = self.env.ego - self._v_ref = rd.speed_limit if self.env.ego.APs[ - 'highest_priority'] else 0 - self._target_lane = ego.APs['lane'] - self._ego_stop_count = 0 + self._v_ref = 0 #if self._enable_low_level_training_properties else rd.speed_limit + self._target_lane = self.env.ego.APs['lane'] - def _update_param(self): - if self.env.ego.APs['highest_priority']: - self._v_ref = rd.speed_limit - if self._enable_low_level_training_properties: - if self.env.n_others_with_higher_priority == 0: - self._ego_stop_count += 1 + def _low_level_manual_policy(self): + return (0, 0) # Do nothing during "Wait" but just wait until the highest priority is given. - def generate_learning_scenario(self): - n_others = np.random.randint(0, 3) - self.generate_scenario( - enable_LTL_preconditions=True, - timeout=62, - n_others_range=(n_others, n_others), - ego_pos_range=rd.hlanes.stop_region, - n_others_stopped_in_stop_region=n_others, - ego_v_upper_lim=0, - ego_perturb_lim=(rd.hlanes.width / 4, np.pi / 6), - ego_heading_towards_lane_centre=True) +# @staticmethod +# def _features_dim_reduction(features_tuple): +# return extract_ego_features( +# features_tuple, 'v', 'v_ref', 'e_y', 'psi', 'v tan(psi/L)', 'theta', 'lane', 'acc', 'psi_dot', +# 'pos_stop_region', 'intersection_is_clear', 'highest_priority') - max_waited_count = 0 - for veh in self.env.vehs[1:]: - max_waited_count = max(max_waited_count, veh.waited_count) - self._extra_action_weights_flag = False - self.env.ego.waited_count = np.random.randint(0, max_waited_count + 21) - self.env.init_APs(False) +class Left(ManeuverBase): - self._reward_in_goal = 200 - self._extra_r_on_timeout = -200 - self._enable_low_level_training_properties = True - self._ego_stop_count = 0 + min_y_distance = rd.hlanes.width / 4 + + _terminate_in_goal = True + _reward_in_goal = None + + def _init_param(self): + self._v_ref = rd.speed_limit + self._target_lane = False + self._terminate_in_goal = True @property - def extra_termination_condition(self): - if self._enable_low_level_training_properties: # activated only for the low-level training. 
- if self._ego_stop_count >= 50: - self._extra_r_terminal = -200 - return True - else: - self._extra_r_terminal = None - return False + def goal_achieved(self): + ego = self.env.ego + APs = self.env.ego.APs + on_other_lane = APs['lane'] == self._target_lane + achieved_y_displacement = np.sign(ego.y) * \ + (ego.y - rd.hlanes.centres[APs['target_lane']]) >= - self.min_y_distance + return on_other_lane and APs['on_route'] and \ + achieved_y_displacement and APs['parallel_to_lane'] + + @property + def extra_initiation_condition(self): + return self.env.ego.APs['lane'] @staticmethod def _features_dim_reduction(features_tuple): - return extract_ego_features( - features_tuple, 'v', 'v_ref', 'psi', 'theta', 'acc', 'psi_dot', - 'pos_stop_region', 'intersection_is_clear', 'highest_priority') + return extract_ego_features(features_tuple, 'v', 'v_ref', 'e_y', 'psi', + 'v tan(psi/L)', 'theta', 'lane', 'acc', + 'psi_dot') + + +class Right(Left): + + def _init_param(self): + self._v_ref = rd.speed_limit + self._target_lane = True + self._terminate_in_goal = True + + @property + def extra_initiation_condition(self): + return not self.env.ego.APs['lane'] class ChangeLane(ManeuverBase): @@ -183,22 +317,25 @@ class ChangeLane(ManeuverBase): _violation_penalty_in_low_level_training = None - high_level_extra_reward = -20 + high_level_extra_reward = -50 def _init_param(self): self._v_ref = rd.speed_limit self._target_lane = not self.env.ego.APs['lane'] - self._terminate_in_goal = True def _init_LTL_preconditions(self): self._LTL_preconditions.append( - LTLProperty("G ( on_route and not over_speed_limit )", - self._violation_penalty_in_low_level_training, - self._enable_low_level_training_properties)) + LTLProperty("G ( on_route and not over_speed_limit )", + self._violation_penalty_in_low_level_training, + self._enable_low_level_training_properties)) + self._LTL_preconditions.append( - LTLProperty("G ( not stopped_now )", - self._violation_penalty_in_low_level_training, - self._enable_low_level_training_properties)) + LTLProperty("G ( not stopped_now )", + 100, self._enable_low_level_training_properties)) + + self._LTL_preconditions.append( + LTLProperty("G ( not in_intersection and not in_stop_region )", + None, not self._enable_low_level_training_properties)) # activated only for the high-level case. 
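Left, Right, and ChangeLane share the same goal test: the ego is on the target lane, on route, parallel to the lane, and its lateral displacement is within min_y_distance of the target-lane centre. A sketch of that predicate with plain arguments, where lane_centres stands in for rd.hlanes.centres and the sample values are illustrative:

import numpy as np

def lane_change_goal_achieved(ego_y, ego_lane, on_route, parallel_to_lane,
                              target_lane, lane_centres, min_y_distance):
    on_target_lane = (ego_lane == target_lane)
    # the displacement towards the target-lane centre must be within min_y_distance
    achieved_y_displacement = np.sign(ego_y) * (ego_y - lane_centres[int(target_lane)]) >= -min_y_distance
    return bool(on_target_lane and on_route and parallel_to_lane and achieved_y_displacement)

print(lane_change_goal_achieved(-1.0, True, True, True, True, [4.0, -1.05], 0.5))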
@property def goal_achieved(self): @@ -206,9 +343,9 @@ class ChangeLane(ManeuverBase): APs = self.env.ego.APs on_other_lane = APs['lane'] == self._target_lane achieved_y_displacement = np.sign(ego.y) * \ - (ego.y - rd.hlanes.centres[APs['target_lane']]) >= - self.min_y_distance + (ego.y - rd.hlanes.centres[APs['target_lane']]) >= - self.min_y_distance return on_other_lane and APs['on_route'] and \ - achieved_y_displacement and APs['parallel_to_lane'] + achieved_y_displacement and APs['parallel_to_lane'] def _low_level_manual_policy(self): return self.env.aggressive_driving_policy(EGO_INDEX) @@ -217,16 +354,28 @@ class ChangeLane(ManeuverBase): self.generate_scenario( enable_LTL_preconditions=False, timeout=15, - ego_pos_range=(rd.hlanes.start_pos, rd.hlanes.end_pos), + ego_pos_range=(rd.intersection_width_w_offset, rd.hlanes.end_pos), ego_lane=np.random.choice([0, 1]), - ego_perturb_lim=(rd.hlanes.width / 5, np.pi / 6)) + ego_perturb_lim=(rd.hlanes.width / 4, np.pi / 6), + v_max_multiplier=0.75) + + # print('our range was %s, %s, ego at %s' % (before_intersection, after_intersection, self.env.ego.x)) + self._reward_in_goal = 200 + self._violation_penalty_in_low_level_training = 150 + self._enable_low_level_training_properties = True + self._extra_action_weights_flag = True + + def generate_validation_scenario(self): + self.generate_scenario( + enable_LTL_preconditions=False, + ego_pos_range=(rd.hlanes.start_pos, rd.hlanes.end_pos), + ego_perturb_lim=(rd.hlanes.width / 4, np.pi / 6)) + # print('our range was %s, %s, ego at %s' % (before_intersection, after_intersection, self.env.ego.x)) self._reward_in_goal = 200 - self._violation_penalty_in_low_level_training = 200 + self._violation_penalty_in_low_level_training = 150 self._enable_low_level_training_properties = True - # TODO: It is not a good idea to specify features by numbers, as the list - # of features is ever changing. We should specify them by strings. 
@staticmethod def _features_dim_reduction(features_tuple): return extract_ego_features(features_tuple, 'v', 'v_ref', 'e_y', 'psi', @@ -237,49 +386,84 @@ class ChangeLane(ManeuverBase): class Follow(ManeuverBase): _target_veh_i = None - _penalty_for_out_of_follow_range = None + _penalty_for_out_of_range = None + _penalty_for_change_lane = None def _init_LTL_preconditions(self): self._LTL_preconditions.append( - LTLProperty("G ( veh_ahead )", - self._penalty_for_out_of_follow_range)) + LTLProperty("G ( veh_ahead )", self._penalty_for_out_of_range)) self._LTL_preconditions.append( - LTLProperty( - "G ( (lane and target_lane) or (not lane and not target_lane) )", - self._penalty_for_out_of_follow_range)) + LTLProperty( + "G ( (lane and target_lane) or (not lane and not target_lane) )", + self._penalty_for_change_lane)) - self._LTL_preconditions.append( - LTLProperty("G ( not stopped_now U veh_ahead_stopped_now)", 200, - self._enable_low_level_training_properties)) + # self._LTL_preconditions.append( + # LTLProperty("G ( not stopped_now U veh_ahead_stopped_now)", 200, + # self._enable_low_level_training_properties)) self._LTL_preconditions.append( - LTLProperty("G ( not veh_ahead_too_close )", 200, - self._enable_low_level_training_properties)) + LTLProperty("G ( not veh_ahead_too_close )", self._penalty_for_out_of_range, + self._enable_low_level_training_properties)) def generate_learning_scenario(self): self.generate_scenario( enable_LTL_preconditions=False, n_others_range=(1, 1), - ego_perturb_lim=(rd.hlanes.width / 2, np.pi / 4), + v_max_multiplier=0.75, + ego_perturb_lim=(0, 0), veh_ahead_scenario=True) self.env._terminate_in_goal = False - self._penalty_for_out_of_follow_range = 200 + self._penalty_for_out_of_range = 200 + self._penalty_for_change_lane = 170 self._enable_low_level_training_properties = True + self._extra_action_weights_flag = True + + def generate_validation_scenario(self): + self.generate_learning_scenario() + + def _init_param(self): + self._set_v_ref() def _update_param(self): + self._set_v_ref() + + def _set_v_ref(self): + #if self._enable_low_level_training_properties: self._target_veh_i, _ = self.env.get_V2V_distance() + if self._target_veh_i is not None: + self._v_ref = self.env.vehs[self._target_veh_i].v + else: + self._v_ref = 0 + #else: + # self._v_ref = rd.speed_limit + def _low_level_manual_policy(self): return self.env.aggressive_driving_policy(EGO_INDEX) + @property + def extra_termination_condition(self): + # APs = self.env.ego.APs + + if self._target_veh_i is None: + return False + + #elif not self._enable_low_level_training_properties: # activated only for the high-level training. + # if (APs['in_stop_region'] or APs['before_but_close_to_stop_region']) \ + # and (self.env.vehs[self._target_veh_i].APs['in_intersection'] or + # self.env.vehs[self._target_veh_i].x > 0): + # return True + # else: + return False + def _features_dim_reduction(self, features_tuple): ego_features = extract_ego_features( - features_tuple, 'v', 'v_ref', 'e_y', 'psi', 'v tan(psi/L)', - 'theta', 'lane', 'e_y,lane', 'acc', 'psi_dot') + features_tuple, 'pos_near_stop_region', 'v', 'v_ref', 'e_y', 'psi', 'v tan(psi/L)', + 'theta', 'lane', 'acc', 'psi_dot') + if self._target_veh_i is not None: return ego_features + extract_other_veh_features( - features_tuple, self._target_veh_i, 'rel_x', 'rel_y', 'v', - 'acc') + features_tuple, self._target_veh_i, 'rel_x', 'rel_y', 'v', 'acc') else: return ego_features + (0.0, 0.0, 0.0, 0.0)
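Follow._features_dim_reduction keeps the reduced feature vector at a fixed length by appending four zeros when there is no vehicle ahead. A sketch of that padding rule, assuming the lead vehicle contributes the four features rel_x, rel_y, v, and acc as in the patch (the function name is not from the repo):

def follow_reduced_features(ego_features, lead_features=None, lead_dim=4):
    # keep the reduced feature vector a fixed length: pad with zeros when no vehicle is ahead
    if lead_features is None:
        return tuple(ego_features) + (0.0,) * lead_dim
    return tuple(ego_features) + tuple(lead_features)

print(follow_reduced_features((0.0, 9.9, 9.5)))
print(follow_reduced_features((0.0, 9.9, 9.5), (12.0, 0.1, 8.0, -0.2)))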