diff --git a/backends/kerasrl_learner.py b/backends/kerasrl_learner.py
index a24c4daa9bd45c548e3278597547ef768db6698a..3d7973473615e87a43df3b4734a8e6afbc6eaf98 100644
--- a/backends/kerasrl_learner.py
+++ b/backends/kerasrl_learner.py
@@ -8,7 +8,7 @@ from keras.callbacks import TensorBoard
 from rl.agents import DDPGAgent, DQNAgent
 from rl.memory import SequentialMemory
 from rl.random import OrnsteinUhlenbeckProcess
-from rl.policy import BoltzmannQPolicy, MaxBoltzmannQPolicy
+from rl.policy import GreedyQPolicy, EpsGreedyQPolicy, MaxBoltzmannQPolicy
 from rl.callbacks import ModelIntervalCheckpoint
 
 
@@ -229,6 +229,7 @@ class DQNLearner(LearnerBase):
                  model=None,
                  policy=None,
                  memory=None,
+                 test_policy=None,
                  **kwargs):
         """The constructor which sets the properties of the class.
 
@@ -236,8 +237,8 @@ class DQNLearner(LearnerBase):
             input_shape: Shape of observation space, e.g (10,);
             nb_actions: number of values in action space;
             model: Keras Model of actor which takes observation as input and outputs actions. Uses default if not given
-            policy: KerasRL Policy. Uses default SequentialMemory if not given
-            memory: KerasRL Memory. Uses default BoltzmannQPolicy if not given
+            policy: KerasRL Policy. Uses default RestrictedEpsGreedyQPolicy if not given
+            memory: KerasRL Memory. Uses default SequentialMemory if not given
             **kwargs: other optional key-value arguments with defaults defined in property_defaults
         """
         super(DQNLearner, self).__init__(input_shape, nb_actions, **kwargs)
@@ -255,12 +256,14 @@ class DQNLearner(LearnerBase):
             model = self.get_default_model()
         if policy is None:
             policy = self.get_default_policy()
+        if test_policy is None:
+            test_policy = self.get_default_test_policy()
         if memory is None:
             memory = self.get_default_memory()
 
         self.low_level_policies = low_level_policies
 
-        self.agent_model = self.create_agent(model, policy, memory)
+        self.agent_model = self.create_agent(model, policy, memory, test_policy)
 
     def get_default_model(self):
         """Creates the default model.
@@ -269,7 +272,6 @@ class DQNLearner(LearnerBase):
         """
         model = Sequential()
         model.add(Flatten(input_shape=(1, ) + self.input_shape))
-        #model.add(Dense(64))
         model.add(Dense(64))
         model.add(Activation('relu'))
         model.add(Dense(64))
@@ -283,7 +285,10 @@ class DQNLearner(LearnerBase):
         return model
 
     def get_default_policy(self):
-        return MaxBoltzmannQPolicy(eps=0.3)
+        return RestrictedEpsGreedyQPolicy(0.3)
+
+    def get_default_test_policy(self):
+        return RestrictedGreedyQPolicy()
 
     def get_default_memory(self):
         """Creates the default memory model.
@@ -294,7 +299,7 @@ class DQNLearner(LearnerBase):
             limit=self.mem_size, window_length=self.mem_window_length)
         return memory
 
-    def create_agent(self, model, policy, memory):
+    def create_agent(self, model, policy, memory, test_policy):
         """Creates a KerasRL DDPGAgent with given components.
 
         Args:
@@ -413,6 +418,86 @@ class DQNLearner(LearnerBase):
 
         return relevant
 
+class RestrictedEpsGreedyQPolicy(EpsGreedyQPolicy):
+    """Restricted epsilon-greedy policy.
+
+    Behaves like EpsGreedyQPolicy, except that it never selects an action
+    whose Q-value is -np.inf (such actions are treated as unavailable).
+
+    """
+
+    def __init__(self, eps=.1):
+        super(RestrictedEpsGreedyQPolicy, self).__init__(eps)
+
+    def select_action(self, q_values):
+        """Return the selected action
+
+        # Arguments
+            q_values (np.ndarray): List of the estimations of Q for each action
+
+        # Returns
+            The selected action
+        """
+        assert q_values.ndim == 1
+        nb_actions = q_values.shape[0]
+        index = list()
+
+        for i in range(0, nb_actions):
+            if q_values[i] != -np.inf:
+                index.append(i)
+
+        # Every q_value may be -np.inf: keras-rl's fit() and test() still call forward()
+        # on the learner at the terminal stage, which ends up calling this function.
+        # In that case, we fall back to choosing an action uniformly at random.
+        if len(index) < 1:
+            action = np.random.random_integers(0, nb_actions - 1)
+
+        elif np.random.uniform() <= self.eps:
+            action = index[np.random.random_integers(0, len(index) - 1)]
+
+        else:
+            action = np.argmax(q_values)
+
+        return action
+
+
+class RestrictedGreedyQPolicy(GreedyQPolicy):
+    """Restricted greedy policy.
+
+    Behaves like GreedyQPolicy, except that it never selects an action
+    whose Q-value is -np.inf (such actions are treated as unavailable).
+
+    """
+
+    def select_action(self, q_values):
+        """Return the selected action
+
+        # Arguments
+            q_values (np.ndarray): List of the estimations of Q for each action
+
+        # Returns
+            The selected action
+        """
+        assert q_values.ndim == 1
+        nb_actions = q_values.shape[0]
+        restricted_q_values = list()
+
+        for i in range(0, nb_actions):
+            if q_values[i] != -np.inf:
+                restricted_q_values.append(q_values[i])
+
+        # Every q_value may be -np.inf: keras-rl's fit() and test() still call forward()
+        # on the learner at the terminal stage, which ends up calling this function.
+        # In that case, we fall back to choosing an action uniformly at random.
+        if len(restricted_q_values) < 1:
+            action = np.random.random_integers(0, nb_actions - 1)
+
+        else:
+            action = np.argmax(q_values)  # -np.inf cannot win once any finite value exists
+
+        return action
+
+
 class DQNAgentOverOptions(DQNAgent):
     def __init__(self,
                  model,
diff --git a/env/env_base.py b/env/env_base.py
index 3786c344c7531703c8a9239af7b6155b41f65583..4d28722233a77cfccd897e3d4d1ed7d6bb0f2c7d 100644
--- a/env/env_base.py
+++ b/env/env_base.py
@@ -25,7 +25,8 @@ class EpisodicEnvBase(GymCompliantEnvBase):
 
     # three types possible ('min', 'max', or 'sum');
     # See _reward_superposition below.
-    terminal_reward_type = 'max'
+    # TODO: consider the case where every terminal reward is None; give this class a default terminal value (not None) and use it in that case.
+    terminal_reward_type = 'min'
 
     #: If true, the maneuver terminates when the goal has been achieved.
     _terminate_in_goal = False
@@ -140,13 +141,11 @@ class EpisodicEnvBase(GymCompliantEnvBase):
 
     def _reset_model_checker(self, AP):
 
-        self.__mc_AP = int(AP)
-
         if self._LTL_preconditions_enable:
             for LTL_precondition in self._LTL_preconditions:
                 LTL_precondition.reset_property()
-                if LTL_precondition.enabled:
-                    LTL_precondition.check_incremental(self.__mc_AP)
+
+        self._incremental_model_checking(AP)
 
     def _set_mc_AP(self, AP):
         self.__mc_AP = int(AP)
diff --git a/env/simple_intersection/simple_intersection_env.py b/env/simple_intersection/simple_intersection_env.py
index 3855aea8e699fd521c89b8b4b51039572307e970..32f50efb43882ac34e5e35ac5245e50be649003e 100644
--- a/env/simple_intersection/simple_intersection_env.py
+++ b/env/simple_intersection/simple_intersection_env.py
@@ -271,7 +271,7 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase):
         # stopped_car_scenario = bool(np.random.randint(0, 1))  TODO: this scenario may not work
         n_others_stopped_in_stop_region = np.random.randint(
             0, min(3, n_others - stopped_car_scenario))
-        veh_ahead_scenario = bool(np.random.randint(0, 1))
+        veh_ahead_scenario = bool(np.random.randint(0, 1)) or veh_ahead_scenario
 
         if n_others_stopped_in_stop_region > min(
                 n_others - stopped_car_scenario, 3):
diff --git a/options/simple_intersection/maneuver_base.py b/options/simple_intersection/maneuver_base.py
index 275e654c3c79590f3985da2f7afaadb458bf5906..fbbc3ed19dac363c5966a2283f7988c42246fe74 100644
--- a/options/simple_intersection/maneuver_base.py
+++ b/options/simple_intersection/maneuver_base.py
@@ -332,8 +332,7 @@ class ManeuverBase(EpisodicEnvBase):
         Returns True if the condition is satisfied, and False otherwise.
         """
 
-        return not (self.env.termination_condition or self.violation_happened) and \
-            self.extra_initiation_condition
+        return not self.termination_condition and self.extra_initiation_condition
 
     @property
     def extra_initiation_condition(self):
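
Note (illustrative, not part of the patch): the selection rule added in kerasrl_learner.py can be restated outside the keras-rl machinery as a small standalone sketch. The helper name restricted_select is hypothetical; with eps=0 it mirrors RestrictedGreedyQPolicy, and with eps>0 it mirrors RestrictedEpsGreedyQPolicy.

# Standalone sketch of the restricted action-selection rule (assumed helper name).
import numpy as np

def restricted_select(q_values, eps=0.0):
    q_values = np.asarray(q_values, dtype=float)
    valid = np.flatnonzero(q_values != -np.inf)  # indices of non-masked actions
    if valid.size == 0:
        # Every action is masked (all Q-values are -inf): fall back to a uniform choice.
        return int(np.random.randint(q_values.size))
    if np.random.uniform() <= eps:
        # Explore, but only among the non-masked actions.
        return int(np.random.choice(valid))
    # Greedy step: -inf entries can never be the argmax once a finite value exists.
    return int(np.argmax(q_values))

q = np.array([-np.inf, 0.2, -np.inf, 1.5])
print(restricted_select(q))           # greedy: always 3
print(restricted_select(q, eps=1.0))  # exploratory: 1 or 3, never a masked index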