Commit 125ba1e0 authored by Aravind Bk

version 1.0.0

parent f70e643c
We need some helpful information here
WiseMove is a safe reinforcement learning framework that combines hierarchical reinforcement learning with model checking using temporal logic constraints.
Requirements
============
* Python 3.6
* See `requirements.txt <../requirements.txt>`_ for the full package list.
Installation
============
* Run the install script: :code:`scripts/install.sh`
Documentation
=============
* Documentation is generated with *Sphinx*.
* If `install.sh` was run, use :code:`scripts/generate_doc.sh launch` to view the documentation, or open `index.html <./documentation/index.html>`_.
* If not, run :code:`scripts/generate_doc.sh build` to generate the documentation first.
Replicate Results
=================
These are the minimum steps required to replicate the results for the simple_intersection environment. For a detailed user guide, refer to the documentation.
* Run `scripts/install.sh`
* Low-level policies:
* To train all low-level policies from scratch: `python low_level_policy_main.py --train`
  * To train a single low-level policy, for example wait: `python low_level_policy_main.py --option=wait --train`
* To test these trained low-level policies: `python low_level_policy_main.py --test --saved_policy_in_root`
* To test one of these trained low-level policies, for example wait: `python low_level_policy_main.py --option=wait --test --saved_policy_in_root`
* High-level policy:
  * To train the high-level policy from scratch using the given low-level policies: `python high_level_policy_main.py --train`
* To evaluate this trained high-level policy: `python high_level_policy_main.py --evaluate --saved_policy_in_root`
* To run MCTS using the high-level policy: `python mcts.py`
Coding Standards
================
We follow PEP8 style guidelines for code and PEP257 conventions for docstrings.
You do not need to keep these in mind while coding, but before submitting a
pull request, run these two steps on each Python file you have modified:
1. :code:`yapf -i YOUR_MODIFIED_FILE.py`
2. :code:`docformatter --in-place YOUR_MODIFIED_FILE.py`
:code:`yapf` formats the code and :code:`docformatter` formats the docstrings.
from .manual_policy import ManualPolicy
from .mcts_learner import MCTSLearner
from .rl_controller import RLController
from .kerasrl_learner import DDPGLearner, DQNLearner
from .online_mcts_controller import OnlineMCTSController
from .learner_base import LearnerBase
# TODO: make sure that the package for PPO2 is installed.
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.policies import MlpPolicy
import numpy as np
class PPO2Agent(LearnerBase):
def __init__(self,
input_shape,
nb_actions,
env,
policy=None,
tensorboard=False,
log_path="./logs",
**kwargs):
"""The constructor which sets the properties of the class.
Args:
            input_shape: Shape of the observation space, e.g. (10,)
            nb_actions: number of values in the action space
            env: environment on which the agent learns
            policy: stable_baselines Policy class. Default is MlpPolicy
            tensorboard: whether to integrate tensorboard or not
            log_path: path to the directory for tensorboard logs. Default is "./logs"
            **kwargs: other optional keyword arguments forwarded to LearnerBase
"""
super(PPO2Agent, self).__init__(input_shape, nb_actions, **kwargs)
if policy is None:
policy = self.get_default_policy()
self.log_path = log_path
        self.env = DummyVecEnv([lambda: env])  # PPO2 requires a vectorized environment for parallel training
self.agent_model = self.create_agent(policy, tensorboard)
def get_default_policy(self):
"""Creates the default policy.
        Returns: stable_baselines Policy class. Default is MlpPolicy
"""
return MlpPolicy
    def create_agent(self, policy, tensorboard):
        """Creates a PPO2 agent.
Returns:
stable_baselines PPO2 object
"""
if tensorboard:
return PPO2(policy, self.env, verbose=1, tensorboard_log=self.log_path)
else:
return PPO2(policy, self.env, verbose=1)
def fit(self,
env=None,
nb_steps=1000000,
visualize=False,
nb_max_episode_steps=200):
        # The PPO2 callback is only invoked once per episode (not per step), so the whole episode cannot be rendered here.
        # To render every step, add self.env.render() inside the Runner.run() method in stable_baselines' ppo2.py.
callback = self.__render_env_while_learning if visualize else None
self.agent_model.learn(total_timesteps=nb_steps, callback=callback)
@staticmethod
def __render_env_while_learning(_locals, _globals):
_locals['self'].env.render()
def save_weights(self, file_name="test_weights.h5f", overwrite=True):
self.agent_model.save(file_name)
def test_model(self,
env=None,
nb_episodes=50,
visualize=True,
nb_max_episode_steps=200):
episode_rewards = [0.0]
obs = self.env.reset()
current_episode = 1
current_step = 0
while current_episode <= nb_episodes:
# _states are only useful when using LSTM policies
action, _states = self.agent_model.predict(obs)
# here, action, rewards and dones are arrays
# because we are using vectorized env
obs, rewards, dones, info = self.env.step(action)
current_step += 1
if visualize:
self.env.render()
# Stats
episode_rewards[-1] += rewards[0]
if dones[0] or current_step > nb_max_episode_steps:
obs = self.env.reset()
                print("Episode", current_episode, "reward:", episode_rewards[-1])
episode_rewards.append(0.0)
current_episode += 1
current_step = 0
# Compute mean reward for the last 100 episodes
mean_100ep_reward = round(np.mean(episode_rewards[-100:]), 1)
print("Mean reward over last 100 episodes:", mean_100ep_reward)
def load_weights(self, file_name="test_weights.h5f"):
self.agent_model = PPO2.load(file_name)
def forward(self, observation):
return self.agent_model.predict(observation)
def set_environment(self, env):
self.env = DummyVecEnv([lambda: env])
self.agent_model.set_env(self.env)
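# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the WiseMove API). It shows how
# PPO2Agent might be driven end to end, assuming LearnerBase supplies sensible
# defaults for any remaining hyperparameters. The CartPole environment, step
# counts and file name below are stand-in assumptions, not the WiseMove setup.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import gym

    # Any Gym-compatible environment works here; CartPole-v1 is used purely
    # as a placeholder for the WiseMove environments.
    demo_env = gym.make("CartPole-v1")
    demo_agent = PPO2Agent(
        input_shape=demo_env.observation_space.shape,
        nb_actions=demo_env.action_space.n,
        env=demo_env)
    demo_agent.fit(nb_steps=10000)                # train with PPO2
    demo_agent.save_weights("ppo2_demo_weights")  # PPO2.save() writes the model archive
    demo_agent.test_model(nb_episodes=5, visualize=False)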
from .policy_base import PolicyBase
class ControllerBase(PolicyBase):
"""Abstract class for controllers."""
def __init__(self, env, low_level_policies, start_node_alias):
self.env = env
self.low_level_policies = low_level_policies
# TODO: Move an intermediate class so that base class can be clean
self.current_node = self.low_level_policies[start_node_alias]
self.node_terminal_state_reached = False
self.controller_args_defaults = {}
def set_controller_args(self, **kwargs):
for (prop, default) in self.controller_args_defaults.items():
setattr(self, prop, kwargs.get(prop, default))
def can_transition(self):
"""Returns boolean signifying whether we can transition. To be
implemented in subclass.
"""
        raise NotImplementedError(self.__class__.__name__ +
                                  ": can_transition is not implemented.")
def do_transition(self, observation):
"""Do a transition, assuming we can transition. To be
implemented in subclass.
Args:
observation: final observation from episodic step
"""
        raise NotImplementedError(self.__class__.__name__ +
                                  ": do_transition is not implemented.")
def set_current_node(self, node_alias):
"""Sets the current node which is being executed
Args:
            node_alias: alias of the node to be set as the current node
        """
        raise NotImplementedError(self.__class__.__name__ +
                                  ": set_current_node is not implemented.")
# TODO: Looks generic. Move to an intermediate class/highlevel manager so that base class can be clean
    def step_current_node(self, visualize_low_level_steps=False):
        """Executes the current node until the node termination condition is reached.

        Returns:
            state at the end of node execution, total reward, episode_termination_flag, info
        """
total_reward = 0
self.node_terminal_state_reached = False
while not self.node_terminal_state_reached:
observation, reward, terminal, info = self.low_level_step_current_node()
if visualize_low_level_steps:
self.env.render()
total_reward += reward
total_reward += self.current_node.high_level_extra_reward
# TODO for info
return observation, total_reward, self.env.termination_condition, info
# TODO: Looks generic. Move to an intermediate class/highlevel manager so that base class can be clean
    def low_level_step_current_node(self):
        """Executes one step of the current node.

        Sets the node_terminal_state_reached flag if the node termination condition has been reached.

        Returns:
            state after one step, step reward, episode_termination_flag, info
        """
u_ego = self.current_node.low_level_policy(self.current_node.get_reduced_features_tuple())
feature, R, terminal, info = self.current_node.step(u_ego)
self.node_terminal_state_reached = terminal
return self.env.get_features_tuple(), R, self.env.termination_condition, info
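# ---------------------------------------------------------------------------
# Illustrative sketch (not one of the WiseMove controllers): a minimal
# ControllerBase subclass showing how the abstract hooks can be filled in.
# The transition rule below is a toy assumption for demonstration only.
# ---------------------------------------------------------------------------
class AlwaysStayController(ControllerBase):
    """Toy controller that never leaves its starting node."""

    def can_transition(self):
        # A real controller would consult a learned high-level policy or the
        # environment state here; this toy version never transitions.
        return False

    def do_transition(self, observation):
        # Nothing to do, since can_transition() always returns False.
        pass

    def set_current_node(self, node_alias):
        self.current_node = self.low_level_policies[node_alias]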
from .learner_base import LearnerBase
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, Concatenate
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
from rl.agents import DDPGAgent, DQNAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess
from rl.policy import BoltzmannQPolicy, MaxBoltzmannQPolicy
from rl.callbacks import ModelIntervalCheckpoint
import numpy as np
class DDPGLearner(LearnerBase):
def __init__(self,
input_shape=(48, ),
nb_actions=2,
actor=None,
critic=None,
critic_action_input=None,
memory=None,
random_process=None,
**kwargs):
"""The constructor which sets the properties of the class.
Args:
            input_shape: Shape of the observation space, e.g. (10,)
            nb_actions: number of values in the action space
actor: Keras Model of actor which takes observation as input and outputs actions. Uses default if not given
critic: Keras Model of critic which takes concatenation of observation and action and outputs a single
value. Uses default if not given
critic_action_input: Keras Input which was used in creating action input of the critic model.
Uses default critic and action_input if not specified
memory: KerasRL Memory. Uses default SequentialMemory if not given
random_process: KerasRL random process. Uses default OrnsteinUhlenbeckProcess if not given
**kwargs: other optional key-value arguments with defaults defined in property_defaults
"""
super(DDPGLearner, self).__init__(input_shape, nb_actions, **kwargs)
property_defaults = {
"mem_size": 100000, # size of memory
"mem_window_length": 1, # window length of memory
"oup_theta": 0.15, # OrnsteinUhlenbeckProcess theta
"oup_mu": 0, # OrnsteinUhlenbeckProcess mu
"oup_sigma": 1, # OrnsteinUhlenbeckProcess sigma
"oup_sigma_min": 0.5, # OrnsteinUhlenbeckProcess sigma min
"oup_annealing_steps": 500000, # OrnsteinUhlenbeckProcess n-step annealing
"nb_steps_warmup_critic": 100, # steps for critic to warmup
"nb_steps_warmup_actor": 100, # steps for actor to warmup
"target_model_update": 1e-3 # target model update frequency
}
for (prop, default) in property_defaults.items():
setattr(self, prop, kwargs.get(prop, default))
if actor is None:
actor = self.get_default_actor_model()
if critic is None or critic_action_input is None:
critic, critic_action_input = self.get_default_critic_model()
if memory is None:
memory = self.get_default_memory()
if random_process is None:
random_process = self.get_default_randomprocess()
#TODO: Add output scaling
self.agent_model = self.create_agent(
actor, critic, critic_action_input, memory, random_process)
def get_default_actor_model(self):
"""Creates the default actor model.
Returns: Keras Model object of actor
"""
actor = Sequential()
actor.add(Flatten(input_shape=(1, ) + self.input_shape))
actor.add(Dense(64, use_bias=False))
actor.add(Activation('relu'))
actor.add(Dense(64, use_bias=False))
actor.add(Activation('relu'))
actor.add(Dense(self.nb_actions, use_bias=True))
actor.add(Activation('tanh'))
# print(actor.summary())
return actor
def get_default_critic_model(self):
"""Creates the default critic model.
Returns: Keras Model object of critic
"""
action_input = Input(shape=(self.nb_actions, ), name='action_input')
observation_input = Input(
shape=(1, ) + self.input_shape, name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
x = Dense(64, use_bias=False)(x)
x = Activation('relu')(x)
x = Dense(64, use_bias=False)(x)
x = Activation('relu')(x)
x = Dense(64, use_bias=False)(x)
x = Activation('relu')(x)
x = Dense(1, use_bias=True)(x)
#x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
# print(critic.summary())
return critic, action_input
def get_default_randomprocess(self):
"""Creates the default random process model.
Returns: KerasRL OrnsteinUhlenbeckProcess object
"""
random_process = OrnsteinUhlenbeckProcess(
size=self.nb_actions,
theta=self.oup_theta,
mu=self.oup_mu,
sigma=self.oup_sigma,
sigma_min=self.oup_sigma_min,
n_steps_annealing=self.oup_annealing_steps)
return random_process
def get_default_memory(self):
"""Creates the default memory model.
Returns: KerasRL SequentialMemory object
"""
memory = SequentialMemory(
limit=self.mem_size, window_length=self.mem_window_length)
return memory
def create_agent(self, actor, critic, critic_action_input, memory,
random_process):
"""Creates a KerasRL DDPGAgent with given components.
Args:
actor: Keras Model of actor which takes observation as input and outputs actions.
critic: Keras Model of critic that takes concatenation of observation and action and outputs a single value.
critic_action_input: Keras Input which was used in creating action input of the critic model.
memory: KerasRL Memory.
random_process: KerasRL random process.
Returns:
KerasRL DDPGAgent object
"""
agent = DDPGAgent(
nb_actions=self.nb_actions,
actor=actor,
critic=critic,
critic_action_input=critic_action_input,
memory=memory,
nb_steps_warmup_critic=self.nb_steps_warmup_critic,
nb_steps_warmup_actor=self.nb_steps_warmup_actor,
random_process=random_process,
gamma=self.gamma,
            target_model_update=self.target_model_update)
# TODO: give params like lr_actor and lr_critic to set different lr of Actor and Critic.
agent.compile([Adam(lr=self.lr*1e-2, clipnorm=1.), Adam(lr=self.lr, clipnorm=1.)], metrics=['mae'])
return agent
def train(self,
env,
nb_steps=1000000,
visualize=False,
verbose=1,
log_interval=10000,
nb_max_episode_steps=200,
model_checkpoints=False,
checkpoint_interval=100000,
tensorboard=False):
callbacks = []
if model_checkpoints:
callbacks += [ModelIntervalCheckpoint('./checkpoints/checkpoint_weights.h5f', interval=checkpoint_interval)]
if tensorboard:
callbacks += [TensorBoard(log_dir='./logs')]
self.agent_model.fit(
env,
nb_steps=nb_steps,
visualize=visualize,
verbose=verbose,
log_interval=log_interval,
nb_max_episode_steps=nb_max_episode_steps,
callbacks=callbacks)
def save_model(self, file_name="test_weights.h5f", overwrite=True):
        self.agent_model.save_weights(file_name, overwrite=overwrite)
def test_model(self,
env,
nb_episodes=50,
visualize=True,
nb_max_episode_steps=200):
self.agent_model.test(
env,
nb_episodes=nb_episodes,
visualize=visualize,
nb_max_episode_steps=nb_max_episode_steps)
def load_model(self, file_name="test_weights.h5f"):
self.agent_model.load_weights(file_name)
def predict(self, observation):
return self.agent_model.forward(observation)
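# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the package API): training and
# evaluating a DDPGLearner on a continuous-control Gym environment, assuming
# LearnerBase provides default values for lr and gamma. The environment name,
# step counts and file name are stand-in assumptions.
# ---------------------------------------------------------------------------
def _ddpg_usage_sketch():
    import gym

    # Pendulum-v0 stands in for a continuous-action environment; note that the
    # default actor ends in tanh, so output scaling is still a TODO above.
    demo_env = gym.make("Pendulum-v0")
    learner = DDPGLearner(
        input_shape=demo_env.observation_space.shape,
        nb_actions=demo_env.action_space.shape[0])
    learner.train(demo_env, nb_steps=50000, nb_max_episode_steps=200)
    learner.save_model("ddpg_demo_weights.h5f")
    learner.test_model(demo_env, nb_episodes=5, visualize=False)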
class DQNLearner(LearnerBase):
def __init__(self,
input_shape=(48, ),
nb_actions=5,
low_level_policies=None,
model=None,
policy=None,
memory=None,
**kwargs):
"""The constructor which sets the properties of the class.
Args:
            input_shape: Shape of the observation space, e.g. (10,)
            nb_actions: number of values in the action space
            low_level_policies: dict of available low-level policies, keyed by option alias
            model: Keras Model that takes the observation as input and outputs Q-values for the discrete actions. Uses default if not given
            policy: KerasRL Policy. Uses default MaxBoltzmannQPolicy if not given
            memory: KerasRL Memory. Uses default SequentialMemory if not given
**kwargs: other optional key-value arguments with defaults defined in property_defaults
"""
super(DQNLearner, self).__init__(input_shape, nb_actions, **kwargs)
property_defaults = {
"mem_size": 100000, # size of memory
"mem_window_length": 1, # window length of memory
"target_model_update": 1e-3, # target model update frequency
"nb_steps_warmup": 100, # steps for model to warmup
}
for (prop, default) in property_defaults.items():
setattr(self, prop, kwargs.get(prop, default))
if model is None:
model = self.get_default_model()
if policy is None:
policy = self.get_default_policy()
if memory is None:
memory = self.get_default_memory()
self.low_level_policies = low_level_policies
self.agent_model = self.create_agent(model, policy, memory)
def get_default_model(self):
"""Creates the default model.
Returns: Keras Model object of actor
"""
model = Sequential()
model.add(Flatten(input_shape=(1, ) + self.input_shape))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dense(self.nb_actions))
model.add(Activation('linear'))
# print(model.summary())
return model
    def get_default_policy(self):
        """Creates the default policy.

        Returns: KerasRL MaxBoltzmannQPolicy object with eps=0.3
        """
        return MaxBoltzmannQPolicy(eps=0.3)
def get_default_memory(self):
"""Creates the default memory model.
Returns: KerasRL SequentialMemory object
"""
memory = SequentialMemory(
limit=self.mem_size, window_length=self.mem_window_length)
return memory
    def create_agent(self, model, policy, memory):
        """Creates a KerasRL DQNAgent (here, a DQNAgentOverOptions) with the given components.

        Args:
            model: Keras Model that takes the observation as input and outputs Q-values for the discrete actions.
            policy: KerasRL Policy.
            memory: KerasRL Memory.
Returns:
KerasRL DQN object
"""
agent = DQNAgentOverOptions(model=model, low_level_policies=self.low_level_policies,
nb_actions=self.nb_actions, memory=memory,
nb_steps_warmup=self.nb_steps_warmup, target_model_update=self.target_model_update,
policy=policy, enable_dueling_network=True)
agent.compile(Adam(lr=self.lr), metrics=['mae'])
return agent
def train(self,
env,
nb_steps=1000000,
visualize=False,
nb_max_episode_steps=200,
tensorboard=False,
model_checkpoints=False,
checkpoint_interval=10000):
callbacks = []
if model_checkpoints:
callbacks += [ModelIntervalCheckpoint('./checkpoints/checkpoint_weights.h5f', interval=checkpoint_interval)]
if tensorboard:
callbacks += [TensorBoard(log_dir='./logs')]
self.agent_model.fit(
env,
nb_steps=nb_steps,
visualize=visualize,
verbose=1,
nb_max_episode_steps=nb_max_episode_steps,
callbacks=callbacks)
def save_model(self, file_name="test_weights.h5f", overwrite=True):
        self.agent_model.save_weights(file_name, overwrite=overwrite)
# TODO: very environment specific. Make it general
def test_model(self,
env,
nb_episodes=5,
visualize=True,
nb_max_episode_steps=400,
                   success_reward_threshold=100):
print("Testing for {} episodes".format(nb_episodes))
success_count = 0
termination_reason_counter = {}
for n in range(nb_episodes):
env.reset()
terminal = False
step = 0
episode_reward = 0
while not terminal and step <= nb_max_episode_steps:
if visualize:
env.render()
features, R, terminal, info = env.execute_controller_policy()