Commit 125ba1e0 authored by Aravind Bk

version 1.0.0

parent f70e643c
WiseMove is a safe reinforcement learning framework that combines hierarchical reinforcement learning with model checking based on temporal logic constraints.
Requirements
============
* Python 3.6
* See `requirements.txt <../requirements.txt>`_ for the full package list.
Installation
============
* Run the install script: :code:`scripts/install.sh`
Documentation
=============
* Documentation is generated with *Sphinx*.
* If `install.sh` has been run, use :code:`scripts/generate_doc.sh launch` to view the documentation, or open `index.html <./documentation/index.html>`_ directly.
* If not, run :code:`scripts/generate_doc.sh build` to generate the documentation first.
Replicate Results
=================
These are the minimum steps required to replicate the results for the simple_intersection environment; a consolidated command sequence is given after this list. For a detailed user guide, please refer to the documentation.
* Run `scripts/install.sh`
* Low-level policies:
* To train all low-level policies from scratch: `python low_level_policy_main.py --train`
* To train a single low-level policy, for example wait: `python low_level_policy_main.py --option=wait --train`
* To test these trained low-level policies: `python low_level_policy_main.py --test --saved_policy_in_root`
* To test one of these trained low-level policies, for example wait: `python low_level_policy_main.py --option=wait --test --saved_policy_in_root`
* High-level policy:
* To train high-level policy from scratch using the given low-level policies: `python high_level_policy_main.py --train`
* To evaluate this trained high-level policy: `python high_level_policy_main.py --evaluate --saved_policy_in_root`
* To run MCTS using the high-level policy: `python mcts.py`
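As a quick reference, the full sequence of commands described above (training everything from scratch, then running MCTS) is::

    scripts/install.sh
    python low_level_policy_main.py --train
    python high_level_policy_main.py --train
    python mcts.py
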
Coding Standards
================
We follow the PEP 8 style guidelines for code and PEP 257 for docstrings.
It is not necessary to keep these in mind while coding, but before
submitting a pull request, run these two steps on each Python file you
have modified.
1. :code:`yapf -i YOUR_MODIFIED_FILE.py`
2. :code:`docformatter --in-place YOUR_MODIFIED_FILE.py`
:code:`yapf` formats the code and :code:`docformatter` formats the docstrings.
from .manual_policy import ManualPolicy
from .mcts_learner import MCTSLearner
from .rl_controller import RLController
from .kerasrl_learner import DDPGLearner, DQNLearner
from .online_mcts_controller import OnlineMCTSController
from .learner_base import LearnerBase
# TODO: make sure that the package for PPO2 is installed.
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.policies import MlpPolicy
import numpy as np
class PPO2Agent(LearnerBase):
def __init__(self,
input_shape,
nb_actions,
env,
policy=None,
tensorboard=False,
log_path="./logs",
**kwargs):
"""The constructor which sets the properties of the class.
Args:
input_shape: shape of the observation space, e.g. (10,);
nb_actions: number of values in action space;
env: env on which the agent learns
policy: stable_baselines Policy object. default is MlpPolicy
tensorboard: whether to integrate tensorboard or not
log_path: path for the tensorboard log directory (default "./logs")
**kwargs: other optional key-value arguments with defaults defined in property_defaults
"""
super(PPO2Agent, self).__init__(input_shape, nb_actions, **kwargs)
if policy is None:
policy = self.get_default_policy()
self.log_path = log_path
self.env = DummyVecEnv([lambda: env])  # PPO2 requires a vectorized environment for parallel training
self.agent_model = self.create_agent(policy, tensorboard)
def get_default_policy(self):
"""Creates the default policy.
Returns: stable_baselines Policy object. default is MlpPolicy
"""
return MlpPolicy
def create_agent(self, policy, tensorboard):
"""Creates a PPO agent
Returns:
stable_baselines PPO2 object
"""
if tensorboard:
return PPO2(policy, self.env, verbose=1, tensorboard_log=self.log_path)
else:
return PPO2(policy, self.env, verbose=1)
def fit(self,
env=None,
nb_steps=1000000,
visualize=False,
nb_max_episode_steps=200):
# The PPO2 callback is invoked once per episode (not per step), so the whole episode cannot be rendered from here.
# To render each step, add self.env.render() to the Runner class's run() method in stable_baselines' ppo2.py.
callback = self.__render_env_while_learning if visualize else None
self.agent_model.learn(total_timesteps=nb_steps, callback=callback)
@staticmethod
def __render_env_while_learning(_locals, _globals):
_locals['self'].env.render()
def save_weights(self, file_name="test_weights.h5f", overwrite=True):
self.agent_model.save(file_name)
def test_model(self,
env=None,
nb_episodes=50,
visualize=True,
nb_max_episode_steps=200):
episode_rewards = [0.0]
obs = self.env.reset()
current_episode = 1
current_step = 0
while current_episode <= nb_episodes:
# _states are only useful when using LSTM policies
action, _states = self.agent_model.predict(obs)
# here, action, rewards and dones are arrays
# because we are using vectorized env
obs, rewards, dones, info = self.env.step(action)
current_step += 1
if visualize:
self.env.render()
# Stats
episode_rewards[-1] += rewards[0]
if dones[0] or current_step > nb_max_episode_steps:
obs = self.env.reset()
print ("Episode ", current_episode, "reward: ", episode_rewards[-1])
episode_rewards.append(0.0)
current_episode += 1
current_step = 0
# Compute mean reward for the last 100 episodes
mean_100ep_reward = round(np.mean(episode_rewards[-100:]), 1)
print("Mean reward over last 100 episodes:", mean_100ep_reward)
def load_weights(self, file_name="test_weights.h5f"):
self.agent_model = PPO2.load(file_name)
def forward(self, observation):
return self.agent_model.predict(observation)
def set_environment(self, env):
self.env = DummyVecEnv([lambda: env])
self.agent_model.set_env(self.env)
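

# Usage sketch: a minimal example of driving PPO2Agent end to end. The gym
# environment, file name and step counts below are illustrative only; any
# gym-style environment with a discrete action space can be used the same way.
if __name__ == "__main__":
    import gym

    example_env = gym.make("CartPole-v1")
    agent = PPO2Agent(
        input_shape=example_env.observation_space.shape,
        nb_actions=example_env.action_space.n,
        env=example_env)
    agent.fit(nb_steps=10000)  # short training run, for illustration only
    agent.save_weights("ppo2_example_weights.h5f")
    agent.test_model(nb_episodes=5, visualize=False)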
from .policy_base import PolicyBase
class ControllerBase(PolicyBase):
"""Abstract class for controllers."""
def __init__(self, env, low_level_policies, start_node_alias):
self.env = env
self.low_level_policies = low_level_policies
# TODO: Move to an intermediate class so that the base class can be clean
self.current_node = self.low_level_policies[start_node_alias]
self.node_terminal_state_reached = False
self.controller_args_defaults = {}
def set_controller_args(self, **kwargs):
for (prop, default) in self.controller_args_defaults.items():
setattr(self, prop, kwargs.get(prop, default))
def can_transition(self):
"""Returns boolean signifying whether we can transition. To be
implemented in subclass.
"""
raise NotImplementedError(self.__class__.__name__ + ": can_transition is not implemented.")
def do_transition(self, observation):
"""Do a transition, assuming we can transition. To be
implemented in subclass.
Args:
observation: final observation from episodic step
"""
raise NotImplementedError(self.__class__.__name__ + ": do_transition is not implemented.")
def set_current_node(self, node_alias):
"""Sets the current node which is being executed
Args:
node_alias: alias of the node to be set
"""
raise NotImplementedError(self.__class__.__name__ + ": set_current_node is not implemented.")
# TODO: Looks generic. Move to an intermediate class/highlevel manager so that base class can be clean
def step_current_node(self, visualize_low_level_steps=False):
    """Execute the current node until its termination condition is reached.

    Returns the state at the end of node execution, the total reward,
    the episode termination flag, and info.
    """
total_reward = 0
self.node_terminal_state_reached = False
while not self.node_terminal_state_reached:
observation, reward, terminal, info = self.low_level_step_current_node()
if visualize_low_level_steps:
self.env.render()
total_reward += reward
total_reward += self.current_node.high_level_extra_reward
# TODO for info
return observation, total_reward, self.env.termination_condition, info
# TODO: Looks generic. Move to an intermediate class/highlevel manager so that base class can be clean
def low_level_step_current_node(self):
    """Execute one step of the current node.

    Sets the node_terminal_state_reached flag if the node termination
    condition has been reached. Returns the state after one step, the step
    reward, the episode termination flag, and info.
    """
u_ego = self.current_node.low_level_policy(self.current_node.get_reduced_features_tuple())
feature, R, terminal, info = self.current_node.step(u_ego)
self.node_terminal_state_reached = terminal
return self.env.get_features_tuple(), R, self.env.termination_condition, info
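

# Usage sketch: the kind of loop a ControllerBase subclass is meant to support.
# Purely illustrative; the controller, environment and option set come from the
# rest of the framework and are assumed to be initialised and reset already.
def _example_control_loop(controller, visualize=False):
    """Run a controller until the episode terminates and return the total reward."""
    terminal = False
    total_reward = 0
    while not terminal:
        observation, reward, terminal, info = controller.step_current_node(
            visualize_low_level_steps=visualize)
        total_reward += reward
        if not terminal and controller.can_transition():
            # ManualPolicy takes the last observation; other subclasses take no argument.
            controller.do_transition(observation)
    return total_reward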
from .policy_base import PolicyBase
import numpy as np
class LearnerBase(PolicyBase):
"""The abstract class from which each learning policy backend is defined
and inherited."""
def __init__(self, input_shape=(10, ), nb_actions=2, **kwargs):
"""The constructor which sets the properties of the class.
Args:
input_shape: shape of the observation space, e.g. (10,);
nb_actions: number of values in action space;
**kwargs: other optional key-value arguments with defaults defined in property_defaults
"""
self.input_shape = input_shape
self.nb_actions = nb_actions
property_defaults = {"lr": 0.001, "gamma": 0.99}
for (prop, default) in property_defaults.items():
setattr(self, prop, kwargs.get(prop, default))
def train(self,
env,
nb_steps=50000,
visualize=False,
nb_max_episode_steps=200):
"""Train the learning agent on the environment.
Args:
env: the environment instance. Should contain step() and reset() methods and optionally render()
nb_steps: the total number of steps to train
visualize: If True, visualizes the training. Works only if render() is present in env
nb_max_episode_steps: Maximum number of steps per episode
"""
return # do nothing unless specified in the subclass
def save_model(self, file_name, overwrite=True):
"""Save the weights of the agent. To be used after learning.
Args:
file_name: filename to be used when saving
overwrite: If True, overwrites existing file
"""
return # do nothing unless specified in the subclass
def load_model(self, file_name):
"""Load the weights of an agent.
Args:
file_name: filename to be used when loading
"""
return # do nothing unless specified in the subclass
def test_model(self,
env,
nb_episodes=5,
visualize=True,
nb_max_episode_steps=200):
"""Test the agent on the environment.
Args:
env: the environment instance. Should contain step(), reset() and optionally, render()
nb_episodes: Number of episodes to run
visualize: If True, visualizes the test. Works only if render() is present in env
nb_max_episode_steps: Maximum number of steps per episode
"""
return # do nothing unless specified in the subclass
def predict(self, observation):
"""Perform a forward pass and return next action by agent based on
current observation.
Args:
observation: the current observation. Shape should be same as self.input_shape
Returns: The action taken by agent depending on given observation
"""
return # do nothing unless specified in the subclass
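

# Subclassing sketch: a toy backend that fills in the LearnerBase hooks with a
# uniformly random policy. Purely illustrative; the real backends in this commit
# (e.g. the keras-rl learners and PPO2Agent) are the supported implementations.
class RandomLearner(LearnerBase):
    """Example learner that ignores observations and acts at random."""

    def train(self, env, nb_steps=50000, visualize=False,
              nb_max_episode_steps=200):
        pass  # a random policy has nothing to learn

    def predict(self, observation):
        # Pick a uniformly random discrete action regardless of the observation.
        return np.random.randint(self.nb_actions)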
from .controller_base import ControllerBase
class ManualPolicy(ControllerBase):
"""Manual policy execution using nodes and edges."""
def __init__(self, env, low_level_policies, transition_adj, start_node_alias):
"""Constructor for manual policy execution.
Args:
env: env instance
low_level_policies: low level policies dictionary
transition_adj: adjacency dictionary that defines the allowed transitions between nodes
start_node_alias: alias of the starting node
"""
super(ManualPolicy, self).__init__(env, low_level_policies, start_node_alias)
self.adj = transition_adj
def _transition(self):
"""Check if the current node's termination condition is met and if
it is possible to transition to another node, i.e. its initiation
condition is met. This is an internal function.
Returns the alias of the new node if a transition can happen, None otherwise.
"""
new_node = None
if self.low_level_policies[self.current_node].termination_condition:
for next_node in self.adj[self.current_node]:
if self.low_level_policies[next_node].initiation_condition:
new_node = next_node
break # change current_node to the highest priority next node
return new_node
def can_transition(self):
"""Check if we can transition.
Returns True if we can, False otherwise.
"""
return self._transition() is not None
def do_transition(self, observation):
"""Do a singular transition using the specified edges.
Args:
observation: final observation from episodic step (not used)
"""
new_node = self._transition()
if new_node is not None:
self.current_node = new_node
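

# Wiring sketch: one way a ManualPolicy could be constructed. The adjacency and
# aliases below are hypothetical examples (the aliases mirror the sample config
# elsewhere in this commit); `policies` must map each alias to a maneuver instance.
def _example_manual_policy(env, policies):
    adj = {"keeplane": ["changelane", "keeplane"],
           "changelane": ["keeplane"]}
    manual = ManualPolicy(env, policies, adj, "keeplane")
    if manual.can_transition():
        manual.do_transition(None)  # the observation argument is unused here
    return manual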
from .controller_base import ControllerBase
from .mcts_learner import MCTSLearner
import tqdm
import numpy as np
class OnlineMCTSController(ControllerBase):
"""Online MCTS"""
def __init__(self, env, low_level_policies, start_node_alias):
"""Constructor for manual policy execution.
Args:
env: env instance
low_level_policies: low level policies dictionary
"""
super(OnlineMCTSController, self).__init__(env, low_level_policies, start_node_alias)
self.curr_node_alias = start_node_alias
self.controller_args_defaults = {
"predictor": None,
"max_depth": 5, # MCTS depth
"nb_traversals": 30, # MCTS traversals before decision
}
def set_current_node(self, node_alias):
self.current_node = self.low_level_policies[node_alias]
self.curr_node_alias = node_alias
self.current_node.reset_maneuver()
self.env.set_ego_info_text(node_alias)
def change_low_level_references(self, env_copy):
# Change the environment reference in this controller and in all low-level policies to the given copy.
self.env = env_copy
for policy in self.low_level_policies.values():
policy.env = env_copy
def can_transition(self):
return not self.env.is_terminal()
def do_transition(self):
# Require a predictor function
if self.predictor is None:
raise Exception(self.__class__.__name__ + ": predictor is not set. Use set_controller_args().")
# Store the env at this point
orig_env = self.env
np.random.seed()
# Change low level references before init MCTSLearner instance
env_before_mcts = orig_env.copy()
self.change_low_level_references(env_before_mcts)
print('Current Node: %s' % self.curr_node_alias)
mcts = MCTSLearner(self.env, self.low_level_policies, self.curr_node_alias)
mcts.max_depth = self.max_depth
mcts.set_controller_args(predictor=self.predictor)
# Do nb_traversals number of traversals, reset env to this point every time
# print('Doing MCTS with params: max_depth = %d, nb_traversals = %d' % (self.max_depth, self.nb_traversals))
for num_epoch in range(self.nb_traversals): # tqdm.tqdm(range(self.nb_traversals)):
mcts.curr_node_num = 0
env_begin_epoch = env_before_mcts.copy()
self.change_low_level_references(env_begin_epoch)
init_obs = self.env.get_features_tuple()
v, all_ep_R = mcts.traverse(init_obs)
self.change_low_level_references(orig_env)
# Pick the best next node from the root node
mcts.curr_node_num = 0
print('%s' % mcts._to_discrete(self.env.get_features_tuple()))
node_after_transition = mcts.get_best_node(self.env.get_features_tuple(), use_ucb=False)
print('MCTS suggested next option: %s' % node_after_transition)
self.set_current_node(node_after_transition)
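

# Usage sketch: configuring and running the online MCTS controller. `predictor`
# is assumed to be the callable expected by MCTSLearner; the start alias and the
# depth/traversal values simply restate the constructor defaults and sample config.
def _example_online_mcts(env, policies, predictor):
    controller = OnlineMCTSController(env, policies, "keeplane")
    controller.set_controller_args(predictor=predictor, max_depth=5,
                                   nb_traversals=30)
    while controller.can_transition():
        controller.do_transition()      # plan the next option with MCTS
        controller.step_current_node()  # execute that option until it terminates
    return controller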
class PolicyBase:
"""Abstract policy base from which every policy backend is defined
and inherited."""
from .controller_base import ControllerBase
class RLController(ControllerBase):
"""RL controller using a trained policy."""
def __init__(self, env, low_level_policies, start_node_alias):
"""Constructor for manual policy execution.
Args:
env: env instance
low_level_policies: low level policies dictionary
"""
super(RLController, self).__init__(env, low_level_policies, start_node_alias)
self.low_level_policy_aliases = list(self.low_level_policies.keys())
self.trained_policy = None
self.node_terminal_state_reached = False
# TODO: move this to controller_base?
def set_current_node(self, node_alias):
self.current_node = self.low_level_policies[node_alias]
self.current_node.reset_maneuver()
self.env.set_ego_info_text(node_alias)
def set_trained_policy(self, policy):
self.trained_policy = policy
def can_transition(self):
return self.node_terminal_state_reached
def do_transition(self):
if self.trained_policy is None:
raise Exception(self.__class__.__name__ + ": trained_policy is not set. Use set_trained_policy().")
node_index_after_transition = self.trained_policy(self.env.get_features_tuple())
self.set_current_node(self.low_level_policy_aliases[node_index_after_transition])
self.node_terminal_state_reached = False
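

# Usage sketch: driving RLController with a trained high-level policy.
# `trained_policy` is assumed to be a callable mapping a features tuple to the
# index of the next option; the start alias "keeplane" comes from the sample config.
def _example_rl_controller(env, policies, trained_policy):
    controller = RLController(env, policies, "keeplane")
    controller.set_trained_policy(trained_policy)
    observation, reward, terminal, info = controller.step_current_node()
    if controller.can_transition():
        controller.do_transition()  # pick the next option via the trained policy
    return controller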
{
"nodes": {
"wait": "Wait",
"follow": "Follow",
"stop": "Stop",
"changelane": "ChangeLane",
"keeplane": "KeepLane"
},
"edges": {
"keeplane": "keeplane"
},
"start_node": "keeplane",
"method": "rl"
}
sphinx/.build/index.html
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: aa9b388d303cad18faf5c75ada457b3d
tags: 645f666f9bcd5a90fca523b33c5a78b7
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>high_level_policy_main module &#8212; WiseMove documentation</title>
<link rel="stylesheet" href="../_static/haiku.css" type="text/css" />
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="../_static/css/fonts.css" type="text/css" />
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script type="text/javascript" src="../_static/jquery.js"></script>
<script type="text/javascript" src="../_static/underscore.js"></script>
<script type="text/javascript" src="../_static/doctools.js"></script>
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
</head><body>
<div class="header" role="banner"><h1 class="heading"><a href="../index.html">
<span>WiseMove documentation</span></a></h1>
<h2 class="heading"><span>high_level_policy_main module</span></h2>
</div>
<div class="topnav" role="navigation" aria-label="top navigation">
<p>
<a class="uplink" href="../index.html">Contents</a>
</p>
</div>
<div class="content">
<div class="section" id="module-high_level_policy_main">
<span id="high-level-policy-main-module"></span><h1>high_level_policy_main module<a class="headerlink" href="#module-high_level_policy_main" title="Permalink to this headline"></a></h1>
<dl class="function">
<dt id="high_level_policy_main.evaluate_high_level_policy">
<code class="descclassname">high_level_policy_main.</code><code class="descname">evaluate_high_level_policy</code><span class="sig-paren">(</span><em>nb_episodes_for_test=100</em>, <em>nb_trials=10</em>, <em>trained_agent_file='highlevel_weights.h5f'</em>, <em>pretrained=False</em>, <em>visualize=False</em><span class="sig-paren">)</span><a class="headerlink" href="#high_level_policy_main.evaluate_high_level_policy" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="function">
<dt id="high_level_policy_main.find_good_high_level_policy">
<code class="descclassname">high_level_policy_main.</code><code class="descname">find_good_high_level_policy</code><span class="sig-paren">(</span><em>nb_steps=25000</em>, <em>load_weights=False</em>, <em>nb_episodes_for_test=100</em>, <em>visualize=False</em>, <em>tensorboard=False</em>, <em>save_path='./highlevel_weights.h5f'</em><span class="sig-paren">)</span><a class="headerlink" href="#high_level_policy_main.find_good_high_level_policy" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="function">
<dt id="high_level_policy_main.high_level_policy_testing">
<code class="descclassname">high_level_policy_main.</code><code class="descname">high_level_policy_testing</code><span class="sig-paren">(</span><em>nb_episodes_for_test=100</em>, <em>trained_agent_file='highlevel_weights.h5f'</em>, <em>pretrained=False</em>, <em>visualize=True</em><span class="sig-paren">)</span><a class="headerlink" href="#high_level_policy_main.high_level_policy_testing" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="function">
<dt id="high_level_policy_main.high_level_policy_training">
<code class="descclassname">high_level_policy_main.</code><code class="descname">high_level_policy_training</code><span class="sig-paren">(</span><em>nb_steps=25000</em>, <em>load_weights=False</em>, <em>training=True</em>, <em>testing=True</em>, <em>nb_episodes_for_test=10</em>, <em>max_nb_steps=100</em>, <em>visualize=False</em>, <em>tensorboard=False</em>, <em>save_path='highlevel_weights.h5f'</em><span class="sig-paren">)</span><a class="headerlink" href="#high_level_policy_main.high_level_policy_training" title="Permalink to this definition"></a></dt>
<dd><p>Do RL of the high-level policy and test it.
:param nb_steps: the number of steps to perform RL
:param load_weights: True if the pre-learned NN weights are loaded (for initializations of NNs)
:param training: True to enable training