diff --git a/README.txt b/README.md
similarity index 55%
rename from README.txt
rename to README.md
index a82c36075ccc6d2b00396adc9422f44f15799aba..8afb6f2360eb26016fda4170a535895557272014 100644
--- a/README.txt
+++ b/README.md
@@ -29,14 +29,24 @@ These are the minimum steps required to replicate the results for simple_interse
 
 * Run `./scripts/install_dependencies.sh` to install python dependencies.
 * Low-level policies:
-  * To train all low-level policies from scratch: `python3 low_level_policy_main.py --train`
-  * To train a single low-level, for example wait: `python3 low_level_policy_main.py --option=wait --train`
-  * To test these trained low-level policies: `python3 low_level_policy_main.py --test --saved_policy_in_root`
-  * To test one of these trained low-level policies, for example wait: `python3 low_level_policy_main.py --option=wait --test --saved_policy_in_root`
+  * You can choose to train and test all the maneuvers, but this may take some time and is not recommended.
+    * To train all low-level policies from scratch: `python3 low_level_policy_main.py --train`. This may take some time.
+    * To test all these trained low-level policies: `python3 low_level_policy_main.py --test --saved_policy_in_root`.
+    * Make sure the training is fully complete before running the above test.
+  * It is easier to verify a few of the maneuvers using the commands below:
+    * To train a single low-level policy, for example wait: `python3 low_level_policy_main.py --option=wait --train`.
+    * To test one of these trained low-level policies, for example wait: `python3 low_level_policy_main.py --option=wait --test --saved_policy_in_root`
+    * Available maneuvers are: wait, changelane, stop, keeplane, follow
+  * These results are evaluated visually.
+  * Note: this training has a high-variance issue due to the continuous action space, especially for the stop and keeplane maneuvers. It may help to train for 0.2 million steps rather than the default 0.1 million by adding the argument `--nb_steps=200000` while training.
 * High-level policy:
   * To train high-level policy from scratch using the given low-level policies: `python3 high_level_policy_main.py --train`
-  * To evaluate this trained high-level policy: `python3 high_level_policy_main.py --evaluate --saved_policy_in_root`
-* To run MCTS using the high-level policy: `python3 mcts.py`
+  * To evaluate this trained high-level policy: `python3 high_level_policy_main.py --evaluate --saved_policy_in_root`.
+    * The success average and standard deviation correspond to the results from the high-level policy experiments.
+* To run MCTS using the high-level policy:
+  * To obtain a probabilities tree and save it: `python3 mcts.py --train`
+  * To evaluate using this saved tree: `python3 mcts.py --evaluate --saved_policy_in_root`.
+    * The success average and standard deviation correspond to the results from the MCTS experiments.
 
 Coding Standards
 ================
@@ -49,4 +59,4 @@ have modified.
 1. `yapf -i YOUR_MODIFIED_FILE.py`
 2. `docformatter --in-place YOUR_MODIFIED_FILE.py`
 
-`yapf` formats the code and `docformatter` formats the docstrings.
\ No newline at end of file
+`yapf` formats the code and `docformatter` formats the docstrings.
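For convenience, the commands above can be chained into a single run. The sketch below is only a convenience wrapper: the `python3 ...` commands are copied verbatim from this README, while the wrapper itself (the use of `subprocess`, stop-on-first-failure behaviour) is an illustrative assumption and not part of the repository.

```python
# Hypothetical helper, not shipped with the repo: runs the documented pipeline end to end.
import subprocess

steps = [
    "python3 low_level_policy_main.py --train",                            # all maneuvers (slow)
    "python3 low_level_policy_main.py --test --saved_policy_in_root",
    "python3 high_level_policy_main.py --train",
    "python3 high_level_policy_main.py --evaluate --saved_policy_in_root",
    "python3 mcts.py --train",
    "python3 mcts.py --evaluate --saved_policy_in_root",
]

for cmd in steps:
    print("Running:", cmd)
    subprocess.run(cmd.split(), check=True)  # abort the pipeline on the first failing step
```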
diff --git a/env/simple_intersection/simple_intersection_env.py b/env/simple_intersection/simple_intersection_env.py
index 2cf5edf09893ad20832a572a1935f8680ca6f016..c74a1612d1625609a4f850bd44724e888ee99286 100644
--- a/env/simple_intersection/simple_intersection_env.py
+++ b/env/simple_intersection/simple_intersection_env.py
@@ -1213,3 +1213,12 @@ class SimpleIntersectionEnv(RoadEnv, EpisodicEnvBase):
         self.window = backup_window
 
         return new_env
+
+    @property
+    def n_others_with_higher_priority(self):
+        n_others_with_higher_priority = 0
+        for veh in self.vehs[1:]:
+            if veh.waited_count > self.ego.waited_count:
+                n_others_with_higher_priority += 1
+
+        return n_others_with_higher_priority
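The counting rule behind the new `n_others_with_higher_priority` property can be checked in isolation with the small sketch below. The `Veh` stand-in and the free function are hypothetical; only the comparison against the ego's `waited_count` mirrors the property added above.

```python
# Stand-alone illustration of the priority count added to SimpleIntersectionEnv.
from collections import namedtuple

Veh = namedtuple("Veh", ["waited_count"])  # stand-in for the real vehicle objects


def n_others_with_higher_priority(vehs):
    """Count vehicles that have waited longer than the ego (vehs[0]), as in the new property."""
    ego = vehs[0]
    return sum(1 for veh in vehs[1:] if veh.waited_count > ego.waited_count)


if __name__ == "__main__":
    vehs = [Veh(waited_count=3), Veh(waited_count=5), Veh(waited_count=1)]
    assert n_others_with_higher_priority(vehs) == 1  # only the second vehicle waited longer
```

Later in this diff the Wait maneuver queries the real property in exactly this way, incrementing its `_ego_stop_count` only while `n_others_with_higher_priority` is zero.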
diff --git a/high_level_policy_main.py b/high_level_policy_main.py
index a9b4108add49ea04d9c05dd76a6a995f273d5130..bf1d542c6ce9e3387ac0019204e2e2c2c19f071e 100644
--- a/high_level_policy_main.py
+++ b/high_level_policy_main.py
@@ -155,7 +155,7 @@ if __name__ == "__main__":
         help="Number of steps to train for. Default is 25000", default=25000, type=int)
     parser.add_argument(
         "--nb_episodes_for_test",
-        help="Number of episodes to test/evaluate. Default is 20", default=20, type=int)
+        help="Number of episodes to test/evaluate. Default is 100", default=100, type=int)
     parser.add_argument(
         "--nb_trials",
         help="Number of trials to evaluate. Default is 10", default=10, type=int)
@@ -172,7 +172,7 @@ if __name__ == "__main__":
 
     if args.train:
         high_level_policy_training(nb_steps=args.nb_steps, load_weights=args.load_weights, save_path=args.save_file,
-                                   tensorboard=args.tensorboard, nb_episodes_for_test=args.nb_episodes_for_test,
+                                   tensorboard=args.tensorboard, nb_episodes_for_test=20,
                                    visualize=args.visualize)
 
     if args.test:
diff --git a/low_level_policy_main.py b/low_level_policy_main.py
index 291ea825953c3c24f870339b4e95c462b2c008a9..d4a129a55dc5585b32b1087b2f6af4a70bf11e28 100644
--- a/low_level_policy_main.py
+++ b/low_level_policy_main.py
@@ -102,11 +102,11 @@ if __name__ == "__main__":
         help="the option to train. Eg. stop, keeplane, wait, changelane, follow. If not defined, trains all options")
     parser.add_argument(
         "--test",
-        help="Test a saved high level policy. Uses backends/trained_policies/highlevel/highlevel_weights.h5f by default",
+        help="Test a saved low level policy. Uses saved policy in backends/trained_policies/OPTION_NAME/ by default",
         action="store_true")
     parser.add_argument(
         "--saved_policy_in_root",
-        help="Use saved policies in root of project rather than backends/trained_policies/highlevel/",
+        help="Use saved policies in root of project rather than backends/trained_policies/OPTION_NAME",
         action="store_true")
     parser.add_argument(
         "--load_weights",
@@ -141,15 +141,18 @@ if __name__ == "__main__":
                                       tensorboard=args.tensorboard)
         else:
             for option_key in options.maneuvers.keys():
+                print("Training {} maneuver...".format(option_key))
                 low_level_policy_training(option_key, load_weights=args.load_weights, nb_steps=args.nb_steps,
                                           nb_episodes_for_test=args.nb_episodes_for_test, visualize=args.visualize,
                                           tensorboard=args.tensorboard)
 
     if args.test:
         if args.option:
+            print("Testing {} maneuver...".format(args.option))
             low_level_policy_testing(args.option, pretrained=not args.saved_policy_in_root,
                                      nb_episodes_for_test=args.nb_episodes_for_test)
         else:
             for option_key in options.maneuvers.keys():
+                print("Testing {} maneuver...".format(option_key))
                 low_level_policy_testing(args.option, pretrained=not args.saved_policy_in_root,
                                          nb_episodes_for_test=args.nb_episodes_for_test)
\ No newline at end of file
diff --git a/mcts.py b/mcts.py
index 95ef93fdb14e9188339f6b2b306dc5891a86ddd0..9bd2abbf95bab52d4012e9718856ed09a3e7ddd2 100644
--- a/mcts.py
+++ b/mcts.py
@@ -1,11 +1,9 @@
 from env.simple_intersection import SimpleIntersectionEnv
-from env.simple_intersection.constants import *
 from options.options_loader import OptionsGraph
 from backends import DDPGLearner, DQNLearner, MCTSLearner
-import pickle
-import tqdm
 import numpy as np
 import tqdm
+import argparse
 
 import sys
 
@@ -28,7 +26,7 @@ class Logger(object):
 sys.stdout = Logger()
 
 # TODO: make a separate file for this function.
-def mcts_training(nb_traversals, save_every=20, visualize=False):
+def mcts_training(nb_traversals, save_every=20, visualize=False, load_saved=False, save_file="mcts.pickle"):
     """ Do RL of the low-level policy of the given maneuver and test it.
 
     Args:
@@ -50,7 +48,9 @@
     agent.load_model("backends/trained_policies/highlevel/highlevel_weights.h5f")
     options.set_controller_args(predictor = agent.get_softq_value_using_option_alias)
     options.controller.max_depth = 20
-    #options.controller.load_model('backends/trained_policies/mcts/mcts.pickle')
+
+    if load_saved:
+        options.controller.load_model(save_file)
 
     total_epochs = nb_traversals//save_every
     trav_num = 1
@@ -62,17 +62,16 @@
             options.controller.curr_node_num = 0
             init_obs = options.reset()
             v, all_ep_R = options.controller.traverse(init_obs, visualize=visualize)
-            # print('Traversal %d: V = %f' % (num_traversal, v))
-            # print('Overall Reward: %f\n' % all_ep_R)
+
             last_rewards += [all_ep_R]
             trav_num += 1
-        options.controller.save_model('mcts_%d.pickle' % (num_epoch))
+        options.controller.save_model(save_file)
         success = lambda x: x > 50
         success_rate = np.sum(list(map(success, last_rewards)))/(len(last_rewards)*1.0)
         print('success rate: %f' % success_rate)
         print('Average Reward (%d-%d): %f\n' % (beg_trav_num, trav_num-1, np.mean(last_rewards)))
 
-def mcts_evaluation(nb_traversals, num_trials=5, visualize=False):
+def mcts_evaluation(nb_traversals, num_trials=5, visualize=False, save_file="mcts.pickle", pretrained=False):
     """ Do RL of the low-level policy of the given maneuver and test it.
 
     Args:
@@ -95,11 +94,14 @@
     options.set_controller_args(predictor=agent.get_softq_value_using_option_alias)
     options.controller.max_depth = 20
 
+    if pretrained:
+        save_file = "backends/trained_policies/mcts/" + save_file
+
     success_list = []
     print('Total number of trials = %d' % num_trials)
     for trial in range(num_trials):
         num_successes = 0
-        options.controller.load_model('backends/trained_policies/mcts/mcts.pickle')
+        options.controller.load_model(save_file)
         for num_traversal in tqdm.tqdm(range(nb_traversals)):
             options.controller.curr_node_num = 0
             init_obs = options.reset()
@@ -222,20 +224,51 @@
           np.mean(count_list), np.std(count_list)))
 
 
-def mcts_visualize(file_name):
-    with open(file_name, 'rb') as handle:
-        to_restore = pickle.load(handle)
-        # TR = to_restore['TR']
-        # M = to_restore['M']
-        # for key, val in TR.items():
-        #     print('%s: %f, count = %d' % (key, val/M[key], M[key]))
-        print(len(to_restore['nodes']))
-
 if __name__ == "__main__":
-
-    mcts_training(nb_traversals=10000, save_every=1000, visualize=False)
-    # mcts_evaluation(nb_traversals=100, num_trials=10, visualize=False)
-    # for num in range(100): mcts_visualize('timeout_inf_save100/mcts_%d.pickle' % num)
-    # mcts_visualize('mcts.pickle')
-    #online_mcts(10)
-    # evaluate_online_mcts(nb_episodes=20,nb_trials=5)
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--train",
+        help="Train an offline mcts with default settings. Always saved in root folder.",
+        action="store_true")
+    parser.add_argument(
+        "--evaluate",
+        help="Evaluate over n trials. "
+             "Uses backends/trained_policies/mcts/mcts.pickle by default",
+        action="store_true")
+    parser.add_argument(
+        "--saved_policy_in_root",
+        help="Use saved policies in root of project rather than backends/trained_policies/mcts/",
+        action="store_true")
+    parser.add_argument(
+        "--load_saved",
+        help="Load a saved policy from root folder first before training",
+        action="store_true")
+    parser.add_argument(
+        "--visualize",
+        help="Visualize the training. Testing is always visualized. Evaluation is not visualized by default",
+        action="store_true")
+    parser.add_argument(
+        "--nb_traversals",
+        help="Number of traversals to perform. Default is 1000", default=1000, type=int)
+    parser.add_argument(
+        "--save_every",
+        help="Saves every n traversals. Saves in root by default. Default is 500", default=500, type=int)
+    parser.add_argument(
+        "--nb_traversals_for_test",
+        help="Number of episodes to evaluate. Default is 100", default=100, type=int)
+    parser.add_argument(
+        "--nb_trials",
+        help="Number of trials to evaluate. Default is 10", default=10, type=int)
+    parser.add_argument(
+        "--save_file",
+        help="filename to save/load the trained policy. Location is as specified by --saved_policy_in_root. Default name is mcts.pickle",
+        default="mcts.pickle")
+
+    args = parser.parse_args()
+
+    if args.train:
+        mcts_training(nb_traversals=args.nb_traversals, save_every=args.save_every, visualize=args.visualize,
+                      load_saved=args.load_saved, save_file=args.save_file)
+    if args.evaluate:
+        mcts_evaluation(nb_traversals=args.nb_traversals_for_test, num_trials=args.nb_trials, visualize=args.visualize,
+                        pretrained=not args.saved_policy_in_root, save_file=args.save_file)
\ No newline at end of file
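For reference, the two entry points wired up above can also be called directly from Python instead of through the new flags. The argument values below simply restate the argparse defaults from this diff and are not tuned recommendations.

```python
# Minimal sketch of driving the new mcts.py functions programmatically.
from mcts import mcts_training, mcts_evaluation

# Build the probabilities tree and pickle it to ./mcts.pickle in the project root
# (equivalent to `python3 mcts.py --train` with default arguments).
mcts_training(nb_traversals=1000, save_every=500, visualize=False,
              load_saved=False, save_file="mcts.pickle")

# Reload the saved tree and report success statistics over repeated trials
# (equivalent to `python3 mcts.py --evaluate --saved_policy_in_root`).
mcts_evaluation(nb_traversals=100, num_trials=10, visualize=False,
                save_file="mcts.pickle", pretrained=False)
```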
diff --git a/options/options_loader.py b/options/options_loader.py
index 6a40affcb28b28cd3a04e4094e8bbdcb8b2642e7..0625bf34514b273f79af726b32d2adc1642fe7bc 100644
--- a/options/options_loader.py
+++ b/options/options_loader.py
@@ -152,6 +152,8 @@ class OptionsGraph:
                     lr=1e-3)
                 agent.load_model("backends/trained_policies/" + key + "/" + key + "_weights.h5f")
                 maneuver.set_low_level_trained_policy(agent.predict)
+                maneuver._cost_weights = (20.0 * 1e-3, 1.0 * 1e-3, 0.25 * 1e-3, 1.0 * 1e-3,
+                                          100.0 * 1e-3, 0.1 * 1e-3, 0.25 * 1e-3, 0.1 * 1e-3)
 
             if self.config["method"] == "mcts":
                 maneuver.timeout = np.inf
diff --git a/options/simple_intersection/maneuver_base.py b/options/simple_intersection/maneuver_base.py
index ebed07ca94f090af30d33438fe2d384dd3df244b..4b47f956777f8475504aa8b99806be4653ee24a7 100644
--- a/options/simple_intersection/maneuver_base.py
+++ b/options/simple_intersection/maneuver_base.py
@@ -273,9 +273,9 @@ class ManeuverBase(EpisodicEnvBase):
     def generate_learning_scenario(self):
         raise NotImplemented(self.__class__.__name__ + ".generate_learning_scenario is not implemented.")
 
-    def generate_validation_scenario(self):
-        # If not implemented, use learning scenario.
+    def generate_validation_scenario(self):  # Override this method in the subclass if some customization is needed.
         self.generate_learning_scenario()
+        self._enable_low_level_training_properties = False
 
     def generate_scenario(self, enable_LTL_preconditions=True, timeout=np.infty, **kwargs):
         """generates the scenario for low-level policy learning and validation. This method
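The trailing comment on `generate_validation_scenario` invites subclasses to customize validation. A hypothetical override might look like the sketch below; the subclass name, the timeout value, and the exact `generate_scenario` arguments are made up for illustration, while the method names and the `_enable_low_level_training_properties` flag come from the base class above.

```python
# Hypothetical maneuver subclass showing one way to customize the validation scenario.
from options.simple_intersection.maneuver_base import ManeuverBase


class MyManeuver(ManeuverBase):
    def generate_learning_scenario(self):
        # Training scenario with the low-level training properties enabled.
        self.generate_scenario(enable_LTL_preconditions=True)
        self._enable_low_level_training_properties = True

    def generate_validation_scenario(self):
        # Validate on a finite timeout and without the training-only properties,
        # instead of simply reusing the learning scenario as the base class now does.
        self.generate_scenario(enable_LTL_preconditions=True, timeout=30)
        self._enable_low_level_training_properties = False
```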
diff --git a/options/simple_intersection/maneuvers.py b/options/simple_intersection/maneuvers.py
index bb6a7649a80313b0900291da7f23a1dc6f35fb39..ab802ffc700088442189c8f7743ef5409fcc3051 100644
--- a/options/simple_intersection/maneuvers.py
+++ b/options/simple_intersection/maneuvers.py
@@ -103,7 +103,7 @@ class Wait(ManeuverBase):
 
     def _init_LTL_preconditions(self):
         self._LTL_preconditions.append(
-            LTLProperty("G ( (in_stop_region and stopped_now) U highest_priority)", 0))
+            LTLProperty("G ( (in_stop_region and stopped_now) U highest_priority )", 0))
 
         self._LTL_preconditions.append(
             LTLProperty("G ( not (in_intersection and highest_priority) )",
@@ -113,15 +113,14 @@ class Wait(ManeuverBase):
         ego = self.env.ego
         self._v_ref = rd.speed_limit if self.env.ego.APs['highest_priority'] else 0
         self._target_lane = ego.APs['lane']
-
-        self._n_others_with_higher_priority = 0
-        for veh in self.env.vehs[1:]:
-            if veh.waited_count > ego.waited_count:
-                self._n_others_with_higher_priority += 1
+        self._ego_stop_count = 0
 
     def _update_param(self):
         if self.env.ego.APs['highest_priority']:
             self._v_ref = rd.speed_limit
+        if self._enable_low_level_training_properties:
+            if self.env.n_others_with_higher_priority == 0:
+                self._ego_stop_count += 1
 
     def generate_learning_scenario(self):
         n_others = np.random.randint(0, 3)
@@ -141,6 +140,19 @@ class Wait(ManeuverBase):
         self.env.init_APs(False)
 
         self._reward_in_goal = 200
+        self._extra_r_on_timeout = -200
+        self._enable_low_level_training_properties = True
+        self._ego_stop_count = 0
+
+    @property
+    def extra_termination_condition(self):
+        if self._enable_low_level_training_properties:  # activated only for the low-level training.
+            if self._ego_stop_count >= 50:
+                self._extra_r_terminal = -200
+                return True
+            else:
+                self._extra_r_terminal = None
+                return False
 
     @staticmethod
     def _features_dim_reduction(features_tuple):