Commit b2a936a8 authored by Thomas Lim

Complete

parent d887657f
# RL_brainsample_doubqlearning.py
import numpy as np
import pandas as pd
from utils import epsilon_greedy
from utils import check_state_exist


class rlalgorithm:

    def __init__(self, actions, learning_rate=0.1, reward_decay=0.9, e_greedy=0.1):
        self.actions = actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        # Double Q-learning keeps two independent Q tables.
        self.q_table1 = pd.DataFrame(columns=self.actions, dtype=np.float64)
        self.q_table2 = pd.DataFrame(columns=self.actions, dtype=np.float64)
        self.display_name = "Double Q Learning ({})".format(learning_rate)

    def choose_action(self, observation):
        # The state must exist in both tables before acting on their sum.
        self.q_table1 = check_state_exist(self.q_table1, observation, self.actions)
        self.q_table2 = check_state_exist(self.q_table2, observation, self.actions)
        return self.double_epsilon_greedy(observation, self.epsilon)

    def learn(self, s, a, r, s_):
        # The next state must exist in both tables before it is looked up.
        self.q_table1 = check_state_exist(self.q_table1, s_, self.actions)
        self.q_table2 = check_state_exist(self.q_table2, s_, self.actions)
        if s_ != 'terminal':
            a_ = self.choose_action(str(s_))  # action to take from the next state
            # Each table selects its own greedy action at s_ ...
            greedy_action1 = epsilon_greedy(self.q_table1, s_, 0, self.actions)
            greedy_action2 = epsilon_greedy(self.q_table2, s_, 0, self.actions)
            # ... but that action is evaluated with the other table.
            maxq1 = self.q_table2.loc[s_, greedy_action1]
            maxq2 = self.q_table1.loc[s_, greedy_action2]
            # 50% chance of updating either table.
            if np.random.uniform() > 0.5:
                self.q_table1.loc[s, a] = self.q_table1.loc[s, a] + self.lr * (r + self.gamma * maxq1 - self.q_table1.loc[s, a])
            else:
                self.q_table2.loc[s, a] = self.q_table2.loc[s, a] + self.lr * (r + self.gamma * maxq2 - self.q_table2.loc[s, a])
        else:
            a_ = a  # episode ends; the returned action is not used
            if np.random.uniform() > 0.5:
                self.q_table1.loc[s, a] = r  # next state is terminal
            else:
                self.q_table2.loc[s, a] = r  # next state is terminal
        return s_, a_

    # Code modified from RL_brainsample_hacky_PI.py
    def double_epsilon_greedy(self, s, epsilon):
        """Epsilon-greedy over the sum of the two Q tables."""
        if np.random.uniform() >= epsilon:
            maxq = float("-inf")
            greedy_actions = []
            for a in range(len(self.actions)):
                q = self.q_table1.loc[s, a] + self.q_table2.loc[s, a]
                if q < maxq:
                    continue
                if q > maxq:
                    maxq = q
                    greedy_actions.clear()
                greedy_actions.append(a)
            # Break ties between equally valued greedy actions at random.
            action = np.random.choice(greedy_actions)
        else:
            action = np.random.choice(self.actions)
        return action
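
# A minimal standalone sketch of one double Q-learning step on toy values
# (the states, actions, and numbers below are made up for illustration, not
# taken from the maze code). It shows the core trick: the greedy action is
# chosen with one table but evaluated with the other.
import numpy as np

q1 = {('s0', 0): 0.0, ('s0', 1): 0.0, ('s1', 0): 1.0, ('s1', 1): 0.5}
q2 = {('s0', 0): 0.0, ('s0', 1): 0.0, ('s1', 0): 0.2, ('s1', 1): 0.9}
lr, gamma = 0.1, 0.9
s, a, r, s_ = 's0', 0, -1.0, 's1'

if np.random.uniform() > 0.5:
    # Update table 1: argmax taken from q1, value taken from q2.
    a_star = max((0, 1), key=lambda act: q1[(s_, act)])
    q1[(s, a)] += lr * (r + gamma * q2[(s_, a_star)] - q1[(s, a)])
else:
    # Symmetric update for table 2.
    a_star = max((0, 1), key=lambda act: q2[(s_, act)])
    q2[(s, a)] += lr * (r + gamma * q1[(s_, a_star)] - q2[(s, a)])

print(q1[(s, a)], q2[(s, a)])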
# RL_brainsample_expsarsa.py
import numpy as np
import pandas as pd
from utils import epsilon_greedy
from utils import check_state_exist


class rlalgorithm:

    def __init__(self, actions, learning_rate=0.1, reward_decay=0.9, e_greedy=0.1):
        self.actions = actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)
        self.display_name = "Expected SARSA ({})".format(learning_rate)

    def choose_action(self, observation):
        self.q_table = check_state_exist(self.q_table, observation, self.actions)
        return epsilon_greedy(self.q_table, observation, self.epsilon, self.actions)

    # Code modified from RL_brainsample_hacky_PI.py
    def learn(self, s, a, r, s_):
        self.q_table = check_state_exist(self.q_table, s_, self.actions)
        if s_ != 'terminal':
            a_ = self.choose_action(str(s_))  # choose with epsilon greedy
            # The TD target uses the expectation over next actions rather than a sample.
            expvalue = self.calc_expected_value(s_)
            self.q_table.loc[s, a] = self.q_table.loc[s, a] + self.lr * (r + self.gamma * expvalue - self.q_table.loc[s, a])
        else:
            a_ = a  # episode ends; the returned action is not used
            self.q_table.loc[s, a] = r  # next state is terminal
        return s_, a_

    def calc_expected_value(self, s):
        """Expected Q value at state s under the epsilon-greedy behaviour policy
        implemented in utils.epsilon_greedy: epsilon is spread uniformly over all
        actions, and the remaining 1 - epsilon is shared among the tied greedy actions."""
        state_action = self.q_table.loc[s, :]
        greedy_value = np.max(state_action)
        ngreedy = len(state_action[state_action == greedy_value])
        num_actions = len(state_action)
        exp_tot = 0.0
        for a in range(num_actions):
            v = self.q_table.loc[s, a]
            prob = self.epsilon / num_actions
            if v == greedy_value:
                prob += (1.0 - self.epsilon) / ngreedy
            exp_tot += prob * v
        return exp_tot
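
# A small standalone check of the expectation used above, on made-up Q values
# (nothing here comes from the maze code). The probabilities follow
# utils.epsilon_greedy: epsilon spread uniformly over all actions, and the
# remaining 1 - epsilon shared among the tied greedy actions.
import numpy as np

q_s = np.array([0.5, 1.0, 1.0, -0.2])   # hypothetical Q(s, .) for four actions
epsilon = 0.1

greedy_mask = q_s == q_s.max()
probs = np.full(len(q_s), epsilon / len(q_s))
probs[greedy_mask] += (1.0 - epsilon) / greedy_mask.sum()

expected_value = float(np.dot(probs, q_s))
print(probs.sum(), expected_value)       # probabilities sum to 1 (up to rounding)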
# RL_brainsample_qlearning.py
import numpy as np
import pandas as pd
from utils import check_state_exist
from utils import epsilon_greedy


class rlalgorithm:

    def __init__(self, actions, learning_rate=0.1, reward_decay=0.9, e_greedy=0.1):
        self.actions = actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)
        self.display_name = "Q Learning ({})".format(learning_rate)

    def choose_action(self, observation):
        self.q_table = check_state_exist(self.q_table, observation, self.actions)
        return epsilon_greedy(self.q_table, observation, self.epsilon, self.actions)

    # Code modified from RL_brainsample_hacky_PI.py
    def learn(self, s, a, r, s_):
        self.q_table = check_state_exist(self.q_table, s_, self.actions)
        if s_ != 'terminal':
            a_ = self.choose_action(str(s_))  # action to take from the next state
            # Off-policy target: bootstrap on the greedy action at s_.
            greedy_action = epsilon_greedy(self.q_table, s_, 0, self.actions)
            maxq = self.q_table.loc[s_, greedy_action]
            self.q_table.loc[s, a] = self.q_table.loc[s, a] + self.lr * (r + self.gamma * maxq - self.q_table.loc[s, a])
        else:
            a_ = a  # episode ends; the returned action is not used
            self.q_table.loc[s, a] = r  # next state is terminal
        return s_, a_
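
# A worked one-step example of the Q-learning target used above, on
# hypothetical numbers (no maze dependency): Q(s, a) moves toward
# r + gamma * max_a' Q(s_, a').
lr, gamma = 0.1, 0.9
q_sa = 0.0                     # current Q(s, a)
q_next = [0.3, 1.2, -0.5]      # hypothetical Q(s_, .) over three actions
r = -1.0

target = r + gamma * max(q_next)
q_sa = q_sa + lr * (target - q_sa)
print(q_sa)                    # 0.1 * (-1.0 + 0.9 * 1.2) ≈ 0.008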
# RL_brainsample_sarsa.py
import numpy as np
import pandas as pd
from utils import epsilon_greedy
from utils import check_state_exist


class rlalgorithm:

    def __init__(self, actions, learning_rate=0.1, reward_decay=0.9, e_greedy=0.1):
        self.actions = actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)
        self.display_name = "SARSA ({})".format(learning_rate)

    def choose_action(self, observation):
        self.q_table = check_state_exist(self.q_table, observation, self.actions)
        return epsilon_greedy(self.q_table, observation, self.epsilon, self.actions)

    # Code modified from RL_brainsample_hacky_PI.py
    def learn(self, s, a, r, s_):
        self.q_table = check_state_exist(self.q_table, s_, self.actions)
        if s_ != 'terminal':
            a_ = self.choose_action(str(s_))  # choose with epsilon greedy
            # On-policy target: bootstrap on the action actually chosen at s_.
            self.q_table.loc[s, a] = self.q_table.loc[s, a] + self.lr * (r + self.gamma * self.q_table.loc[s_, a_] - self.q_table.loc[s, a])
        else:
            a_ = a  # episode ends; the returned action is not used
            self.q_table.loc[s, a] = r  # next state is terminal
        return s_, a_
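
# The only difference from the Q-learning file above is the bootstrap term:
# SARSA uses the sampled next action a_ rather than the max. A minimal sketch
# on hypothetical numbers, sampling a_ the way utils.epsilon_greedy would:
import numpy as np

lr, gamma, epsilon = 0.1, 0.9, 0.1
q_sa = 0.0
q_next = np.array([0.3, 1.2, -0.5])   # hypothetical Q(s_, .)
r = -1.0

if np.random.uniform() >= epsilon:
    a_ = int(np.random.choice(np.flatnonzero(q_next == q_next.max())))
else:
    a_ = int(np.random.choice(len(q_next)))

q_sa = q_sa + lr * (r + gamma * q_next[a_] - q_sa)
print(a_, q_sa)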
# Main experiment script
from maze_env import Maze
from RL_brainsample_hacky_PI import rlalgorithm as rlalg1
from RL_brainsample_sarsa import rlalgorithm as rlalg2
from RL_brainsample_qlearning import rlalgorithm as rlalg3
from RL_brainsample_expsarsa import rlalgorithm as rlalg4
from RL_brainsample_doubqlearning import rlalgorithm as rlalg5
import numpy as np
import sys
import matplotlib.pyplot as plt

# ... earlier lines elided (inside def update(env, RL, data, episodes=50)) ...
    env.destroy()


if __name__ == "__main__":
    sim_speed = 0.001  # 0.05

    # Example Short Fast for Debugging
    showRender = True
    episodes = 50          # 30
    renderEveryNth = 1000  # 5
    printEveryNth = 10     # 1
    do_plot_rewards = True

    # Example Full Run, you may need to run longer
    # ... unchanged lines elided (agentXY, goalXY, wall_shape and pits are set up above) ...

    env1 = Maze(agentXY, goalXY, wall_shape, pits)
    RL1 = rlalg3(actions=list(range(env1.n_actions)), learning_rate=0.02)
    data1 = {}
    env1.after(10, update(env1, RL1, data1, episodes))
    env1.mainloop()
    experiments = [(env1, RL1, data1)]

    # Create another RL_brain_ALGNAME.py class and import it as rlag2 then run it here.
    env2 = Maze(agentXY, goalXY, wall_shape, pits)
    RL2 = rlalg3(actions=list(range(env2.n_actions)), learning_rate=0.05)
    data2 = {}
    env2.after(10, update(env2, RL2, data2, episodes))
    env2.mainloop()
    experiments.append((env2, RL2, data2))

    env3 = Maze(agentXY, goalXY, wall_shape, pits)
    RL3 = rlalg3(actions=list(range(env3.n_actions)), learning_rate=0.08)
    data3 = {}  # each run writes into its own results dict
    env3.after(10, update(env3, RL3, data3, episodes))
    env3.mainloop()
    experiments.append((env3, RL3, data3))

    env4 = Maze(agentXY, goalXY, wall_shape, pits)
    RL4 = rlalg3(actions=list(range(env4.n_actions)), learning_rate=0.1)
    data4 = {}
    env4.after(10, update(env4, RL4, data4, episodes))
    env4.mainloop()
    experiments.append((env4, RL4, data4))

    print("All experiments complete")
# utils.py
import numpy as np
import pandas as pd


# Code modified from RL_brainsample_hacky_PI.py
def epsilon_greedy(q_table, s, epsilon, actions):
    """With probability epsilon pick uniformly among all actions; otherwise pick
    a greedy action, breaking ties uniformly at random."""
    if np.random.uniform() >= epsilon:
        state_action = q_table.loc[s, :]
        action = np.random.choice(state_action[state_action == np.max(state_action)].index)
    else:
        action = np.random.choice(actions)
    return action


# Code modified from RL_brainsample_hacky_PI.py
def check_state_exist(q_table, state, actions):
    '''States are dynamically added to the Q(S,A) table as they are encountered'''
    if state not in q_table.index:
        # Append a new all-zero row for this state.
        # pd.concat is used here because DataFrame.append was removed in pandas 2.0.
        new_row = pd.DataFrame([[0.0] * len(actions)], columns=q_table.columns, index=[state])
        q_table = pd.concat([q_table, new_row])
    return q_table
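
# A minimal usage sketch of the two helpers above on a toy table (the state
# label and values are made up for illustration):
import numpy as np
import pandas as pd
from utils import check_state_exist, epsilon_greedy

actions = list(range(4))
q = pd.DataFrame(columns=actions, dtype=np.float64)

q = check_state_exist(q, 'some_state', actions)         # adds an all-zero row
q.loc['some_state', 2] = 1.0                            # pretend action 2 looks best

print(epsilon_greedy(q, 'some_state', 0.0, actions))    # always greedy -> 2
print(epsilon_greedy(q, 'some_state', 1.0, actions))    # always a random action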