Commit 80ce4036 authored by Steven Siyu Xiong

Fix Elig Traces

parent 9970ed98
@@ -7,7 +7,7 @@ import math
import random
class rlalgorithm:
def __init__(self, actions, terminalState, deathStates, learning_rate=0.1, reward_decay=0.9, e_greedy=0.1, lamba=0.5):
def __init__(self, actions, terminalState, deathStates, learning_rate=0.03, reward_decay=0.9, e_greedy=0.1, lamba=0.5):
self.actions = actions # [0, 1, 2, 3] for [up down left right]
self.lr = learning_rate
@@ -45,16 +45,17 @@ class rlalgorithm:
# Get the current cell of the policy grid
actionValueCell = self.q_table.loc[observation,:]
randomValue = np.random.uniform()
randomValue = np.random.uniform() * 100
normalizedEpsilon = self.epsilon * 100
maxValue = actionValueCell.max()
# We are choosing an action that is NOT part of the optimal policy set
if (self.epsilon > randomValue):
maxValue = actionValueCell.max()
if (randomValue < normalizedEpsilon):
# Determine which actions are not part of the maximum/optimal choice
nonMaxIndices = []
for index, value in actionValueCell.items():
if (abs(maxValue - value) > self.precision):
if (self.precision < abs(maxValue - value)):
nonMaxIndices.append(index)
# If all action-value pairs for this state are equal, then just choose randomly among the 4 directions
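The exploration branch now works on a 0-100 scale: both the uniform draw and epsilon are multiplied by 100, so `randomValue < normalizedEpsilon` still fires with probability epsilon; only the scale used for the bucket arithmetic below changes. Conditioned on landing in that slice, cutting it into equal buckets picks uniformly among the non-greedy actions. A minimal sketch of an equivalent exploration pick, written with a fresh random draw instead of reusing the scaled one (the helper name and the `precision` default are assumptions, not part of the diff):

```python
# Sketch only (not the committed code): choose uniformly among the actions
# that are NOT currently greedy. `action_values` stands in for the Series
# self.q_table.loc[observation, :]; `precision` mirrors self.precision.
import numpy as np

def explore_action(action_values, precision=1e-6):
    max_value = action_values.max()
    non_max = [a for a, v in action_values.items()
               if abs(max_value - v) > precision]
    if not non_max:                       # every action ties with the max
        return np.random.choice(list(action_values.index))
    return np.random.choice(non_max)
```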
@@ -64,7 +65,7 @@ class rlalgorithm:
if (nextAction > 3):
nextAction = 3
else:
incrementSize = self.epsilon / len(nonMaxIndices)
incrementSize = normalizedEpsilon / len(nonMaxIndices)
action_idx = math.floor(randomValue / incrementSize)
if (action_idx >= len(nonMaxIndices)):
@@ -73,20 +74,17 @@ class rlalgorithm:
nextAction = nonMaxIndices[action_idx]
else:
maxValue = - sys.maxsize - 1
# Find best action and its index
maxIndices = []
for index, value in actionValueCell.items():
if abs(maxValue - value) < self.precision:
if (self.precision >= abs(maxValue - value)):
maxIndices.append(index)
elif maxValue < value:
maxValue = value
maxIndices = [index]
# print("Max Index array size: ")
# print(len(maxIndices))
# There are multiple max actions
incrementSize = (1 - self.epsilon) / len(maxIndices)
action_idx = math.floor((randomValue - self.epsilon) / incrementSize)
incrementSize = (100 - normalizedEpsilon) / len(maxIndices)
action_idx = math.floor((randomValue - normalizedEpsilon) / incrementSize)
if (action_idx >= len(maxIndices)):
print("Error: In the wrong place 2")
@@ -113,13 +111,21 @@ class rlalgorithm:
self.q_table.loc[coordState, act] = self.q_table.loc[coordState, act] + (self.lr * delta * self.e_traces.loc[coordState, act])
self.e_traces.loc[coordState, act] = self.gamma * self.lamba * self.e_traces.loc[coordState, act]
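The two lines above are the core SARSA(λ) backup: every (state, action) pair with non-zero eligibility moves toward the TD error delta in proportion to its trace, and every trace then decays by gamma * lambda. A vectorized sketch of the same backup, assuming q_table and e_traces are DataFrames sharing the same index and columns (`lam` stands in for the `lamba` attribute):

```python
# Sketch only: vectorized form of the per-cell loop above.
def sarsa_lambda_backup(q_table, e_traces, delta, lr, gamma, lam):
    q_table += lr * delta * e_traces      # every eligible pair is updated
    e_traces *= gamma * lam               # traces decay after the update
    return q_table, e_traces
```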
# def reset_etraces(self):
# for observation, row in self.e_traces.iterrows():
# self.e_traces.loc[observation] = pd.Series([0]*len(self.actions),
# index=self.e_traces.columns,
# name=observation,
# )
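Because updateElibTrace below now calls RL.reset_etraces() at the start of every episode, the method has to exist outside the commented-out form shown above. A hedged sketch of a reset that matches the intent of that commented code, vectorized rather than row by row (the full file may already contain something equivalent):

```python
# Sketch: zero every eligibility trace before a new episode starts
# (equivalent to the row-by-row assignment in the commented-out version).
def reset_etraces(self):
    self.e_traces.loc[:, :] = 0.0
```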
# '''States are dynamically added to the Q(S,A) table as they are encountered'''
def check_state_exist(self, state):
if state not in self.q_table.index:
# append new state to q table
self.q_table = self.q_table.append(
pd.Series(
[-30.00000]*len(self.actions),
[-0.25]*len(self.actions),
index=self.q_table.columns,
name=state,
)
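check_state_exist grows the Q-table lazily with DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0. If the project moves to a newer pandas, the same insertion can be written with pd.concat; a sketch under that assumption, keeping the new -0.25 initial value:

```python
# Sketch for pandas >= 2.0, where DataFrame.append no longer exists: build
# the new state's row explicitly and concatenate it onto the table.
import pandas as pd

def check_state_exist(self, state):
    if state not in self.q_table.index:
        new_row = pd.DataFrame([[-0.25] * len(self.actions)],
                               index=[state],
                               columns=self.q_table.columns)
        self.q_table = pd.concat([self.q_table, new_row])
```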
@@ -102,7 +102,7 @@ def updateSarsa(env, RL, data, episodes=50):
# As the episode progresses
while True:
aheadState, rwd, done = env.step(action)
# fresh env
if(showRender or (episode % renderEveryNth)==0):
env.render(sim_speed)
@@ -180,6 +180,8 @@ def updateElibTrace(env, RL, data, episodes=50):
data['global_reward']=global_reward
for episode in range(episodes):
RL.reset_etraces()
t=0
# Moves the agent back to the initial spot on the grid
if episode == 0:
@@ -220,15 +222,42 @@
print('game over -- Algorithm {} completed'.format(RL.display_name))
env.destroy()
# def updatePolicyGradient(env, RL, data, episodes=50):
def updatePolicyGradient(env, RL, data, episodes=50):
# #
# policy = np.exp(theta * )
global_reward = np.zeros(episodes)
data['global_reward']=global_reward
# # end of game
# print('game over -- Algorithm {} completed'.format(RL.display_name))
# env.destroy()
for episode in range(episodes):
t=0
# Moves the agent back to the initial spot on the grid
if episode == 0:
state = env.reset(value = 0)
else:
state = env.reset()
debug(2,'state(ep:{},t:{})={}'.format(episode, t, state))
I = 1
while True:
action = RL.choose_action(str(state))
aheadState, reward, done = env.step(action)
RL.learn(state, action, reward, aheadState)
I = RL.gamma * I
state = aheadState
if done:
break
else:
t=t+1
debug(1, "({}) Episode {}: Length={} Total return = {} ".format(RL.display_name,episode, t, global_reward[episode]),printNow=(episode%printEveryNth==0))
# end of game
print('game over -- Algorithm {} completed'.format(RL.display_name))
env.destroy()
if __name__ == "__main__":
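The new updatePolicyGradient driver keeps an accumulator I that is multiplied by gamma every step, i.e. I = gamma^t, which is the per-step discount weight that policy-gradient updates are usually scaled by. The commented-out `policy = np.exp(theta * )` line is left incomplete in the diff; purely as an illustration (theta here is a hypothetical per-action preference vector, not something defined in this commit), a softmax over preferences would look like:

```python
# Illustration only -- not the author's implementation. `theta_row` is a
# hypothetical 1-D array of action preferences for the current state.
import numpy as np

def softmax_policy(theta_row):
    prefs = np.asarray(theta_row, dtype=float)
    prefs -= prefs.max()          # subtract the max for numerical stability
    probs = np.exp(prefs)
    return probs / probs.sum()    # action probabilities that sum to 1
```

Action selection would then sample from these probabilities instead of the epsilon-greedy rule used by the other agents.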
@@ -285,16 +314,16 @@ if __name__ == "__main__":
######################### SARSA Algorithm #################################
# env2 = Maze(agentXY,goalXY,wall_shape,pits)
# # Translate all grid coordinates into canvas coordinates
# deathStates = []
# for pitCoords in env2.pitblocks:
# deathStates.append(str(env2.canvas.coords(pitCoords)))
# RL2 = rlalg2(actions=list(range(env2.n_actions)), terminalState=str(env2.canvas.coords(env2.goal)), deathStates=deathStates)
# data2={}
# env2.after(10, updateSarsa(env2, RL2, data2, episodes))
# env2.mainloop()
# experiments.append((env2, RL2, data2))
env2 = Maze(agentXY,goalXY,wall_shape,pits)
# Translate all grid coordinates into canvas coordinates
deathStates = []
for pitCoords in env2.pitblocks:
deathStates.append(str(env2.canvas.coords(pitCoords)))
RL2 = rlalg2(actions=list(range(env2.n_actions)), terminalState=str(env2.canvas.coords(env2.goal)), deathStates=deathStates)
data2={}
env2.after(10, updateSarsa(env2, RL2, data2, episodes))
env2.mainloop()
experiments.append((env2, RL2, data2))
######################### SARSA Algorithm #################################
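One note on the re-enabled SARSA block: Tk's after(delay_ms, callback) expects a callable, but `env2.after(10, updateSarsa(env2, RL2, data2, episodes))` executes updateSarsa immediately, before mainloop() starts, and passes its return value (None) to after. That may be intentional here; if a deferred start is wanted instead, a small wrapper does it (sketch, same names as above):

```python
# Sketch: defer the training run until the Tk event loop is running,
# instead of executing updateSarsa before mainloop() starts.
env2.after(10, lambda: updateSarsa(env2, RL2, data2, episodes))
env2.mainloop()
```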
@@ -342,16 +371,16 @@ if __name__ == "__main__":
######################### Simple Q Learning #################################
env2 = Maze(agentXY,goalXY,wall_shape,pits)
# Translate all grid coordinates into canvas coordinates
deathStates = []
for pitCoords in env2.pitblocks:
deathStates.append(str(env2.canvas.coords(pitCoords)))
RL2 = qlearn_simple(actions=list(range(env2.n_actions)), terminalState=str(env2.canvas.coords(env2.goal)), deathStates=deathStates)
data2={}
env2.after(10, updateQLearn(env2, RL2, data2, episodes))
env2.mainloop()
experiments.append((env2, RL2, data2))
# env2 = Maze(agentXY,goalXY,wall_shape,pits)
# # Translate all grid coordinates into canvas coordinates
# deathStates = []
# for pitCoords in env2.pitblocks:
# deathStates.append(str(env2.canvas.coords(pitCoords)))
# RL2 = qlearn_simple(actions=list(range(env2.n_actions)), terminalState=str(env2.canvas.coords(env2.goal)), deathStates=deathStates)
# data2={}
# env2.after(10, updateQLearn(env2, RL2, data2, episodes))
# env2.mainloop()
# experiments.append((env2, RL2, data2))
######################### Simple Q Learning #################################