Commit 80ce4036 by Steven Siyu Xiong

### Fix Elig Traces

parent 9970ed98
First file, the `rlalgorithm` class (the eligibility-trace SARSA agent):

```diff
@@ -7,7 +7,7 @@
 import math
 import random
 
 class rlalgorithm:
-    def __init__(self, actions, terminalState, deathStates, learning_rate=0.1, reward_decay=0.9, e_greedy=0.1, lamba=0.5):
+    def __init__(self, actions, terminalState, deathStates, learning_rate=0.03, reward_decay=0.9, e_greedy=0.1, lamba=0.5):
         self.actions = actions  # [0, 1, 2, 3] for [up down left right]
         self.lr = learning_rate
@@ -45,16 +45,17 @@
         # Get the current cell of the policy grid
         actionValueCell = self.q_table.loc[observation,:]
-        randomValue = np.random.uniform()
+        randomValue = np.random.uniform() * 100
+        normalizedEpsilon = self.epsilon * 100
+        maxValue = actionValueCell.max()
 
         # We are choosing an action that is NOT part of the optimal policy set
-        if (self.epsilon > randomValue):
-            maxValue = actionValueCell.max()
+        if (randomValue < normalizedEpsilon):
             # Determine which actions are not part of the maximum/optimal choice
             nonMaxIndices = []
             for index, value in actionValueCell.items():
-                if (abs(maxValue - value) > self.precision):
+                if (self.precision < abs(maxValue - value)):
                     nonMaxIndices.append(index)
 
             # If all action-value pairs for this state are equal, then just choose randomly amoung 4 directions
@@ -64,7 +65,7 @@
             if (nextAction > 3):
                 nextAction = 3
         else:
-            incrementSize = self.epsilon / len(nonMaxIndices)
+            incrementSize = normalizedEpsilon / len(nonMaxIndices)
             action_idx = math.floor(randomValue / incrementSize)
 
             if (action_idx >= len(nonMaxIndices)):
@@ -73,20 +74,17 @@
                 nextAction = nonMaxIndices[action_idx]
         else:
-            maxValue = - sys.maxsize - 1
             # Find best action and its index
             maxIndices = []
             for index, value in actionValueCell.items():
-                if abs(maxValue - value) < self.precision:
+                if (self.precision >= abs(maxValue - value)):
                     maxIndices.append(index)
                 elif maxValue < value:
                     maxValue = value
                     maxIndices = [index]
 
-            # print("Max Index array size: ")
-            # print(len(maxIndices))
-
             # There are multiple max actions
-            incrementSize = (1 - self.epsilon) / len(maxIndices)
-            action_idx = math.floor((randomValue - self.epsilon) / incrementSize)
+            incrementSize = (100 - normalizedEpsilon) / len(maxIndices)
+            action_idx = math.floor((randomValue - normalizedEpsilon) / incrementSize)
 
             if (action_idx >= len(maxIndices)):
                 print("Error: In the wrong place 2")
@@ -113,13 +111,21 @@
             self.q_table.loc[coordState, act] = self.q_table.loc[coordState, act] + (self.lr * delta * self.e_traces.loc[coordState, act])
             self.e_traces.loc[coordState, act] = self.gamma * self.lamba * self.e_traces.loc[coordState, act]
 
+    # def reset_etraces(self):
+    #     for observation, row in self.e_traces.iterrows():
+    #         self.e_traces.loc[observation] = pd.Series([0]*len(self.actions),
+    #                                                    index=self.e_traces.columns,
+    #                                                    name=observation, )
+    #
 
     '''States are dynamically added to the Q(S,A) table as they are encountered'''
     def check_state_exist(self, state):
         if state not in self.q_table.index:
             # append new state to q table
             self.q_table = self.q_table.append(
                 pd.Series(
-                    [-30.00000]*len(self.actions),
+                    [-0.25]*len(self.actions),
                     index=self.q_table.columns,
                     name=state,
                 )
```
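Pulled out of the pandas plumbing, the rescaled selection logic is easier to check. The sketch below is mine, not the repo's: the function name and the plain-array interface are illustrative, but the branching mirrors the diff, with one uniform draw scaled to [0, 100) deciding both whether to explore and which candidate action to take.

```python
import math
import numpy as np

def choose_action_sketch(action_values, epsilon=0.1, precision=1e-6):
    """Sketch of the commit's rescaled epsilon-greedy selection (illustrative)."""
    random_value = np.random.uniform() * 100   # one draw, scaled like randomValue
    normalized_epsilon = epsilon * 100
    max_value = action_values.max()

    # Actions measurably below the maximum are the exploration candidates.
    non_max = [a for a, v in enumerate(action_values)
               if abs(max_value - v) > precision]

    if random_value < normalized_epsilon:
        if not non_max:                        # all values tied: pick uniformly
            return np.random.randint(len(action_values))
        # Split the epsilon mass evenly across the non-greedy actions and
        # reuse the same draw to index into them.
        increment = normalized_epsilon / len(non_max)
        idx = min(math.floor(random_value / increment), len(non_max) - 1)
        return non_max[idx]

    # Greedy branch: split the remaining mass across the (possibly tied) maxima.
    max_actions = [a for a, v in enumerate(action_values)
                   if abs(max_value - v) <= precision]
    increment = (100 - normalized_epsilon) / len(max_actions)
    idx = min(math.floor((random_value - normalized_epsilon) / increment),
              len(max_actions) - 1)
    return max_actions[idx]

# Example: actions 1 and 2 tie for the maximum here.
print(choose_action_sketch(np.array([0.1, 0.5, 0.5, -0.2])))
```

Multiplying both the draw and epsilon by 100 preserves the distribution, since both sides of every comparison scale together. The substantive changes in this hunk are computing `maxValue` once before the branch and reusing the single draw inside each branch, giving each non-greedy action `epsilon / len(nonMaxIndices)` probability and each tied maximum `(1 - epsilon) / len(maxIndices)`.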
Second file, the experiment script (update loops and the `__main__` block):

```diff
@@ -102,7 +102,7 @@ def updateSarsa(env, RL, data, episodes=50):
         # AS the episode progresses
         while True:
             aheadState, rwd, done = env.step(action)  # fresh env
             if(showRender or (episode % renderEveryNth)==0):
                 env.render(sim_speed)
@@ -180,6 +180,8 @@ def updateElibTrace(env, RL, data, episodes=50):
     data['global_reward']=global_reward
 
     for episode in range(episodes):
+        RL.reset_etraces()
+
         t=0
 
         # Moves the agent back to the initial spot on the grid
         if episode == 0:
@@ -220,15 +222,42 @@ def updateElibTrace(env, RL, data, episodes=50):
     print('game over -- Algorithm {} completed'.format(RL.display_name))
     env.destroy()
 
-# def updatePolicyGradient(env, RL, data, episodes=50):
-#
-#
+def updatePolicyGradient(env, RL, data, episodes=50):
+    # policy = np.exp(theta * )
     global_reward = np.zeros(episodes)
     data['global_reward']=global_reward
-#
-#     # end of game
-#     print('game over -- Algorithm {} completed'.format(RL.display_name))
-#     env.destroy()
+
+    for episode in range(episodes):
+        t=0
+
+        # Moves the agent back to the initial spot on the grid
+        if episode == 0:
+            state = env.reset(value = 0)
+        else:
+            state = env.reset()
+
+        debug(2,'state(ep:{},t:{})={}'.format(episode, t, state))
+        I = 1
+
+        while True:
+            action = RL.choose_action(str(state))
+            aheadState, reward, done = env.step(action)
+            RL.learn(state, action, reward, aheadState)
+            I = RL.gamma * I
+            state = aheadState
+            if done:
+                break
+            else:
+                t=t+1
+
+        debug(1, "({}) Episode {}: Length={} Total return = {} ".format(RL.display_name,episode, t, global_reward[episode]),printNow=(episode%printEveryNth==0))
+
+    # end of game
+    print('game over -- Algorithm {} completed'.format(RL.display_name))
+    env.destroy()
 
 if __name__ == "__main__":
@@ -285,16 +314,16 @@ if __name__ == "__main__":
 
     ######################### SARSA Algorithm #################################
-    # env2 = Maze(agentXY,goalXY,wall_shape,pits)
-    # # Translate all grid corrdinates into canvas coordinates
-    # deathStates = []
-    # for pitCoords in env2.pitblocks:
-    #     deathStates.append(str(env2.canvas.coords(pitCoords)))
-    # RL2 = rlalg2(actions=list(range(env2.n_actions)), terminalState=str(env2.canvas.coords(env2.goal)), deathStates=deathStates)
-    # data2={}
-    # env2.after(10, updateSarsa(env2, RL2, data2, episodes))
-    # env2.mainloop()
-    # experiments.append((env2, RL2, data2))
+    env2 = Maze(agentXY,goalXY,wall_shape,pits)
+    # Translate all grid corrdinates into canvas coordinates
+    deathStates = []
+    for pitCoords in env2.pitblocks:
+        deathStates.append(str(env2.canvas.coords(pitCoords)))
+    RL2 = rlalg2(actions=list(range(env2.n_actions)), terminalState=str(env2.canvas.coords(env2.goal)), deathStates=deathStates)
+    data2={}
+    env2.after(10, updateSarsa(env2, RL2, data2, episodes))
+    env2.mainloop()
+    experiments.append((env2, RL2, data2))
     ######################### SARSA Algorithm #################################
```
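The new `updatePolicyGradient` body maintains a discount accumulator with `I = RL.gamma * I` each step, but as committed `I` never reaches `RL.learn()`, and the only hint at the intended policy is the `# policy = np.exp(theta * )` comment. If that means a linear softmax (Gibbs) policy, a gradient step using `I` could look like the sketch below; `theta`, `features`, and both function names are illustrative assumptions, not code from the repo.

```python
import numpy as np

def softmax_policy(theta, features):
    """pi(.|s) for a linear softmax policy: exp(theta . x(s,a)), normalized."""
    prefs = features @ theta        # one preference per action, shape (n_actions,)
    prefs = prefs - prefs.max()     # shift for numerical stability
    expd = np.exp(prefs)
    return expd / expd.sum()

def policy_gradient_step(theta, features, action, delta, I, alpha=0.01):
    """One actor update: theta + alpha * I * delta * grad log pi(action|s).

    For a linear softmax policy, grad log pi(a|s) = x(s,a) - E_pi[x(s,.)].
    Here delta stands in for a TD error and I for the gamma-accumulator the
    new loop maintains; neither is wired into RL.learn() in this commit.
    """
    pi = softmax_policy(theta, features)
    grad_log_pi = features[action] - pi @ features
    return theta + alpha * I * delta * grad_log_pi

# Toy usage with made-up shapes: 4 actions, 3 features per state-action pair.
theta = np.zeros(3)
features = np.eye(4)[:, :3]         # hypothetical x(s, a) rows
theta = policy_gradient_step(theta, features, action=2, delta=0.5, I=0.9)
```

Also worth noting in the same hunk: `global_reward[episode]` is printed in the debug line, but nothing in the new loop updates it yet.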
The same file also swaps the Simple Q Learning block out of the `__main__` section:

```diff
@@ -342,16 +371,16 @@ if __name__ == "__main__":
 
     ######################### Simple Q Learning #################################
-    env2 = Maze(agentXY,goalXY,wall_shape,pits)
-    # Translate all grid corrdinates into canvas coordinates
-    deathStates = []
-    for pitCoords in env2.pitblocks:
-        deathStates.append(str(env2.canvas.coords(pitCoords)))
-    RL2 = qlearn_simple(actions=list(range(env2.n_actions)), terminalState=str(env2.canvas.coords(env2.goal)), deathStates=deathStates)
-    data2={}
-    env2.after(10, updateQLearn(env2, RL2, data2, episodes))
-    env2.mainloop()
-    experiments.append((env2, RL2, data2))
+    # env2 = Maze(agentXY,goalXY,wall_shape,pits)
+    # # Translate all grid corrdinates into canvas coordinates
+    # deathStates = []
+    # for pitCoords in env2.pitblocks:
+    #     deathStates.append(str(env2.canvas.coords(pitCoords)))
+    # RL2 = qlearn_simple(actions=list(range(env2.n_actions)), terminalState=str(env2.canvas.coords(env2.goal)), deathStates=deathStates)
+    # data2={}
+    # env2.after(10, updateQLearn(env2, RL2, data2, episodes))
+    # env2.mainloop()
+    # experiments.append((env2, RL2, data2))
     ######################### Simple Q Learning #################################
```
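One wrinkle spans the two files: `updateElibTrace` now calls `RL.reset_etraces()` at the top of every episode, while the matching method added to the `rlalgorithm` class in the first diff is still commented out. A standalone version that mirrors those commented lines, assuming `e_traces` is a pandas DataFrame indexed by state string with one column per action:

```python
import pandas as pd

def reset_etraces(e_traces: pd.DataFrame) -> None:
    """Zero every eligibility trace in place so the next episode starts fresh.

    Standalone mirror of the commented-out method in the first diff; the
    DataFrame layout (state index, one column per action) is assumed.
    """
    for observation in e_traces.index:
        e_traces.loc[observation] = pd.Series(
            [0.0] * len(e_traces.columns),
            index=e_traces.columns,
            name=observation,
        )

# Example: two states, two actions, traces cleared in place.
traces = pd.DataFrame([[0.3, 0.0], [0.1, 0.7]], index=["s0", "s1"], columns=[0, 1])
reset_etraces(traces)
```

A vectorized `e_traces.loc[:, :] = 0.0` does the same in one assignment; the loop form above just follows the commented code line for line.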