@@ -16,60 +16,6 @@ def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=
         self.gamma = reward_decay
         self.epsilon = e_greedy
 
-    def choose_action(self, observation):
-        pass
-
-    def learn(self, *args):
-        pass
-
-
-# off-policy
-class QTable(RL):
-    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
-        super(QTable, self).__init__(actions, learning_rate, reward_decay, e_greedy)
-
-        self.q_table = pd.DataFrame(columns=self.actions)
-
-    def check_state_exist(self, state):
-        if state not in self.q_table.index:
-            # append new state to q table
-            self.q_table = self.q_table.append(
-                pd.Series(
-                    [0]*len(self.actions),
-                    index=self.q_table.columns,
-                    name=state,
-                )
-            )
-
-    def choose_action(self, observation):
-        self.check_state_exist(observation)
-        # action selection
-        if np.random.uniform() < self.epsilon:
-            # choose best action
-            state_action = self.q_table.ix[observation, :]
-            state_action = state_action.reindex(np.random.permutation(state_action.index))  # some actions have same value
-            action = state_action.argmax()
-        else:
-            # choose random action
-            action = np.random.choice(self.actions)
-        return action
-
-    def learn(self, s, a, r, s_):
-        self.check_state_exist(s_)
-        q_predict = self.q_table.ix[s, a]
-        if s_ != 'terminal':
-            q_target = r + self.gamma * self.q_table.ix[s_, :].max()  # next state is not terminal
-        else:
-            q_target = r  # next state is terminal
-        self.q_table.ix[s, a] += self.lr * (q_target - q_predict)  # update
-
-
-# on-policy
-class SarsaTable(RL):
-
-    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
-        super(SarsaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy)
-
         self.q_table = pd.DataFrame(columns=self.actions)
 
     def check_state_exist(self, state):
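For reference, the two learners removed above differ only in the bootstrap target: off-policy Q-learning backs up the greedy value of the next state, while on-policy SARSA backs up the value of the action actually taken next. A minimal side-by-side sketch of those targets (hypothetical helper names, not part of this commit; `q` is assumed to be a dict keyed by (state, action)):

# Hypothetical illustration of the two removed update rules; not code from this commit.
def q_learning_target(q, r, s_, actions, gamma, terminal='terminal'):
    # off-policy: bootstrap with the best next-state action, regardless of the policy
    return r if s_ == terminal else r + gamma * max(q[(s_, a)] for a in actions)

def sarsa_target(q, r, s_, a_, gamma, terminal='terminal'):
    # on-policy: bootstrap with the next action a_ the policy actually chose
    return r if s_ == terminal else r + gamma * q[(s_, a_)]

# Both learners then apply the same TD step: q[(s, a)] += lr * (target - q[(s, a)])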
@@ -96,26 +42,18 @@ def choose_action(self, observation):
             action = np.random.choice(self.actions)
         return action
 
-    def learn(self, s, a, r, s_, a_):
-        self.check_state_exist(s_)
-        q_predict = self.q_table.ix[s, a]
-        if s_ != 'terminal':
-            q_target = r + self.gamma * self.q_table.ix[s_, a_]  # next state is not terminal
-        else:
-            q_target = r  # next state is terminal
-        self.q_table.ix[s, a] += self.lr * (q_target - q_predict)  # update
+    def learn(self, *args):
+        pass
 
 
 # backward eligibility traces
-class SarsaLambdaTable(SarsaTable):
+class SarsaLambdaTable(RL):
     def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9, trace_decay=0.9):
         super(SarsaLambdaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy)
 
         # backward view, eligibility trace.
         self.lambda_ = trace_decay
-
-    def initialize_trace(self):
-        self.eligibility_trace = self.q_table * 0
+        self.eligibility_trace = self.q_table.copy()
 
     def check_state_exist(self, state):
         if state not in self.q_table.index:
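With SarsaLambdaTable now inheriting directly from RL and building its eligibility_trace in __init__ as a copy of the still-empty q_table, check_state_exist presumably has to register each newly seen state in both tables. Note that the code relies on DataFrame.ix and DataFrame.append, which were removed in pandas 1.0 and 2.0 respectively; on a current pandas the same bookkeeping could be sketched as below (an illustration under those assumptions, not code from this commit), with .ix[s, a] replaced by .loc[s, a] elsewhere.

    # Sketch: modern-pandas version of check_state_exist for this class,
    # using pd.concat in place of the removed DataFrame.append.
    def check_state_exist(self, state):
        if state not in self.q_table.index:
            new_row = pd.Series([0] * len(self.actions), index=self.q_table.columns, name=state)
            # append the all-zero row for the new state to both tables
            self.q_table = pd.concat([self.q_table, new_row.to_frame().T])
            self.eligibility_trace = pd.concat([self.eligibility_trace, new_row.to_frame().T])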
@@ -140,6 +78,11 @@ def learn(self, s, a, r, s_, a_):
         error = q_target - q_predict
 
         # increase trace amount for visited state-action pair
+
+        # Method 1:
+        # self.eligibility_trace.ix[s, a] += 1
+
+        # Method 2:
         self.eligibility_trace.ix[s, :] *= 0
         self.eligibility_trace.ix[s, a] = 1
 
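The two alternatives left in the comments are the standard trace variants: Method 1 is an accumulating trace, where the trace for the visited pair keeps growing on revisits, while the kept Method 2 is a replacing trace, where the whole row for state s is zeroed and only the taken action is set to 1. The rest of a backward-view SARSA(lambda) update then typically spreads the TD error over all traced pairs and decays the traces; a hedged sketch of that tail, which is not shown in this hunk:

        # Sketch of the remaining learn() steps under backward-view SARSA(lambda),
        # assuming q_table and eligibility_trace share the same index and columns.
        self.q_table += self.lr * error * self.eligibility_trace  # TD update weighted by the traces
        self.eligibility_trace *= self.gamma * self.lambda_       # decay all traces by gamma * lambda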