Commit
simplify update equations with respect to the book
wlbksy committed Jun 11, 2019
1 parent dc6b585 commit c40ff48
Showing 2 changed files with 48 additions and 41 deletions.
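In terms of the book, the change rewrites the running-average and action-value updates in the incremental form of Sutton & Barto, Section 2.4:

    NewEstimate <- OldEstimate + StepSize * (Target - OldEstimate)

so the sample-average estimate becomes Q_{n+1} = Q_n + (1/n)(R_n - Q_n). The removed lines compute the same running averages in an explicitly weighted form, so the bandit rewrites below are algebraic rather than behavioral.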
36 changes: 23 additions & 13 deletions chapter02/ten_armed_testbed.py
@@ -9,10 +9,12 @@
#######################################################################

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from tqdm import trange

matplotlib.use('Agg')


class Bandit:
# @k_arm: # of arms
@@ -48,7 +50,7 @@ def reset(self):
# # of chosen times for each action
self.action_count = np.zeros(self.k)

self.best_action = np.argmax(self.q_true)
self.best_action = np.argmax(self.q_true)

# get an action for this bandit
def act(self):
@@ -57,7 +59,7 @@ def act(self):

if self.UCB_param is not None:
UCB_estimation = self.q_estimation + \
self.UCB_param * np.sqrt(np.log(self.time + 1) / (self.action_count + 1e-5))
self.UCB_param * np.sqrt(np.log(self.time + 1) / (self.action_count + 1e-5))
q_best = np.max(UCB_estimation)
return np.random.choice([action for action, q in enumerate(UCB_estimation) if q == q_best])

@@ -74,48 +76,51 @@ def step(self, action):
# generate the reward under N(real reward, 1)
reward = np.random.randn() + self.q_true[action]
self.time += 1
self.average_reward = (self.time - 1.0) / self.time * self.average_reward + reward / self.time
self.action_count[action] += 1
self.average_reward += (reward - self.average_reward) / self.time

if self.sample_averages:
# update estimation using sample averages
self.q_estimation[action] += 1.0 / self.action_count[action] * (reward - self.q_estimation[action])
self.q_estimation[action] += (reward - self.q_estimation[action]) / self.action_count[action]
elif self.gradient:
one_hot = np.zeros(self.k)
one_hot[action] = 1
if self.gradient_baseline:
baseline = self.average_reward
else:
baseline = 0
self.q_estimation = self.q_estimation + self.step_size * (reward - baseline) * (one_hot - self.action_prob)
self.q_estimation += self.step_size * (reward - baseline) * (one_hot - self.action_prob)
else:
# update estimation with constant step size
self.q_estimation[action] += self.step_size * (reward - self.q_estimation[action])
return reward
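A minimal standalone check (not part of the repository) that the removed weighted average and the added incremental update of average_reward track the same quantity; only numpy is assumed:

import numpy as np

rewards = np.random.randn(1000) + 1.5  # arbitrary reward stream, used only for this check

avg_old, avg_new = 0.0, 0.0
for t, r in enumerate(rewards, start=1):
    # removed form: explicit weighted average
    avg_old = (t - 1.0) / t * avg_old + r / t
    # added form: incremental update, as in the book
    avg_new += (r - avg_new) / t

assert np.isclose(avg_old, avg_new)
assert np.isclose(avg_new, rewards.mean())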


def simulate(runs, time, bandits):
best_action_counts = np.zeros((len(bandits), runs, time))
rewards = np.zeros(best_action_counts.shape)
for i, bandit in enumerate(bandits):
for r in tqdm(range(runs)):
for r in trange(runs):
bandit.reset()
for t in range(time):
action = bandit.act()
reward = bandit.step(action)
rewards[i, r, t] = reward
if action == bandit.best_action:
best_action_counts[i, r, t] = 1
best_action_counts = best_action_counts.mean(axis=1)
rewards = rewards.mean(axis=1)
return best_action_counts, rewards
mean_best_action_counts = best_action_counts.mean(axis=1)
mean_rewards = rewards.mean(axis=1)
return mean_best_action_counts, mean_rewards
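A usage sketch of simulate, mirroring what figure_2_2 below does (it assumes this module's Bandit and simulate are in scope):

bandits = [Bandit(epsilon=eps, sample_averages=True) for eps in [0, 0.1, 0.01]]
best_action_counts, rewards = simulate(runs=2000, time=1000, bandits=bandits)
# best_action_counts[i, t]: fraction of runs in which bandit i chose the optimal arm at step t
# rewards[i, t]: reward of bandit i at step t, averaged over runs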


def figure_2_1():
plt.violinplot(dataset=np.random.randn(200,10) + np.random.randn(10))
plt.violinplot(dataset=np.random.randn(200, 10) + np.random.randn(10))
plt.xlabel("Action")
plt.ylabel("Reward distribution")
plt.savefig('../images/figure_2_1.png')
plt.close()


def figure_2_2(runs=2000, time=1000):
epsilons = [0, 0.1, 0.01]
bandits = [Bandit(epsilon=eps, sample_averages=True) for eps in epsilons]
@@ -140,6 +145,7 @@ def figure_2_2(runs=2000, time=1000):
plt.savefig('../images/figure_2_2.png')
plt.close()


def figure_2_3(runs=2000, time=1000):
bandits = []
bandits.append(Bandit(epsilon=0, initial=5, step_size=0.1))
@@ -155,6 +161,7 @@ def figure_2_3(runs=2000, time=1000):
plt.savefig('../images/figure_2_3.png')
plt.close()


def figure_2_4(runs=2000, time=1000):
bandits = []
bandits.append(Bandit(epsilon=0, UCB_param=2, sample_averages=True))
@@ -170,6 +177,7 @@ def figure_2_4(runs=2000, time=1000):
plt.savefig('../images/figure_2_4.png')
plt.close()


def figure_2_5(runs=2000, time=1000):
bandits = []
bandits.append(Bandit(gradient=True, step_size=0.1, gradient_baseline=True, true_reward=4))
@@ -182,7 +190,7 @@ def figure_2_5(runs=2000, time=1000):
'alpha = 0.4, with baseline',
'alpha = 0.4, without baseline']

for i in range(0, len(bandits)):
for i in range(len(bandits)):
plt.plot(best_action_counts[i], label=labels[i])
plt.xlabel('Steps')
plt.ylabel('% Optimal action')
@@ -191,6 +199,7 @@ def figure_2_5(runs=2000, time=1000):
plt.savefig('../images/figure_2_5.png')
plt.close()
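The gradient branch of step() that figure_2_5 exercises is the book's preference update (Section 2.8):

    H_{t+1}(a) = H_t(a) + alpha * (R_t - baseline) * (1{a = A_t} - pi_t(a))

with one_hot standing in for the indicator 1{a = A_t}, action_prob for pi_t, and the baseline equal to the running average reward when gradient_baseline is set (zero otherwise).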


def figure_2_6(runs=2000, time=1000):
labels = ['epsilon-greedy', 'gradient bandit',
'UCB', 'optimistic initialization']
@@ -223,6 +232,7 @@ def figure_2_6(runs=2000, time=1000):
plt.savefig('../images/figure_2_6.png')
plt.close()


if __name__ == '__main__':
figure_2_1()
figure_2_2()
53 changes: 25 additions & 28 deletions chapter03/grid_world.py
@@ -6,12 +6,13 @@
# declaration at the top #
#######################################################################

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table

matplotlib.use('Agg')

WORLD_SIZE = 5
A_POS = [0, 1]
A_PRIME_POS = [4, 1]
@@ -26,14 +27,14 @@
np.array([1, 0])]
ACTION_PROB = 0.25


def step(state, action):
if state == A_POS:
return A_PRIME_POS, 10
if state == B_POS:
return B_PRIME_POS, 5

state = np.array(state)
next_state = (state + action).tolist()
next_state = (np.array(state) + action).tolist()
x, y = next_state
if x < 0 or x >= WORLD_SIZE or y < 0 or y >= WORLD_SIZE:
reward = -1.0
@@ -42,6 +43,7 @@ def step(state, action):
reward = 0
return next_state, reward
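A small usage sketch of step (assumptions: the constants above are in scope, and the collapsed off-grid branch keeps the agent in place, as in the full file):

import numpy as np

# from special state A every action jumps to A' and pays +10
next_state, reward = step(A_POS, np.array([0, 1]))
assert next_state == A_PRIME_POS and reward == 10

# stepping off the grid costs -1 and, per the collapsed lines, leaves the state unchanged
next_state, reward = step([0, 0], np.array([0, -1]))
assert next_state == [0, 0] and reward == -1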


def draw_image(image):
fig, ax = plt.subplots()
ax.set_axis_off()
@@ -51,32 +53,27 @@ def draw_image(image):
width, height = 1.0 / ncols, 1.0 / nrows

# Add cells
for (i,j), val in np.ndenumerate(image):
# Index either the first or second item of bkg_colors based on
# a checker board pattern
idx = [j % 2, (j + 1) % 2][i % 2]
color = 'white'

tb.add_cell(i, j, width, height, text=val,
loc='center', facecolor=color)

# Row Labels...
for i, label in enumerate(range(len(image))):
tb.add_cell(i, -1, width, height, text=label+1, loc='right',
for (i, j), val in np.ndenumerate(image):
tb.add_cell(i, j, width, height, text=val,
loc='center', facecolor='white')

# Row and column labels...
for i in range(len(image)):
tb.add_cell(i, -1, width, height, text=i+1, loc='right',
edgecolor='none', facecolor='none')
# Column Labels...
for j, label in enumerate(range(len(image))):
tb.add_cell(-1, j, width, height/2, text=label+1, loc='center',
edgecolor='none', facecolor='none')
tb.add_cell(-1, i, width, height/2, text=i+1, loc='center',
edgecolor='none', facecolor='none')

ax.add_table(tb)


def figure_3_2():
value = np.zeros((WORLD_SIZE, WORLD_SIZE))
while True:
# keep iteration until convergence
new_value = np.zeros(value.shape)
for i in range(0, WORLD_SIZE):
for j in range(0, WORLD_SIZE):
new_value = np.zeros_like(value)
for i in range(WORLD_SIZE):
for j in range(WORLD_SIZE):
for action in ACTIONS:
(next_i, next_j), reward = step([i, j], action)
# bellman equation
@@ -88,13 +85,14 @@ def figure_3_2():
break
value = new_value
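figure_3_2 sweeps the grid and iterates the Bellman expectation equation for the equiprobable random policy until the values stop changing; since transitions here are deterministic, each backup reduces to

    v_{k+1}(s) = sum_a pi(a|s) * [ r(s, a) + gamma * v_k(s') ],    pi(a|s) = ACTION_PROB = 0.25

with gamma the discount applied in the collapsed backup line (the book uses 0.9 for this gridworld).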


def figure_3_5():
value = np.zeros((WORLD_SIZE, WORLD_SIZE))
while True:
# keep iteration until convergence
new_value = np.zeros(value.shape)
for i in range(0, WORLD_SIZE):
for j in range(0, WORLD_SIZE):
new_value = np.zeros_like(value)
for i in range(WORLD_SIZE):
for j in range(WORLD_SIZE):
values = []
for action in ACTIONS:
(next_i, next_j), reward = step([i, j], action)
@@ -108,8 +106,7 @@ def figure_3_5():
break
value = new_value
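figure_3_5 differs only in replacing the policy expectation with a maximum, i.e. it iterates the Bellman optimality equation

    v_{k+1}(s) = max_a [ r(s, a) + gamma * v_k(s') ]

which the loop realizes by collecting one backed-up value per action in values and, in the collapsed lines, assigning the maximum to new_value[i, j].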


if __name__ == '__main__':
figure_3_2()
figure_3_5()

