Commit
simplify update equations with respect to the book
wlbksy committed Jun 11, 2019
1 parent dc6b585 commit c40ff48
Showing 2 changed files with 48 additions and 41 deletions.
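In terms of the book, the change rewrites the running-average and action-value updates in the incremental form of Sutton & Barto, Section 2.4:

    NewEstimate <- OldEstimate + StepSize * (Target - OldEstimate)

so the sample-average estimate becomes Q_{n+1} = Q_n + (1/n)(R_n - Q_n). The removed lines compute the same running averages in an explicitly weighted form, so the bandit rewrites below are algebraic rather than behavioral.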
36 changes: 23 additions & 13 deletions chapter02/ten_armed_testbed.py
@@ -9,10 +9,12 @@
#######################################################################

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from tqdm import trange

matplotlib.use('Agg')


class Bandit:
# @k_arm: # of arms
@@ -48,7 +50,7 @@ def reset(self):
# # of chosen times for each action
self.action_count = np.zeros(self.k)

self.best_action = np.argmax(self.q_true)
self.best_action = np.argmax(self.q_true)

# get an action for this bandit
def act(self):
@@ -57,7 +59,7 @@ def act(self):

if self.UCB_param is not None:
UCB_estimation = self.q_estimation + \
self.UCB_param * np.sqrt(np.log(self.time + 1) / (self.action_count + 1e-5))
self.UCB_param * np.sqrt(np.log(self.time + 1) / (self.action_count + 1e-5))
q_best = np.max(UCB_estimation)
return np.random.choice([action for action, q in enumerate(UCB_estimation) if q == q_best])

@@ -74,48 +76,51 @@ def step(self, action):
# generate the reward under N(real reward, 1)
reward = np.random.randn() + self.q_true[action]
self.time += 1
self.average_reward = (self.time - 1.0) / self.time * self.average_reward + reward / self.time
self.action_count[action] += 1
self.average_reward += (reward - self.average_reward) / self.time

if self.sample_averages:
# update estimation using sample averages
self.q_estimation[action] += 1.0 / self.action_count[action] * (reward - self.q_estimation[action])
self.q_estimation[action] += (reward - self.q_estimation[action]) / self.action_count[action]
elif self.gradient:
one_hot = np.zeros(self.k)
one_hot[action] = 1
if self.gradient_baseline:
baseline = self.average_reward
else:
baseline = 0
self.q_estimation = self.q_estimation + self.step_size * (reward - baseline) * (one_hot - self.action_prob)
self.q_estimation += self.step_size * (reward - baseline) * (one_hot - self.action_prob)
else:
# update estimation with constant step size
self.q_estimation[action] += self.step_size * (reward - self.q_estimation[action])
return reward
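A minimal standalone check (not part of the repository) that the removed weighted average and the added incremental update of average_reward track the same quantity; only numpy is assumed:

import numpy as np

rewards = np.random.randn(1000) + 1.5  # arbitrary reward stream, used only for this check

avg_old, avg_new = 0.0, 0.0
for t, r in enumerate(rewards, start=1):
    # removed form: explicit weighted average
    avg_old = (t - 1.0) / t * avg_old + r / t
    # added form: incremental update, as in the book
    avg_new += (r - avg_new) / t

assert np.isclose(avg_old, avg_new)
assert np.isclose(avg_new, rewards.mean())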


def simulate(runs, time, bandits):
best_action_counts = np.zeros((len(bandits), runs, time))
rewards = np.zeros(best_action_counts.shape)
for i, bandit in enumerate(bandits):
for r in tqdm(range(runs)):
for r in trange(runs):
bandit.reset()
for t in range(time):
action = bandit.act()
reward = bandit.step(action)
rewards[i, r, t] = reward
if action == bandit.best_action:
best_action_counts[i, r, t] = 1
best_action_counts = best_action_counts.mean(axis=1)
rewards = rewards.mean(axis=1)
return best_action_counts, rewards
mean_best_action_counts = best_action_counts.mean(axis=1)
mean_rewards = rewards.mean(axis=1)
return mean_best_action_counts, mean_rewards
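A usage sketch of simulate, mirroring what figure_2_2 below does (it assumes this module's Bandit and simulate are in scope):

bandits = [Bandit(epsilon=eps, sample_averages=True) for eps in [0, 0.1, 0.01]]
best_action_counts, rewards = simulate(runs=2000, time=1000, bandits=bandits)
# best_action_counts[i, t]: fraction of runs in which bandit i chose the optimal arm at step t
# rewards[i, t]: reward of bandit i at step t, averaged over runs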


def figure_2_1():
plt.violinplot(dataset=np.random.randn(200,10) + np.random.randn(10))
plt.violinplot(dataset=np.random.randn(200, 10) + np.random.randn(10))
plt.xlabel("Action")
plt.ylabel("Reward distribution")
plt.savefig('../images/figure_2_1.png')
plt.close()


def figure_2_2(runs=2000, time=1000):
epsilons = [0, 0.1, 0.01]
bandits = [Bandit(epsilon=eps, sample_averages=True) for eps in epsilons]
@@ -140,6 +145,7 @@ def figure_2_2(runs=2000, time=1000):
plt.savefig('../images/figure_2_2.png')
plt.close()


def figure_2_3(runs=2000, time=1000):
bandits = []
bandits.append(Bandit(epsilon=0, initial=5, step_size=0.1))
@@ -155,6 +161,7 @@ def figure_2_3(runs=2000, time=1000):
plt.savefig('../images/figure_2_3.png')
plt.close()


def figure_2_4(runs=2000, time=1000):
bandits = []
bandits.append(Bandit(epsilon=0, UCB_param=2, sample_averages=True))
@@ -170,6 +177,7 @@ def figure_2_4(runs=2000, time=1000):
plt.savefig('../images/figure_2_4.png')
plt.close()


def figure_2_5(runs=2000, time=1000):
bandits = []
bandits.append(Bandit(gradient=True, step_size=0.1, gradient_baseline=True, true_reward=4))
@@ -182,7 +190,7 @@ def figure_2_5(runs=2000, time=1000):
'alpha = 0.4, with baseline',
'alpha = 0.4, without baseline']

for i in range(0, len(bandits)):
for i in range(len(bandits)):
plt.plot(best_action_counts[i], label=labels[i])
plt.xlabel('Steps')
plt.ylabel('% Optimal action')
@@ -191,6 +199,7 @@ def figure_2_5(runs=2000, time=1000):
plt.savefig('../images/figure_2_5.png')
plt.close()
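The gradient branch of step() that figure_2_5 exercises is the book's preference update (Section 2.8):

    H_{t+1}(a) = H_t(a) + alpha * (R_t - baseline) * (1{a = A_t} - pi_t(a))

with one_hot standing in for the indicator 1{a = A_t}, action_prob for pi_t, and the baseline equal to the running average reward when gradient_baseline is set (zero otherwise).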


def figure_2_6(runs=2000, time=1000):
labels = ['epsilon-greedy', 'gradient bandit',
'UCB', 'optimistic initialization']
@@ -223,6 +232,7 @@ def figure_2_6(runs=2000, time=1000):
plt.savefig('../images/figure_2_6.png')
plt.close()


if __name__ == '__main__':
figure_2_1()
figure_2_2()
53 changes: 25 additions & 28 deletions chapter03/grid_world.py
@@ -6,12 +6,13 @@
# declaration at the top #
#######################################################################

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table

matplotlib.use('Agg')

WORLD_SIZE = 5
A_POS = [0, 1]
A_PRIME_POS = [4, 1]
@@ -26,14 +27,14 @@
np.array([1, 0])]
ACTION_PROB = 0.25


def step(state, action):
if state == A_POS:
return A_PRIME_POS, 10
if state == B_POS:
return B_PRIME_POS, 5

state = np.array(state)
next_state = (state + action).tolist()
next_state = (np.array(state) + action).tolist()
x, y = next_state
if x < 0 or x >= WORLD_SIZE or y < 0 or y >= WORLD_SIZE:
reward = -1.0
@@ -42,6 +43,7 @@ def step(state, action):
reward = 0
return next_state, reward
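A small usage sketch of step (assumptions: the constants above are in scope, and the collapsed off-grid branch keeps the agent in place, as in the full file):

import numpy as np

# from special state A every action jumps to A' and pays +10
next_state, reward = step(A_POS, np.array([0, 1]))
assert next_state == A_PRIME_POS and reward == 10

# stepping off the grid costs -1 and, per the collapsed lines, leaves the state unchanged
next_state, reward = step([0, 0], np.array([0, -1]))
assert next_state == [0, 0] and reward == -1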


def draw_image(image):
fig, ax = plt.subplots()
ax.set_axis_off()
@@ -51,32 +53,27 @@ def draw_image(image):
width, height = 1.0 / ncols, 1.0 / nrows

# Add cells
for (i,j), val in np.ndenumerate(image):
# Index either the first or second item of bkg_colors based on
# a checker board pattern
idx = [j % 2, (j + 1) % 2][i % 2]
color = 'white'

tb.add_cell(i, j, width, height, text=val,
loc='center', facecolor=color)

# Row Labels...
for i, label in enumerate(range(len(image))):
tb.add_cell(i, -1, width, height, text=label+1, loc='right',
for (i, j), val in np.ndenumerate(image):
tb.add_cell(i, j, width, height, text=val,
loc='center', facecolor='white')

# Row and column labels...
for i in range(len(image)):
tb.add_cell(i, -1, width, height, text=i+1, loc='right',
edgecolor='none', facecolor='none')
# Column Labels...
for j, label in enumerate(range(len(image))):
tb.add_cell(-1, j, width, height/2, text=label+1, loc='center',
edgecolor='none', facecolor='none')
tb.add_cell(-1, i, width, height/2, text=i+1, loc='center',
edgecolor='none', facecolor='none')

ax.add_table(tb)


def figure_3_2():
value = np.zeros((WORLD_SIZE, WORLD_SIZE))
while True:
# keep iteration until convergence
new_value = np.zeros(value.shape)
for i in range(0, WORLD_SIZE):
for j in range(0, WORLD_SIZE):
new_value = np.zeros_like(value)
for i in range(WORLD_SIZE):
for j in range(WORLD_SIZE):
for action in ACTIONS:
(next_i, next_j), reward = step([i, j], action)
# bellman equation
@@ -88,13 +85,14 @@ def figure_3_2():
break
value = new_value
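figure_3_2 sweeps the grid and iterates the Bellman expectation equation for the equiprobable random policy until the values stop changing; since transitions here are deterministic, each backup reduces to

    v_{k+1}(s) = sum_a pi(a|s) * [ r(s, a) + gamma * v_k(s') ],    pi(a|s) = ACTION_PROB = 0.25

with gamma the discount applied in the collapsed backup line (the book uses 0.9 for this gridworld).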


def figure_3_5():
value = np.zeros((WORLD_SIZE, WORLD_SIZE))
while True:
# keep iteration until convergence
new_value = np.zeros(value.shape)
for i in range(0, WORLD_SIZE):
for j in range(0, WORLD_SIZE):
new_value = np.zeros_like(value)
for i in range(WORLD_SIZE):
for j in range(WORLD_SIZE):
values = []
for action in ACTIONS:
(next_i, next_j), reward = step([i, j], action)
@@ -108,8 +106,7 @@ def figure_3_5():
break
value = new_value
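figure_3_5 differs only in replacing the policy expectation with a maximum, i.e. it iterates the Bellman optimality equation

    v_{k+1}(s) = max_a [ r(s, a) + gamma * v_k(s') ]

which the loop realizes by collecting one backed-up value per action in values and, in the collapsed lines, assigning the maximum to new_value[i, j].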


if __name__ == '__main__':
figure_3_2()
figure_3_5()

