
Commit

Merge pull request ShangtongZhang#109 from wlbksy/chap04
modification for chap04
ShangtongZhang authored Jun 12, 2019
2 parents a30e622 + 00726c8 commit d593539
Showing 5 changed files with 121 additions and 91 deletions.
116 changes: 65 additions & 51 deletions chapter04/car_rental.py
@@ -7,12 +7,13 @@
# declaration at the top #
#######################################################################

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from math import exp, factorial
import numpy as np
import seaborn as sns
from scipy.stats import poisson

matplotlib.use('Agg')

# maximum # of cars in each location
MAX_CARS = 20
@@ -50,47 +51,55 @@
# Probability for poisson distribution
# @lam: lambda should be less than 10 for this function
poisson_cache = dict()
def poisson(n, lam):


def poisson_probability(n, lam):
global poisson_cache
key = n * 10 + lam
if key not in poisson_cache.keys():
poisson_cache[key] = exp(-lam) * pow(lam, n) / factorial(n)
if key not in poisson_cache:
poisson_cache[key] = poisson.pmf(n, lam)
return poisson_cache[key]
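
As a quick aside (editor's sketch, not part of this commit): since lam stays below 10, the key n * 10 + lam identifies each (n, lam) pair uniquely, and scipy's pmf agrees with the closed-form Poisson probability that the previous version computed by hand:

from math import exp, factorial
from scipy.stats import poisson

n, lam = 3, 4
# closed form: e^-lam * lam^n / n!
assert abs(poisson.pmf(n, lam) - exp(-lam) * lam ** n / factorial(n)) < 1e-12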

# @state: [# of cars in first location, # of cars in second location]
# @action: positive if moving cars from first location to second location,
# negative if moving cars from second location to first location
# @stateValue: state value matrix
# @constant_returned_cars: if set True, model is simplified such that
# the # of cars returned in daytime becomes constant
# rather than a random value from poisson distribution, which will reduce calculation time
# and leave the optimal policy/value state matrix almost the same

def expected_return(state, action, state_value, constant_returned_cars):
"""
@state: [# of cars in first location, # of cars in second location]
@action: positive if moving cars from first location to second location,
negative if moving cars from second location to first location
@stateValue: state value matrix
@constant_returned_cars: if set True, model is simplified such that
the # of cars returned in daytime becomes constant
rather than a random value from poisson distribution, which will reduce calculation time
and leave the optimal policy/value state matrix almost the same
"""
# initialize total return
returns = 0.0

# cost for moving cars
returns -= MOVE_CAR_COST * abs(action)

# moving cars
NUM_OF_CARS_FIRST_LOC = min(state[0] - action, MAX_CARS)
NUM_OF_CARS_SECOND_LOC = min(state[1] + action, MAX_CARS)

# go through all possible rental requests
for rental_request_first_loc in range(0, POISSON_UPPER_BOUND):
for rental_request_second_loc in range(0, POISSON_UPPER_BOUND):
# moving cars
num_of_cars_first_loc = int(min(state[0] - action, MAX_CARS))
num_of_cars_second_loc = int(min(state[1] + action, MAX_CARS))
for rental_request_first_loc in range(POISSON_UPPER_BOUND):
for rental_request_second_loc in range(POISSON_UPPER_BOUND):
# probability for current combination of rental requests
prob = poisson_probability(rental_request_first_loc, RENTAL_REQUEST_FIRST_LOC) * \
poisson_probability(rental_request_second_loc, RENTAL_REQUEST_SECOND_LOC)

num_of_cars_first_loc = NUM_OF_CARS_FIRST_LOC
num_of_cars_second_loc = NUM_OF_CARS_SECOND_LOC

# valid rentals are capped at the actual # of cars available
real_rental_first_loc = min(num_of_cars_first_loc, rental_request_first_loc)
real_rental_second_loc = min(num_of_cars_second_loc, rental_request_second_loc)
valid_rental_first_loc = min(num_of_cars_first_loc, rental_request_first_loc)
valid_rental_second_loc = min(num_of_cars_second_loc, rental_request_second_loc)

# get credits for renting
reward = (real_rental_first_loc + real_rental_second_loc) * RENTAL_CREDIT
num_of_cars_first_loc -= real_rental_first_loc
num_of_cars_second_loc -= real_rental_second_loc

# probability for current combination of rental requests
prob = poisson(rental_request_first_loc, RENTAL_REQUEST_FIRST_LOC) * \
poisson(rental_request_second_loc, RENTAL_REQUEST_SECOND_LOC)
reward = (valid_rental_first_loc + valid_rental_second_loc) * RENTAL_CREDIT
num_of_cars_first_loc -= valid_rental_first_loc
num_of_cars_second_loc -= valid_rental_second_loc

if constant_returned_cars:
# get returned cars, those cars can be used for renting tomorrow
@@ -100,15 +109,18 @@ def expected_return(state, action, state_value, constant_returned_cars):
num_of_cars_second_loc = min(num_of_cars_second_loc + returned_cars_second_loc, MAX_CARS)
returns += prob * (reward + DISCOUNT * state_value[num_of_cars_first_loc, num_of_cars_second_loc])
else:
for returned_cars_first_loc in range(0, POISSON_UPPER_BOUND):
for returned_cars_second_loc in range(0, POISSON_UPPER_BOUND):
for returned_cars_first_loc in range(POISSON_UPPER_BOUND):
for returned_cars_second_loc in range(POISSON_UPPER_BOUND):
prob_return = poisson_probability(
returned_cars_first_loc, RETURNS_FIRST_LOC) * poisson_probability(returned_cars_second_loc, RETURNS_SECOND_LOC)
num_of_cars_first_loc_ = min(num_of_cars_first_loc + returned_cars_first_loc, MAX_CARS)
num_of_cars_second_loc_ = min(num_of_cars_second_loc + returned_cars_second_loc, MAX_CARS)
prob_ = poisson(returned_cars_first_loc, RETURNS_FIRST_LOC) * \
poisson(returned_cars_second_loc, RETURNS_SECOND_LOC) * prob
returns += prob_ * (reward + DISCOUNT * state_value[num_of_cars_first_loc_, num_of_cars_second_loc_])
prob_ = prob_return * prob
returns += prob_ * (reward + DISCOUNT *
state_value[num_of_cars_first_loc_, num_of_cars_second_loc_])
return returns
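
In other words, expected_return accumulates prob * (reward + DISCOUNT * V(s')) over every (truncated) Poisson outcome, i.e. a one-step Bellman backup for the given state-action pair. A minimal usage sketch (editor's illustration, assuming the module-level constants defined earlier in the file):

V = np.zeros((MAX_CARS + 1, MAX_CARS + 1))
# With V = 0, the result is just the expected rental income for the night
# minus the per-car cost of moving 2 cars from the first location to the second.
print(expected_return([10, 10], 2, V, constant_returned_cars=True))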


def figure_4_2(constant_returned_cars=True):
value = np.zeros((MAX_CARS + 1, MAX_CARS + 1))
policy = np.zeros(value.shape, dtype=np.int)
@@ -122,37 +134,38 @@ def figure_4_2(constant_returned_cars=True):
fig.set_ylabel('# cars at first location', fontsize=30)
fig.set_yticks(list(reversed(range(MAX_CARS + 1))))
fig.set_xlabel('# cars at second location', fontsize=30)
fig.set_title('policy %d' % (iterations), fontsize=30)
fig.set_title('policy {}'.format(iterations), fontsize=30)

# policy evaluation (in-place)
while True:
new_value = np.copy(value)
old_value = value.copy()
for i in range(MAX_CARS + 1):
for j in range(MAX_CARS + 1):
new_value[i, j] = expected_return([i, j], policy[i, j], new_value,
constant_returned_cars)
value_change = np.abs((new_value - value)).sum()
print('value change %f' % (value_change))
value = new_value
if value_change < 1e-4:
new_state_value = expected_return([i, j], policy[i, j], value, constant_returned_cars)
value[i, j] = new_state_value
max_value_change = abs(old_value - value).max()
print('max value change {}'.format(max_value_change))
if max_value_change < 1e-4:
break

# policy improvement
new_policy = np.copy(policy)
policy_stable = True
for i in range(MAX_CARS + 1):
for j in range(MAX_CARS + 1):
old_action = policy[i, j]
action_returns = []
for action in actions:
if (action >= 0 and i >= action) or (action < 0 and j >= abs(action)):
if (0 <= action <= i) or (-j <= action <= 0):
action_returns.append(expected_return([i, j], action, value, constant_returned_cars))
else:
action_returns.append(-float('inf'))
new_policy[i, j] = actions[np.argmax(action_returns)]

policy_change = (new_policy != policy).sum()
print('policy changed in %d states' % (policy_change))
policy = new_policy
if policy_change == 0:
action_returns.append(-np.inf)
new_action = actions[np.argmax(action_returns)]
policy[i, j] = new_action
if policy_stable and old_action != new_action:
policy_stable = False
print('policy stable {}'.format(policy_stable))

if policy_stable:
fig = sns.heatmap(np.flipud(value), cmap="YlGnBu", ax=axes[-1])
fig.set_ylabel('# cars at first location', fontsize=30)
fig.set_yticks(list(reversed(range(MAX_CARS + 1))))
@@ -165,5 +178,6 @@ def figure_4_2(constant_returned_cars=True):
plt.savefig('../images/figure_4_2.png')
plt.close()


if __name__ == '__main__':
figure_4_2()
figure_4_2()
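
A usage note (editor's addition, not part of the commit): the default constant_returned_cars=True keeps the simplified return model described in the docstring; passing False runs the full model with Poisson-distributed returns, which is considerably slower because of the extra double loop inside expected_return.

figure_4_2(constant_returned_cars=False)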
21 changes: 15 additions & 6 deletions chapter04/gamblers_problem.py
@@ -6,10 +6,11 @@
# declaration at the top #
#######################################################################

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np

matplotlib.use('Agg')

# goal
GOAL = 100
@@ -20,14 +21,19 @@
# probability of head
HEAD_PROB = 0.4


def figure_4_3():
# state value
state_value = np.zeros(GOAL + 1)
state_value[GOAL] = 1.0

sweeps_history = []

# value iteration
while True:
delta = 0.0
old_state_value = state_value.copy()
sweeps_history.append(old_state_value)

for state in STATES[1:GOAL]:
# get possible actions for current state
actions = np.arange(min(state, GOAL - state) + 1)
@@ -36,10 +42,10 @@ def figure_4_3():
action_returns.append(
HEAD_PROB * state_value[state + action] + (1 - HEAD_PROB) * state_value[state - action])
new_value = np.max(action_returns)
delta += np.abs(state_value[state] - new_value)
# update state value
state_value[state] = new_value
delta = abs(state_value - old_state_value).max()
if delta < 1e-9:
sweeps_history.append(state_value)
break

# compute the optimal policy
@@ -58,9 +64,11 @@ def figure_4_3():
plt.figure(figsize=(10, 20))

plt.subplot(2, 1, 1)
plt.plot(state_value)
for sweep, state_value in enumerate(sweeps_history):
plt.plot(state_value, label='sweep {}'.format(sweep))
plt.xlabel('Capital')
plt.ylabel('Value estimates')
plt.legend(loc='best')

plt.subplot(2, 1, 2)
plt.scatter(STATES, policy)
@@ -70,5 +78,6 @@
plt.savefig('../images/figure_4_3.png')
plt.close()


if __name__ == '__main__':
figure_4_3()
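
For reference, each sweep above applies the Bellman optimality backup for the gambler's problem: V(s) <- max over stakes a in {0, ..., min(s, GOAL - s)} of HEAD_PROB * V(s + a) + (1 - HEAD_PROB) * V(s - a). Since the only reward is the 1.0 seeded at state_value[GOAL], the converged value of a state is the probability of reaching the goal under the optimal policy. A single-state illustration (editor's sketch; state_value stands for the converged array computed inside figure_4_3):

state = 50
stakes = np.arange(min(state, GOAL - state) + 1)
backup = max(HEAD_PROB * state_value[state + a] + (1 - HEAD_PROB) * state_value[state - a]
             for a in stakes)
# At convergence, backup equals state_value[50].
print(backup)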
72 changes: 39 additions & 33 deletions chapter04/grid_world.py
@@ -6,12 +6,13 @@
# declaration at the top #
#######################################################################

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table

matplotlib.use('Agg')

WORLD_SIZE = 4
# left, up, right, down
ACTIONS = [np.array([0, -1]),
@@ -20,21 +21,26 @@
np.array([1, 0])]
ACTION_PROB = 0.25


def is_terminal(state):
x, y = state
return (x == 0 and y == 0) or (x == WORLD_SIZE - 1 and y == WORLD_SIZE - 1)


def step(state, action):
state = np.array(state)
next_state = (state + action).tolist()
if is_terminal(state):
return state, 0

next_state = (np.array(state) + action).tolist()
x, y = next_state

if x < 0 or x >= WORLD_SIZE or y < 0 or y >= WORLD_SIZE:
next_state = state.tolist()
next_state = state

reward = -1
return next_state, reward


def draw_image(image):
fig, ax = plt.subplots()
ax.set_axis_off()
@@ -44,58 +50,58 @@ def draw_image(image):
width, height = 1.0 / ncols, 1.0 / nrows

# Add cells
for (i,j), val in np.ndenumerate(image):
# Index either the first or second item of bkg_colors based on
# a checker board pattern
idx = [j % 2, (j + 1) % 2][i % 2]
color = 'white'

for (i, j), val in np.ndenumerate(image):
tb.add_cell(i, j, width, height, text=val,
loc='center', facecolor=color)
loc='center', facecolor='white')

# Row Labels...
for i, label in enumerate(range(len(image))):
tb.add_cell(i, -1, width, height, text=label+1, loc='right',
# Row and column labels...
for i in range(len(image)):
tb.add_cell(i, -1, width, height, text=i+1, loc='right',
edgecolor='none', facecolor='none')
tb.add_cell(-1, i, width, height/2, text=i+1, loc='center',
edgecolor='none', facecolor='none')
# Column Labels...
for j, label in enumerate(range(len(image))):
tb.add_cell(-1, j, width, height/2, text=label+1, loc='center',
edgecolor='none', facecolor='none')
ax.add_table(tb)

def compute_state_value(in_place=False):

def compute_state_value(in_place=True, discount=1.0):
new_state_values = np.zeros((WORLD_SIZE, WORLD_SIZE))
state_values = new_state_values.copy()
iteration = 1
iteration = 0
while True:
src = new_state_values if in_place else state_values
if in_place:
state_values = new_state_values
else:
state_values = new_state_values.copy()
old_state_values = state_values.copy()

for i in range(WORLD_SIZE):
for j in range(WORLD_SIZE):
if is_terminal([i, j]):
continue
value = 0
for action in ACTIONS:
(next_i, next_j), reward = step([i, j], action)
value += ACTION_PROB * (reward + src[next_i, next_j])
value += ACTION_PROB * (reward + discount * state_values[next_i, next_j])
new_state_values[i, j] = value
if np.sum(np.abs(new_state_values - state_values)) < 1e-4:
state_values = new_state_values.copy()

max_delta_value = abs(old_state_values - new_state_values).max()
if max_delta_value < 1e-4:
break

state_values = new_state_values.copy()
iteration += 1

return state_values, iteration
return new_state_values, iteration
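
Either way, the converged array satisfies the Bellman expectation equation under the equiprobable random policy: V(s) equals the average over the four actions of reward + V(s') (with the default discount of 1). A quick check (editor's sketch, assuming the definitions above):

values, _ = compute_state_value(in_place=False)
i, j = 1, 2
backup = 0.0
for action in ACTIONS:
    (next_i, next_j), reward = step([i, j], action)
    backup += ACTION_PROB * (reward + values[next_i, next_j])
assert abs(backup - values[i, j]) < 1e-3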


def figure_4_1():
values, sync_iteration = compute_state_value(in_place=False)
# While the author suggests using in-place iterative policy evaluation,
# Figure 4.1 actually uses out-of-place version.
_, asycn_iteration = compute_state_value(in_place=True)
values, sync_iteration = compute_state_value(in_place=False)
draw_image(np.round(values, decimals=2))
print('In-place: %d iterations' % (asycn_iteration))
print('Synchronous: %d iterations' % (sync_iteration))
print('In-place: {} iterations'.format(asycn_iteration))
print('Synchronous: {} iterations'.format(sync_iteration))

plt.savefig('../images/figure_4_1.png')
plt.close()


if __name__ == '__main__':
figure_4_1()
Binary file modified images/figure_4_3.png
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,4 +1,5 @@
numpy
matplotlib
seaborn
tqdm
tqdm
scipy
