
Commit

Merge pull request ShangtongZhang#109 from wlbksy/chap04
modification for chap04
ShangtongZhang authored Jun 12, 2019
2 parents a30e622 + 00726c8 commit d593539
Showing 5 changed files with 121 additions and 91 deletions.
116 changes: 65 additions & 51 deletions chapter04/car_rental.py
@@ -7,12 +7,13 @@
# declaration at the top #
#######################################################################

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from math import exp, factorial
import numpy as np
import seaborn as sns
from scipy.stats import poisson

matplotlib.use('Agg')

# maximum # of cars in each location
MAX_CARS = 20
@@ -50,47 +51,55 @@
# Probability for poisson distribution
# @lam: lambda should be less than 10 for this function
poisson_cache = dict()
def poisson(n, lam):


def poisson_probability(n, lam):
global poisson_cache
key = n * 10 + lam
if key not in poisson_cache.keys():
poisson_cache[key] = exp(-lam) * pow(lam, n) / factorial(n)
if key not in poisson_cache:
poisson_cache[key] = poisson.pmf(n, lam)
return poisson_cache[key]
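
As a quick aside (editor's sketch, not part of this commit): since lam stays below 10, the key n * 10 + lam identifies each (n, lam) pair uniquely, and scipy's pmf agrees with the closed-form Poisson probability that the previous version computed by hand:

from math import exp, factorial
from scipy.stats import poisson

n, lam = 3, 4
# closed form: e^-lam * lam^n / n!
assert abs(poisson.pmf(n, lam) - exp(-lam) * lam ** n / factorial(n)) < 1e-12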

# @state: [# of cars in first location, # of cars in second location]
# @action: positive if moving cars from first location to second location,
# negative if moving cars from second location to first location
# @stateValue: state value matrix
# @constant_returned_cars: if set True, model is simplified such that
# the # of cars returned in daytime becomes constant
# rather than a random value from poisson distribution, which will reduce calculation time
# and leave the optimal policy/value state matrix almost the same

def expected_return(state, action, state_value, constant_returned_cars):
"""
@state: [# of cars in first location, # of cars in second location]
@action: positive if moving cars from first location to second location,
negative if moving cars from second location to first location
@stateValue: state value matrix
@constant_returned_cars: if set True, model is simplified such that
the # of cars returned in daytime becomes constant
rather than a random value from poisson distribution, which will reduce calculation time
and leave the optimal policy/value state matrix almost the same
"""
# initialize total return
returns = 0.0

# cost for moving cars
returns -= MOVE_CAR_COST * abs(action)

# moving cars
NUM_OF_CARS_FIRST_LOC = min(state[0] - action, MAX_CARS)
NUM_OF_CARS_SECOND_LOC = min(state[1] + action, MAX_CARS)

# go through all possible rental requests
for rental_request_first_loc in range(0, POISSON_UPPER_BOUND):
for rental_request_second_loc in range(0, POISSON_UPPER_BOUND):
# moving cars
num_of_cars_first_loc = int(min(state[0] - action, MAX_CARS))
num_of_cars_second_loc = int(min(state[1] + action, MAX_CARS))
for rental_request_first_loc in range(POISSON_UPPER_BOUND):
for rental_request_second_loc in range(POISSON_UPPER_BOUND):
# probability for current combination of rental requests
prob = poisson_probability(rental_request_first_loc, RENTAL_REQUEST_FIRST_LOC) * \
poisson_probability(rental_request_second_loc, RENTAL_REQUEST_SECOND_LOC)

num_of_cars_first_loc = NUM_OF_CARS_FIRST_LOC
num_of_cars_second_loc = NUM_OF_CARS_SECOND_LOC

# valid rentals are capped at the actual # of cars available
real_rental_first_loc = min(num_of_cars_first_loc, rental_request_first_loc)
real_rental_second_loc = min(num_of_cars_second_loc, rental_request_second_loc)
valid_rental_first_loc = min(num_of_cars_first_loc, rental_request_first_loc)
valid_rental_second_loc = min(num_of_cars_second_loc, rental_request_second_loc)

# get credits for renting
reward = (real_rental_first_loc + real_rental_second_loc) * RENTAL_CREDIT
num_of_cars_first_loc -= real_rental_first_loc
num_of_cars_second_loc -= real_rental_second_loc

# probability for current combination of rental requests
prob = poisson(rental_request_first_loc, RENTAL_REQUEST_FIRST_LOC) * \
poisson(rental_request_second_loc, RENTAL_REQUEST_SECOND_LOC)
reward = (valid_rental_first_loc + valid_rental_second_loc) * RENTAL_CREDIT
num_of_cars_first_loc -= valid_rental_first_loc
num_of_cars_second_loc -= valid_rental_second_loc

if constant_returned_cars:
# get returned cars, those cars can be used for renting tomorrow
@@ -100,15 +109,18 @@ def expected_return(state, action, state_value, constant_returned_cars):
num_of_cars_second_loc = min(num_of_cars_second_loc + returned_cars_second_loc, MAX_CARS)
returns += prob * (reward + DISCOUNT * state_value[num_of_cars_first_loc, num_of_cars_second_loc])
else:
for returned_cars_first_loc in range(0, POISSON_UPPER_BOUND):
for returned_cars_second_loc in range(0, POISSON_UPPER_BOUND):
for returned_cars_first_loc in range(POISSON_UPPER_BOUND):
for returned_cars_second_loc in range(POISSON_UPPER_BOUND):
prob_return = poisson_probability(
returned_cars_first_loc, RETURNS_FIRST_LOC) * poisson_probability(returned_cars_second_loc, RETURNS_SECOND_LOC)
num_of_cars_first_loc_ = min(num_of_cars_first_loc + returned_cars_first_loc, MAX_CARS)
num_of_cars_second_loc_ = min(num_of_cars_second_loc + returned_cars_second_loc, MAX_CARS)
prob_ = poisson(returned_cars_first_loc, RETURNS_FIRST_LOC) * \
poisson(returned_cars_second_loc, RETURNS_SECOND_LOC) * prob
returns += prob_ * (reward + DISCOUNT * state_value[num_of_cars_first_loc_, num_of_cars_second_loc_])
prob_ = prob_return * prob
returns += prob_ * (reward + DISCOUNT *
state_value[num_of_cars_first_loc_, num_of_cars_second_loc_])
return returns
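
In other words, expected_return accumulates prob * (reward + DISCOUNT * V(s')) over every (truncated) Poisson outcome, i.e. a one-step Bellman backup for the given state-action pair. A minimal usage sketch (editor's illustration, assuming the module-level constants defined earlier in the file):

V = np.zeros((MAX_CARS + 1, MAX_CARS + 1))
# With V = 0, the result is just the expected rental income for the night
# minus the per-car cost of moving 2 cars from the first location to the second.
print(expected_return([10, 10], 2, V, constant_returned_cars=True))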


def figure_4_2(constant_returned_cars=True):
value = np.zeros((MAX_CARS + 1, MAX_CARS + 1))
policy = np.zeros(value.shape, dtype=np.int)
@@ -122,37 +134,38 @@ def figure_4_2(constant_returned_cars=True):
fig.set_ylabel('# cars at first location', fontsize=30)
fig.set_yticks(list(reversed(range(MAX_CARS + 1))))
fig.set_xlabel('# cars at second location', fontsize=30)
fig.set_title('policy %d' % (iterations), fontsize=30)
fig.set_title('policy {}'.format(iterations), fontsize=30)

# policy evaluation (in-place)
while True:
new_value = np.copy(value)
old_value = value.copy()
for i in range(MAX_CARS + 1):
for j in range(MAX_CARS + 1):
new_value[i, j] = expected_return([i, j], policy[i, j], new_value,
constant_returned_cars)
value_change = np.abs((new_value - value)).sum()
print('value change %f' % (value_change))
value = new_value
if value_change < 1e-4:
new_state_value = expected_return([i, j], policy[i, j], value, constant_returned_cars)
value[i, j] = new_state_value
max_value_change = abs(old_value - value).max()
print('max value change {}'.format(max_value_change))
if max_value_change < 1e-4:
break

# policy improvement
new_policy = np.copy(policy)
policy_stable = True
for i in range(MAX_CARS + 1):
for j in range(MAX_CARS + 1):
old_action = policy[i, j]
action_returns = []
for action in actions:
if (action >= 0 and i >= action) or (action < 0 and j >= abs(action)):
if (0 <= action <= i) or (-j <= action <= 0):
action_returns.append(expected_return([i, j], action, value, constant_returned_cars))
else:
action_returns.append(-float('inf'))
new_policy[i, j] = actions[np.argmax(action_returns)]

policy_change = (new_policy != policy).sum()
print('policy changed in %d states' % (policy_change))
policy = new_policy
if policy_change == 0:
action_returns.append(-np.inf)
new_action = actions[np.argmax(action_returns)]
policy[i, j] = new_action
if policy_stable and old_action != new_action:
policy_stable = False
print('policy stable {}'.format(policy_stable))

if policy_stable:
fig = sns.heatmap(np.flipud(value), cmap="YlGnBu", ax=axes[-1])
fig.set_ylabel('# cars at first location', fontsize=30)
fig.set_yticks(list(reversed(range(MAX_CARS + 1))))
@@ -165,5 +178,6 @@ def figure_4_2(constant_returned_cars=True):
plt.savefig('../images/figure_4_2.png')
plt.close()


if __name__ == '__main__':
figure_4_2()
figure_4_2()
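
A usage note (editor's addition, not part of the commit): the default constant_returned_cars=True keeps the simplified return model described in the docstring; passing False runs the full model with Poisson-distributed returns, which is considerably slower because of the extra double loop inside expected_return.

figure_4_2(constant_returned_cars=False)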
21 changes: 15 additions & 6 deletions chapter04/gamblers_problem.py
@@ -6,10 +6,11 @@
# declaration at the top #
#######################################################################

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np

matplotlib.use('Agg')

# goal
GOAL = 100
@@ -20,14 +21,19 @@
# probability of head
HEAD_PROB = 0.4


def figure_4_3():
# state value
state_value = np.zeros(GOAL + 1)
state_value[GOAL] = 1.0

sweeps_history = []

# value iteration
while True:
delta = 0.0
old_state_value = state_value.copy()
sweeps_history.append(old_state_value)

for state in STATES[1:GOAL]:
# get possible actions for current state
actions = np.arange(min(state, GOAL - state) + 1)
@@ -36,10 +42,10 @@ def figure_4_3():
action_returns.append(
HEAD_PROB * state_value[state + action] + (1 - HEAD_PROB) * state_value[state - action])
new_value = np.max(action_returns)
delta += np.abs(state_value[state] - new_value)
# update state value
state_value[state] = new_value
delta = abs(state_value - old_state_value).max()
if delta < 1e-9:
sweeps_history.append(state_value)
break

# compute the optimal policy
@@ -58,9 +64,11 @@ def figure_4_3():
plt.figure(figsize=(10, 20))

plt.subplot(2, 1, 1)
plt.plot(state_value)
for sweep, state_value in enumerate(sweeps_history):
plt.plot(state_value, label='sweep {}'.format(sweep))
plt.xlabel('Capital')
plt.ylabel('Value estimates')
plt.legend(loc='best')

plt.subplot(2, 1, 2)
plt.scatter(STATES, policy)
@@ -70,5 +78,6 @@
plt.savefig('../images/figure_4_3.png')
plt.close()


if __name__ == '__main__':
figure_4_3()
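
For reference, each sweep above applies the Bellman optimality backup for the gambler's problem: V(s) <- max over stakes a in {0, ..., min(s, GOAL - s)} of HEAD_PROB * V(s + a) + (1 - HEAD_PROB) * V(s - a). Since the only reward is the 1.0 seeded at state_value[GOAL], the converged value of a state is the probability of reaching the goal under the optimal policy. A single-state illustration (editor's sketch; state_value stands for the converged array computed inside figure_4_3):

state = 50
stakes = np.arange(min(state, GOAL - state) + 1)
backup = max(HEAD_PROB * state_value[state + a] + (1 - HEAD_PROB) * state_value[state - a]
             for a in stakes)
# At convergence, backup equals state_value[50].
print(backup)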
72 changes: 39 additions & 33 deletions chapter04/grid_world.py
@@ -6,12 +6,13 @@
# declaration at the top #
#######################################################################

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table

matplotlib.use('Agg')

WORLD_SIZE = 4
# left, up, right, down
ACTIONS = [np.array([0, -1]),
@@ -20,21 +21,26 @@
np.array([1, 0])]
ACTION_PROB = 0.25


def is_terminal(state):
x, y = state
return (x == 0 and y == 0) or (x == WORLD_SIZE - 1 and y == WORLD_SIZE - 1)


def step(state, action):
state = np.array(state)
next_state = (state + action).tolist()
if is_terminal(state):
return state, 0

next_state = (np.array(state) + action).tolist()
x, y = next_state

if x < 0 or x >= WORLD_SIZE or y < 0 or y >= WORLD_SIZE:
next_state = state.tolist()
next_state = state

reward = -1
return next_state, reward


def draw_image(image):
fig, ax = plt.subplots()
ax.set_axis_off()
@@ -44,58 +50,58 @@ def draw_image(image):
width, height = 1.0 / ncols, 1.0 / nrows

# Add cells
for (i,j), val in np.ndenumerate(image):
# Index either the first or second item of bkg_colors based on
# a checker board pattern
idx = [j % 2, (j + 1) % 2][i % 2]
color = 'white'

for (i, j), val in np.ndenumerate(image):
tb.add_cell(i, j, width, height, text=val,
loc='center', facecolor=color)
loc='center', facecolor='white')

# Row Labels...
for i, label in enumerate(range(len(image))):
tb.add_cell(i, -1, width, height, text=label+1, loc='right',
# Row and column labels...
for i in range(len(image)):
tb.add_cell(i, -1, width, height, text=i+1, loc='right',
edgecolor='none', facecolor='none')
tb.add_cell(-1, i, width, height/2, text=i+1, loc='center',
edgecolor='none', facecolor='none')
# Column Labels...
for j, label in enumerate(range(len(image))):
tb.add_cell(-1, j, width, height/2, text=label+1, loc='center',
edgecolor='none', facecolor='none')
ax.add_table(tb)

def compute_state_value(in_place=False):

def compute_state_value(in_place=True, discount=1.0):
new_state_values = np.zeros((WORLD_SIZE, WORLD_SIZE))
state_values = new_state_values.copy()
iteration = 1
iteration = 0
while True:
src = new_state_values if in_place else state_values
if in_place:
state_values = new_state_values
else:
state_values = new_state_values.copy()
old_state_values = state_values.copy()

for i in range(WORLD_SIZE):
for j in range(WORLD_SIZE):
if is_terminal([i, j]):
continue
value = 0
for action in ACTIONS:
(next_i, next_j), reward = step([i, j], action)
value += ACTION_PROB * (reward + src[next_i, next_j])
value += ACTION_PROB * (reward + discount * state_values[next_i, next_j])
new_state_values[i, j] = value
if np.sum(np.abs(new_state_values - state_values)) < 1e-4:
state_values = new_state_values.copy()

max_delta_value = abs(old_state_values - new_state_values).max()
if max_delta_value < 1e-4:
break

state_values = new_state_values.copy()
iteration += 1

return state_values, iteration
return new_state_values, iteration
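
Either way, the converged array satisfies the Bellman expectation equation under the equiprobable random policy: V(s) equals the average over the four actions of reward + V(s') (with the default discount of 1). A quick check (editor's sketch, assuming the definitions above):

values, _ = compute_state_value(in_place=False)
i, j = 1, 2
backup = 0.0
for action in ACTIONS:
    (next_i, next_j), reward = step([i, j], action)
    backup += ACTION_PROB * (reward + values[next_i, next_j])
assert abs(backup - values[i, j]) < 1e-3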


def figure_4_1():
values, sync_iteration = compute_state_value(in_place=False)
# While the author suggests using in-place iterative policy evaluation,
# Figure 4.1 actually uses out-of-place version.
_, asycn_iteration = compute_state_value(in_place=True)
values, sync_iteration = compute_state_value(in_place=False)
draw_image(np.round(values, decimals=2))
print('In-place: %d iterations' % (asycn_iteration))
print('Synchronous: %d iterations' % (sync_iteration))
print('In-place: {} iterations'.format(asycn_iteration))
print('Synchronous: {} iterations'.format(sync_iteration))

plt.savefig('../images/figure_4_1.png')
plt.close()


if __name__ == '__main__':
figure_4_1()
Binary file modified images/figure_4_3.png
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,4 +1,5 @@
numpy
matplotlib
seaborn
tqdm
tqdm
scipy
