uliang · Dec 2, 2020
diff --git a/‎Chapter05/Solving RL.ipynb
+2,117 b/‎Chapter05/Solving RL.ipynb
+2,117
@@ -0,0 +1,2117 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import gym"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Dynamic Programming"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class FoodTruck(gym.Env):\n",
+    "    \"\"\"Weekly inventory-planning environment for a food truck.\n",
+    "\n",
+    "    A state is a ``(day, inventory)`` tuple and an action is the number of\n",
+    "    units bought that morning. An episode starts on Monday with nothing\n",
+    "    in stock and ends once the day becomes \"Weekend\".\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        # Possible daily demand levels and the probability of each one.\n",
+    "        self.v_demand = [100, 200, 300, 400]\n",
+    "        self.p_demand = [0.3, 0.4, 0.2, 0.1]\n",
+    "        # Capacity equals the largest demand level.\n",
+    "        self.capacity = self.v_demand[-1]\n",
+    "        self.days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', \"Weekend\"]\n",
+    "        self.unit_cost = 4\n",
+    "        self.net_revenue = 7\n",
+    "        self.action_space = [0, 100, 200, 300, 400]\n",
+    "        # Monday always starts empty; every later day may start with leftovers.\n",
+    "        later_states = [(d, i) for d in self.days[1:]\n",
+    "                        for i in [0, 100, 200, 300]]\n",
+    "        self.state_space = [(\"Mon\", 0)] + later_states\n",
+    "\n",
+    "    def get_next_state_reward(self, state, action, demand):\n",
+    "        \"\"\"Apply one day's purchase ``action`` and realised ``demand`` to ``state``.\n",
+    "\n",
+    "        Returns a dict with the keys 'next_day', 'starting_inventory',\n",
+    "        'cost', 'sales', 'revenue', 'next_inventory' and 'reward'.\n",
+    "        \"\"\"\n",
+    "        day, inventory = state\n",
+    "        next_day = self.days[self.days.index(day) + 1]\n",
+    "        # The full purchase is paid for even if capacity truncates the stock.\n",
+    "        starting_inventory = min(self.capacity, inventory + action)\n",
+    "        cost = self.unit_cost * action\n",
+    "        sales = min(starting_inventory, demand)\n",
+    "        revenue = self.net_revenue * sales\n",
+    "        return {\n",
+    "            'next_day': next_day,\n",
+    "            'starting_inventory': starting_inventory,\n",
+    "            'cost': cost,\n",
+    "            'sales': sales,\n",
+    "            'revenue': revenue,\n",
+    "            'next_inventory': starting_inventory - sales,\n",
+    "            'reward': revenue - cost,\n",
+    "        }\n",
+    "\n",
+    "    def get_transition_prob(self, state, action):\n",
+    "        \"\"\"Return ``{(next_state, reward): probability}`` for ``(state, action)``.\"\"\"\n",
+    "        next_s_r_prob = {}\n",
+    "        for demand, prob in zip(self.v_demand, self.p_demand):\n",
+    "            outcome = self.get_next_state_reward(state, action, demand)\n",
+    "            next_s = (outcome['next_day'], outcome['next_inventory'])\n",
+    "            key = (next_s, outcome['reward'])\n",
+    "            # Several demand levels can produce the same outcome; pool them.\n",
+    "            next_s_r_prob[key] = next_s_r_prob.get(key, 0) + prob\n",
+    "        return next_s_r_prob\n",
+    "\n",
+    "    def reset(self):\n",
+    "        \"\"\"Start a new week (Monday, empty truck) and return that state.\"\"\"\n",
+    "        self.day, self.inventory = \"Mon\", 0\n",
+    "        return (self.day, self.inventory)\n",
+    "\n",
+    "    def is_terminal(self, state):\n",
+    "        \"\"\"Return True when the day component of ``state`` is \"Weekend\".\"\"\"\n",
+    "        day, _ = state\n",
+    "        return day == \"Weekend\"\n",
+    "\n",
+    "    def step(self, action):\n",
+    "        \"\"\"Sample a demand, advance one day, return (state, reward, done, info).\"\"\"\n",
+    "        demand = np.random.choice(self.v_demand, p=self.p_demand)\n",
+    "        outcome = self.get_next_state_reward((self.day, self.inventory),\n",
+    "                                             action, demand)\n",
+    "        self.day = outcome['next_day']\n",
+    "        self.inventory = outcome['next_inventory']\n",
+    "        state = (self.day, self.inventory)\n",
+    "        info = {'demand': demand, 'sales': outcome['sales']}\n",
+    "        return state, outcome['reward'], self.is_terminal(state), info"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2590.83"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Simulating an arbitrary policy\n",
+    "# The policy orders whatever is needed to bring inventory up to 300 units.\n",
+    "# The seed is fixed so the reported average is reproducible.\n",
+    "np.random.seed(0)\n",
+    "foodtruck = FoodTruck()\n",
+    "rewards = []\n",
+    "for i_episode in range(10000):\n",
+    "    state = foodtruck.reset()\n",
+    "    done = False\n",
+    "    ep_reward = 0\n",
+    "    while not done:\n",
+    "        day, inventory = state\n",
+    "        action = max(0, 300 - inventory)\n",
+    "        state, reward, done, info = foodtruck.step(action) \n",
+    "        ep_reward += reward\n",
+    "    rewards.append(ep_reward)\n",
+    "# Mean total reward per episode over all simulated episodes\n",
+    "np.mean(rewards)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Expected profit of a single day when stocking `inv` units\n",
+    "ucost = 4\n",
+    "uprice = 7\n",
+    "v_demand = [100, 200, 300, 400]\n",
+    "p_demand = [0.3, 0.4, 0.2, 0.1]\n",
+    "inv = 400\n",
+    "# Sales are capped by the stock on hand, so weight min(demand, inv) by probability.\n",
+    "expected_sales = np.sum([p * min(d, inv) for d, p in zip(v_demand, p_demand)])\n",
+    "profit = uprice * expected_sales - inv * ucost\n",
+    "print(profit)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Policy Evaluation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def base_policy(states):\n",
+    "    \"\"\"Build a stochastic restocking policy for every state in ``states``.\n",
+    "\n",
+    "    With 300 or more units on hand nothing is ordered; otherwise the\n",
+    "    order tops inventory up to 200 or to 300 with equal probability.\n",
+    "    Returns ``{state: {action: probability}}``.\n",
+    "    \"\"\"\n",
+    "    def action_probs(inventory):\n",
+    "        if inventory >= 300:\n",
+    "            return {0: 1}\n",
+    "        return {200 - inventory: 0.5, 300 - inventory: 0.5}\n",
+    "\n",
+    "    policy = {}\n",
+    "    for s in states:\n",
+    "        _, inventory = s\n",
+    "        policy[s] = action_probs(inventory)\n",
+    "    return policy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def expected_update(env, v, s, prob_a, gamma):\n",
+    "    \"\"\"One-step expected return from ``s`` when acting according to ``prob_a``.\n",
+    "\n",
+    "    Sums ``prob_a[a] * p(next_s, r | s, a) * (r + gamma * v[next_s])`` over\n",
+    "    every action in ``prob_a`` and every outcome reported by\n",
+    "    ``env.get_transition_prob(s, a)``.\n",
+    "    \"\"\"\n",
+    "    total = 0\n",
+    "    for a, p_action in prob_a.items():\n",
+    "        outcomes = env.get_transition_prob(s, a)\n",
+    "        for (next_s, r), p_outcome in outcomes.items():\n",
+    "            total += p_action * p_outcome * (r + gamma * v[next_s])\n",
+    "    return total"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def policy_evaluation(env, policy, max_iter=100, v=None, eps=0.1, gamma=1):\n",
+    "    \"\"\"Iteratively estimate the state values of ``policy``.\n",
+    "\n",
+    "    Sweeps the non-terminal states, replacing each value in place with its\n",
+    "    expected update, until the largest change within a sweep is below\n",
+    "    ``eps`` or ``max_iter`` sweeps have been made. A missing or empty ``v``\n",
+    "    starts from zeros over ``env.state_space``; a supplied ``v`` is updated\n",
+    "    in place. Returns the value dict.\n",
+    "    \"\"\"\n",
+    "    if not v:\n",
+    "        v = dict.fromkeys(env.state_space, 0)\n",
+    "    sweeps = 0\n",
+    "    while True:\n",
+    "        sweeps += 1\n",
+    "        max_delta = 0\n",
+    "        for s in v:\n",
+    "            if env.is_terminal(s):\n",
+    "                continue\n",
+    "            previous = v[s]\n",
+    "            v[s] = expected_update(env, v, s, policy[s], gamma)\n",
+    "            max_delta = max(max_delta, abs(v[s] - previous))\n",
+    "        if max_delta < eps:\n",
+    "            print(\"Converged in\", sweeps, \"iterations.\")\n",
+    "            return v\n",
+    "        if sweeps == max_iter:\n",
+    "            print(\"Terminating after\", sweeps, \"iterations.\")\n",
+    "            return v"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Instantiate the environment and build the base policy over its state space\n",
+    "foodtruck = FoodTruck()\n",
+    "policy = base_policy(foodtruck.state_space)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Converged in 6 iterations.\n",
+      "Expected weekly profit: 2515.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Evaluate the policy; the value of the Monday start state is reported\n",
+    "# as the expected weekly profit.\n",
+    "v = policy_evaluation(foodtruck, policy)\n",
+    "print(\"Expected weekly profit:\", v[\"Mon\", 0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The state values:\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{('Mon', 0): 2515.0,\n",
+       " ('Tue', 0): 1960.0,\n",
+       " ('Tue', 100): 2360.0,\n",
+       " ('Tue', 200): 2760.0,\n",
+       " ('Tue', 300): 3205.0,\n",
+       " ('Wed', 0): 1405.0,\n",
+       " ('Wed', 100): 1805.0,\n",
+       " ('Wed', 200): 2205.0,\n",
+       " ('Wed', 300): 2650.0,\n",
+       " ('Thu', 0): 850.0000000000001,\n",
+       " ('Thu', 100): 1250.0,\n",
+       " ('Thu', 200): 1650.0,\n",
+       " ('Thu', 300): 2095.0,\n",
+       " ('Fri', 0): 295.00000000000006,\n",
+       " ('Fri', 100): 695.0000000000001,\n",
+       " ('Fri', 200): 1095.0,\n",
+       " ('Fri', 300): 1400.0,\n",
+       " ('Weekend', 0): 0,\n",
+       " ('Weekend', 100): 0,\n",
+       " ('Weekend', 200): 0,\n",
+       " ('Weekend', 300): 0}"
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Display the value estimate of every state\n",
+    "print(\"The state values:\")\n",
+    "v"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def choose_action(state, policy):\n",
+    "    \"\"\"Sample an action for ``state`` from the distribution ``policy[state]``.\"\"\"\n",
+    "    action_probs = policy[state]\n",
+    "    candidates = list(action_probs)\n",
+    "    weights = list(action_probs.values())\n",
+    "    return np.random.choice(a=candidates, p=weights)\n",
+    "\n",
+    "def simulate_policy(policy, n_episodes):\n",
+    "    \"\"\"Roll out ``policy`` for ``n_episodes`` weeks and print the mean reward.\n",
+    "\n",
+    "    The global NumPy seed is reset to 0 at the start of every call and a\n",
+    "    fresh ``FoodTruck`` environment is used.\n",
+    "    \"\"\"\n",
+    "    np.random.seed(0)\n",
+    "    foodtruck = FoodTruck()\n",
+    "    rewards = []\n",
+    "    for _ in range(n_episodes):\n",
+    "        state = foodtruck.reset()\n",
+    "        done, ep_reward = False, 0\n",
+    "        while not done:\n",
+    "            chosen = choose_action(state, policy)\n",
+    "            state, reward, done, _ = foodtruck.step(chosen)\n",
+    "            ep_reward += reward\n",
+    "        rewards.append(ep_reward)\n",
+    "    print(\"Expected weekly profit:\", np.mean(rewards))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Expected weekly profit: 2518.1\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Estimate the policy's weekly profit from 1000 simulated weeks\n",
+    "simulate_policy(policy, 1000)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Policy Iteration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def policy_improvement(env, v, s, actions, gamma):\n",
+    "    \"\"\"Greedy one-step improvement at state ``s``.\n",
+    "\n",
+    "    Returns ``(prob_a, max_q)``: ``prob_a`` is the deterministic choice\n",
+    "    ``{best_action: 1}`` and ``max_q`` is that action's value under ``v``.\n",
+    "    A terminal state yields ``({}, 0)``. When several actions tie, the one\n",
+    "    appearing last in ``actions`` is kept.\n",
+    "    \"\"\"\n",
+    "    if env.is_terminal(s):\n",
+    "        return {}, 0\n",
+    "    # -np.inf rather than np.NINF: that alias was removed in NumPy 2.0.\n",
+    "    max_q = -np.inf\n",
+    "    best_a = None\n",
+    "    for a in actions:\n",
+    "        q_sa = expected_update(env, v, s, {a: 1}, gamma)\n",
+    "        # >= (not >) so that later actions win ties.\n",
+    "        if q_sa >= max_q:\n",
+    "            max_q, best_a = q_sa, a\n",
+    "    return {best_a: 1}, max_q"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def policy_iteration(env,  eps=0.1, gamma=1):\n",
+    "    \"\"\"Alternate policy evaluation and greedy improvement until stable.\n",
+    "\n",
+    "    Starts from a random deterministic policy (NumPy seed fixed to 1) and\n",
+    "    zero state values, and stops when an improvement sweep reproduces the\n",
+    "    previous policy. Returns ``(policy, v)``.\n",
+    "    \"\"\"\n",
+    "    np.random.seed(1)\n",
+    "    states, actions = env.state_space, env.action_space\n",
+    "    policy = {s: {np.random.choice(actions): 1} for s in states}\n",
+    "    v = dict.fromkeys(states, 0)\n",
+    "    stable = False\n",
+    "    while not stable:\n",
+    "        v = policy_evaluation(env, policy, v=v, eps=eps, gamma=gamma)\n",
+    "        improved = {s: policy_improvement(env, v, s, actions, gamma)[0]\n",
+    "                    for s in states}\n",
+    "        stable = policy == improved\n",
+    "        policy = improved\n",
+    "    print(\"Optimal policy found!\")\n",
+    "    return policy, v"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Converged in 6 iterations.\n",
+      "Converged in 6 iterations.\n",
+      "Converged in 5 iterations.\n",
+      "Optimal policy found!\n",
+      "Expected weekly profit: 2880.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Run policy iteration and report the value of the Monday start state\n",
+    "policy, v = policy_iteration(foodtruck)\n",
+    "print(\"Expected weekly profit:\", v[\"Mon\", 0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{('Mon', 0): {400: 1}, ('Tue', 0): {400: 1}, ('Tue', 100): {300: 1}, ('Tue', 200): {200: 1}, ('Tue', 300): {100: 1}, ('Wed', 0): {400: 1}, ('Wed', 100): {300: 1}, ('Wed', 200): {200: 1}, ('Wed', 300): {100: 1}, ('Thu', 0): {300: 1}, ('Thu', 100): {200: 1}, ('Thu', 200): {100: 1}, ('Thu', 300): {0: 1}, ('Fri', 0): {200: 1}, ('Fri', 100): {100: 1}, ('Fri', 200): {0: 1}, ('Fri', 300): {0: 1}, ('Weekend', 0): {}, ('Weekend', 100): {}, ('Weekend', 200): {}, ('Weekend', 300): {}}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Show the action distribution selected for each state\n",
+    "print(policy)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Value Iteration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def value_iteration(env, max_iter=100, eps=0.1, gamma=1):\n",
+    "    \"\"\"Value iteration over ``env.state_space``.\n",
+    "\n",
+    "    Every sweep replaces each state's value, in place, with the best\n",
+    "    one-step action value and records the corresponding greedy action.\n",
+    "    Sweeping stops when the largest change is below ``eps`` or after\n",
+    "    ``max_iter`` sweeps. Returns ``(policy, v)``.\n",
+    "    \"\"\"\n",
+    "    states, actions = env.state_space, env.action_space\n",
+    "    v = dict.fromkeys(states, 0)\n",
+    "    policy = {}\n",
+    "    sweeps = 0\n",
+    "    while True:\n",
+    "        sweeps += 1\n",
+    "        max_delta = 0\n",
+    "        for s in states:\n",
+    "            previous = v[s]\n",
+    "            policy[s], v[s] = policy_improvement(env, v, s, actions, gamma)\n",
+    "            max_delta = max(max_delta, abs(v[s] - previous))\n",
+    "        if max_delta < eps:\n",
+    "            print(\"Converged in\", sweeps, \"iterations.\")\n",
+    "            return policy, v\n",
+    "        if sweeps == max_iter:\n",
+    "            print(\"Terminating after\", sweeps, \"iterations.\")\n",
+    "            return policy, v"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Converged in 6 iterations.\n",
+      "6\n",
+      "Expected weekly profit: 2880.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Run value iteration and report the value of the Monday start state\n",
+    "policy, v = value_iteration(foodtruck)\n",
+    "print(\"Expected weekly profit:\", v[\"Mon\", 0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show the action distribution selected for each state\n",
+    "print(policy)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generalized_policy_iteration(env, max_iter=2, eps=0.1, gamma=1):\n",
+    "    \"\"\"Generalized policy iteration with truncated policy evaluation.\n",
+    "\n",
+    "    Each round runs one greedy improvement sweep over all states and then\n",
+    "    at most ``max_iter`` policy-evaluation sweeps. It stops once no state\n",
+    "    value changed by ``eps`` or more across a whole round.\n",
+    "    Returns ``(policy, v)``.\n",
+    "    \"\"\"\n",
+    "    np.random.seed(1)\n",
+    "    # The environment stores its states in ``state_space`` (as read by the\n",
+    "    # other solvers); it never sets ``observation_space``.\n",
+    "    states = env.state_space\n",
+    "    actions = env.action_space\n",
+    "    # Random start kept for a reproducible RNG stream; the first\n",
+    "    # improvement sweep below replaces it before it is used.\n",
+    "    policy = {s: {np.random.choice(actions): 1}\n",
+    "             for s in states}\n",
+    "    v = {s: 0 for s in states}\n",
+    "    k = 0\n",
+    "    while True:\n",
+    "        # Snapshot: both steps below update ``v`` in place.\n",
+    "        v_old = v.copy()\n",
+    "        policy = {}\n",
+    "        for s in states:\n",
+    "            policy[s], v[s] = policy_improvement(env, v, s,\n",
+    "                                                 actions, gamma)\n",
+    "        v = policy_evaluation(env, policy,\n",
+    "                              max_iter=max_iter, v=v,\n",
+    "                              eps=eps, gamma=gamma)\n",
+    "        deltas = [abs(v[s] - v_old[s]) for s in v]\n",
+    "        max_delta = np.amax(deltas)\n",
+    "        k += 1\n",
+    "        if max_delta < eps:\n",
+    "            print(\"GPI converged in\", k, \"iterations.\")\n",
+    "            print(deltas)\n",
+    "            break\n",
+    "\n",
+    "    print(\"Optimal policy found!\")\n",
+    "    return policy, v"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run GPI with at most two evaluation sweeps per round\n",
+    "policy, v = generalized_policy_iteration(foodtruck, max_iter=2, eps=0.1, gamma=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Report the value of the Monday start state and the resulting policy\n",
+    "print(\"Expected weekly profit:\", v[\"Mon\", 0])\n",
+    "print(policy)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Display the value estimate of every state\n",
+    "v"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Monte Carlo Methods"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## MC Prediction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def first_visit_return(returns, trajectory, gamma):\n",
+    "    \"\"\"Record the first-visit return of each state seen in ``trajectory``.\n",
+    "\n",
+    "    ``trajectory`` is a sequence of ``(state, action, reward)`` steps. The\n",
+    "    discounted return is accumulated backwards from the final step and is\n",
+    "    appended to ``returns[state]`` only at the earliest occurrence of that\n",
+    "    state. ``returns`` is modified in place and also returned.\n",
+    "    \"\"\"\n",
+    "    G = 0\n",
+    "    last = len(trajectory) - 1\n",
+    "    for steps_back, (s, _, r) in enumerate(reversed(trajectory)):\n",
+    "        G = r + gamma * G\n",
+    "        # Steps that precede the current one in forward time.\n",
+    "        earlier = trajectory[:last - steps_back]\n",
+    "        if any(s == step[0] for step in earlier):\n",
+    "            continue\n",
+    "        returns.setdefault(s, []).append(G)\n",
+    "    return returns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_trajectory(env, policy):\n",
+    "    \"\"\"Run one episode under ``policy`` and return its list of steps.\n",
+    "\n",
+    "    Each step is a list ``[state, action, reward]`` in which ``state`` is\n",
+    "    the state the action was taken from.\n",
+    "    \"\"\"\n",
+    "    trajectory = []\n",
+    "    state = env.reset()\n",
+    "    done = False\n",
+    "    while not done:\n",
+    "        action = choose_action(state, policy)\n",
+    "        next_state, reward, done, _ = env.step(action)\n",
+    "        trajectory.append([state, action, reward])\n",
+    "        state = next_state\n",
+    "    return trajectory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def first_visit_mc(env, policy, gamma, n_trajectories):\n",
+    "    \"\"\"First-visit Monte Carlo estimate of the state values of ``policy``.\n",
+    "\n",
+    "    Samples ``n_trajectories`` episodes (NumPy seed fixed to 0) and returns\n",
+    "    ``{state: mean first-visit return rounded to 1 decimal}`` for every\n",
+    "    state of ``env.state_space`` that was visited at least once.\n",
+    "    \"\"\"\n",
+    "    np.random.seed(0)\n",
+    "    returns = {}\n",
+    "    for _ in range(n_trajectories):\n",
+    "        episode = get_trajectory(env, policy)\n",
+    "        returns = first_visit_return(returns, episode, gamma)\n",
+    "    return {s: np.round(np.mean(returns[s]), 1)\n",
+    "            for s in env.state_space if s in returns}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Instantiate the environment and build the base policy over its state space\n",
+    "foodtruck = FoodTruck()\n",
+    "policy = base_policy(foodtruck.state_space)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{('Mon', 0): 2515.9,\n",
+       " ('Tue', 0): 1959.1,\n",
+       " ('Tue', 100): 2362.2,\n",
+       " ('Tue', 200): 2765.2,\n",
+       " ('Wed', 0): 1411.3,\n",
+       " ('Wed', 100): 1804.2,\n",
+       " ('Wed', 200): 2198.9,\n",
+       " ('Thu', 0): 852.9,\n",
+       " ('Thu', 100): 1265.4,\n",
+       " ('Thu', 200): 1644.4,\n",
+       " ('Fri', 0): 301.1,\n",
+       " ('Fri', 100): 696.5,\n",
+       " ('Fri', 200): 1097.2}"
+      ]
+     },
+     "execution_count": 77,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Monte Carlo value estimate from 10000 sampled episodes with gamma = 1\n",
+    "v_est = first_visit_mc(foodtruck, policy, 1, 10000)\n",
+    "v_est"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Converged in 6 iterations.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Values of the same policy computed by iterative policy evaluation\n",
+    "v_true = policy_evaluation(foodtruck, policy)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{('Mon', 0): 2515.0,\n",
+       " ('Tue', 0): 1960.0,\n",
+       " ('Tue', 100): 2360.0,\n",
+       " ('Tue', 200): 2760.0,\n",
+       " ('Tue', 300): 3205.0,\n",
+       " ('Wed', 0): 1405.0,\n",
+       " ('Wed', 100): 1805.0,\n",
+       " ('Wed', 200): 2205.0,\n",
+       " ('Wed', 300): 2650.0,\n",
+       " ('Thu', 0): 850.0000000000001,\n",
+       " ('Thu', 100): 1250.0,\n",
+       " ('Thu', 200): 1650.0,\n",
+       " ('Thu', 300): 2095.0,\n",
+       " ('Fri', 0): 295.00000000000006,\n",
+       " ('Fri', 100): 695.0000000000001,\n",
+       " ('Fri', 200): 1095.0,\n",
+       " ('Fri', 300): 1400.0,\n",
+       " ('Weekend', 0): 0,\n",
+       " ('Weekend', 100): 0,\n",
+       " ('Weekend', 200): 0,\n",
+       " ('Weekend', 300): 0}"
+      ]
+     },
+     "execution_count": 63,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Display the policy-evaluation values\n",
+    "v_true"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# v_est = first_visit_mc(foodtruck, policy, 1, 5)\n",
+    "# {s: v_est[s] for s in sorted(v_est)}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# v_est = first_visit_mc(foodtruck, policy, 1, 10)\n",
+    "# {s: v_est[s] for s in sorted(v_est)}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# v_est = first_visit_mc(foodtruck, policy, 1, 100)\n",
+    "# {s: v_est[s] for s in sorted(v_est)}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# v_est = first_visit_mc(foodtruck, policy, 1, 1000)\n",
+    "# {s: v_est[s] for s in sorted(v_est)}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# v_est = first_visit_mc(foodtruck, policy, 1, 10000)\n",
+    "# {s: v_est[s] for s in sorted(v_est)}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## On-policy Monte Carlo Control"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import operator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_eps_greedy(actions, eps, a_best):\n",
+    "    \"\"\"Return the eps-greedy distribution over ``actions`` around ``a_best``.\n",
+    "\n",
+    "    Every action receives ``eps / len(actions)``; ``a_best`` additionally\n",
+    "    receives the remaining ``1 - eps``.\n",
+    "    \"\"\"\n",
+    "    n_a = len(actions)\n",
+    "    return {a: (1 - eps + eps/n_a) if a == a_best else eps/n_a\n",
+    "            for a in actions}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 92,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_random_policy(states, actions):\n",
+    "    \"\"\"Return a policy assigning equal probability to every action in every state.\"\"\"\n",
+    "    n_a = len(actions)\n",
+    "    return {s: {a: 1/n_a for a in actions} for s in states}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 93,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def on_policy_first_visit_mc(env, n_iter, eps, gamma):\n",
+    "    \"\"\"On-policy first-visit Monte Carlo control with an eps-greedy policy.\n",
+    "\n",
+    "    Starting from a uniform random policy (NumPy seed fixed to 0), samples\n",
+    "    ``n_iter`` episodes. After each episode the first-visit return of every\n",
+    "    (state, action) pair is averaged into ``Q`` and the policy at that\n",
+    "    state is made eps-greedy with respect to the updated ``Q``.\n",
+    "    Returns ``(policy, Q, Q_n)`` where ``Q_n`` counts the recorded returns.\n",
+    "    \"\"\"\n",
+    "    np.random.seed(0)\n",
+    "    states, actions = env.state_space, env.action_space\n",
+    "    policy = get_random_policy(states, actions)\n",
+    "    Q = {s: dict.fromkeys(actions, 0) for s in states}\n",
+    "    Q_n = {s: dict.fromkeys(actions, 0) for s in states}\n",
+    "    for i in range(n_iter):\n",
+    "        if i % 10000 == 0:\n",
+    "            print(\"Iteration:\", i)\n",
+    "        trajectory = get_trajectory(env, policy)\n",
+    "        G = 0\n",
+    "        last = len(trajectory) - 1\n",
+    "        for steps_back, (s, a, r) in enumerate(reversed(trajectory)):\n",
+    "            G = r + gamma * G\n",
+    "            # Skip the pair if it already occurred earlier in the episode.\n",
+    "            earlier = trajectory[:last - steps_back]\n",
+    "            if any((s, a) == (step[0], step[1]) for step in earlier):\n",
+    "                continue\n",
+    "            # Fold G into the running mean of returns for (s, a).\n",
+    "            Q[s][a] = Q_n[s][a] * Q[s][a] + G\n",
+    "            Q_n[s][a] += 1\n",
+    "            Q[s][a] /= Q_n[s][a]\n",
+    "            a_best = max(Q[s], key=Q[s].get)\n",
+    "            policy[s] = get_eps_greedy(actions, eps, a_best)\n",
+    "    return policy, Q, Q_n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 94,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Iteration: 0\n",
+      "Iteration: 10000\n",
+      "Iteration: 20000\n",
+      "Iteration: 30000\n",
+      "Iteration: 40000\n",
+      "Iteration: 50000\n",
+      "Iteration: 60000\n",
+      "Iteration: 70000\n",
+      "Iteration: 80000\n",
+      "Iteration: 90000\n",
+      "Iteration: 100000\n",
+      "Iteration: 110000\n",
+      "Iteration: 120000\n",
+      "Iteration: 130000\n",
+      "Iteration: 140000\n",
+      "Iteration: 150000\n",
+      "Iteration: 160000\n",
+      "Iteration: 170000\n",
+      "Iteration: 180000\n",
+      "Iteration: 190000\n",
+      "Iteration: 200000\n",
+      "Iteration: 210000\n",
+      "Iteration: 220000\n",
+      "Iteration: 230000\n",
+      "Iteration: 240000\n",
+      "Iteration: 250000\n",
+      "Iteration: 260000\n",
+      "Iteration: 270000\n",
+      "Iteration: 280000\n",
+      "Iteration: 290000\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Train for 300000 episodes with eps = 0.05 and gamma = 1\n",
+    "policy, Q, Q_n = on_policy_first_visit_mc(foodtruck, \n",
+    "                                          300000, \n",
+    "                                          0.05, \n",
+    "                                          1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{('Mon', 0): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.01, 400: 0.96},\n",
+       " ('Tue', 0): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.01, 400: 0.96},\n",
+       " ('Tue', 100): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.96, 400: 0.01},\n",
+       " ('Tue', 200): {0: 0.01, 100: 0.01, 200: 0.96, 300: 0.01, 400: 0.01},\n",
+       " ('Tue', 300): {0: 0.01, 100: 0.96, 200: 0.01, 300: 0.01, 400: 0.01},\n",
+       " ('Wed', 0): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.01, 400: 0.96},\n",
+       " ('Wed', 100): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.96, 400: 0.01},\n",
+       " ('Wed', 200): {0: 0.01, 100: 0.01, 200: 0.96, 300: 0.01, 400: 0.01},\n",
+       " ('Wed', 300): {0: 0.01, 100: 0.96, 200: 0.01, 300: 0.01, 400: 0.01},\n",
+       " ('Thu', 0): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.96, 400: 0.01},\n",
+       " ('Thu', 100): {0: 0.01, 100: 0.01, 200: 0.96, 300: 0.01, 400: 0.01},\n",
+       " ('Thu', 200): {0: 0.01, 100: 0.96, 200: 0.01, 300: 0.01, 400: 0.01},\n",
+       " ('Thu', 300): {0: 0.96, 100: 0.01, 200: 0.01, 300: 0.01, 400: 0.01},\n",
+       " ('Fri', 0): {0: 0.01, 100: 0.01, 200: 0.96, 300: 0.01, 400: 0.01},\n",
+       " ('Fri', 100): {0: 0.01, 100: 0.96, 200: 0.01, 300: 0.01, 400: 0.01},\n",
+       " ('Fri', 200): {0: 0.96, 100: 0.01, 200: 0.01, 300: 0.01, 400: 0.01},\n",
+       " ('Fri', 300): {0: 0.96, 100: 0.01, 200: 0.01, 300: 0.01, 400: 0.01},\n",
+       " ('Weekend', 0): {0: 0.2, 100: 0.2, 200: 0.2, 300: 0.2, 400: 0.2},\n",
+       " ('Weekend', 100): {0: 0.2, 100: 0.2, 200: 0.2, 300: 0.2, 400: 0.2},\n",
+       " ('Weekend', 200): {0: 0.2, 100: 0.2, 200: 0.2, 300: 0.2, 400: 0.2},\n",
+       " ('Weekend', 300): {0: 0.2, 100: 0.2, 200: 0.2, 300: 0.2, 400: 0.2}}"
+      ]
+     },
+     "execution_count": 90,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Display the eps-greedy policy returned by Monte Carlo control\n",
+    "policy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{('Mon', 0): {0: 2162.733333333329,\n",
+       "  100: 2468.4210526315796,\n",
+       "  200: 2668.7695190505888,\n",
+       "  300: 2739.300098231826,\n",
+       "  400: 2809.1632287569414},\n",
+       " ('Tue', 0): {0: 1539.1011235955057,\n",
+       "  100: 1857.630979498861,\n",
+       "  200: 2018.3222958057395,\n",
+       "  300: 2101.97486535009,\n",
+       "  400: 2181.249139237035},\n",
+       " ('Tue', 100): {0: 2243.7967115097176,\n",
+       "  100: 2410.7182940516295,\n",
+       "  200: 2537.853107344635,\n",
+       "  300: 2587.222441722628,\n",
+       "  400: 2170.4049844236765},\n",
+       " ('Tue', 200): {0: 2828.295819935689,\n",
+       "  100: 2953.6330631123433,\n",
+       "  200: 2996.437255166801,\n",
+       "  300: 2623.82297551789,\n",
+       "  400: 2224.710080285464},\n",
+       " ('Tue', 300): {0: 3383.880037488284,\n",
+       "  100: 3395.720002238628,\n",
+       "  200: 2939.4218134034168,\n",
+       "  300: 2572.2506393861877,\n",
+       "  400: 2162.3395149786},\n",
+       " ('Wed', 0): {0: 935.7142857142857,\n",
+       "  100: 1256.8720379146928,\n",
+       "  200: 1400.5025125628129,\n",
+       "  300: 1547.1040492055338,\n",
+       "  400: 1579.8683874265244},\n",
+       " ('Wed', 100): {0: 1639.7689768976904,\n",
+       "  100: 1868.1431005110733,\n",
+       "  200: 1908.107074569789,\n",
+       "  300: 1989.5285532259934,\n",
+       "  400: 1605.021520803444},\n",
+       " ('Wed', 200): {0: 2250.352733686064,\n",
+       "  100: 2341.068532900906,\n",
+       "  200: 2383.0059803588124,\n",
+       "  300: 1962.005277044855,\n",
+       "  400: 1573.4144222415298},\n",
+       " ('Wed', 300): {0: 2758.00389203214,\n",
+       "  100: 2778.022627490717,\n",
+       "  200: 2393.5081148564277,\n",
+       "  300: 1985.8374384236454,\n",
+       "  400: 1614.6220570012397},\n",
+       " ('Thu', 0): {0: 369.36619718309856,\n",
+       "  100: 684.2803030303028,\n",
+       "  200: 903.1539888682744,\n",
+       "  300: 972.1787871266652,\n",
+       "  400: 930.1247771836006},\n",
+       " ('Thu', 100): {0: 1084.478371501272,\n",
+       "  100: 1289.5073754522657,\n",
+       "  200: 1372.1298508969842,\n",
+       "  300: 1332.386447699365,\n",
+       "  400: 953.6523929471032},\n",
+       " ('Thu', 200): {0: 1677.668161434978,\n",
+       "  100: 1769.2753842946279,\n",
+       "  200: 1733.8299737072743,\n",
+       "  300: 1325.3393665158371,\n",
+       "  400: 919.6219621962197},\n",
+       " ('Thu', 300): {0: 2169.691663233083,\n",
+       "  100: 2166.585956416466,\n",
+       "  200: 1757.9545454545455,\n",
+       "  300: 1333.6569579288014,\n",
+       "  400: 953.3227848101266},\n",
+       " ('Fri', 0): {0: 0.0,\n",
+       "  100: 300.0,\n",
+       "  200: 388.81505831705283,\n",
+       "  300: 186.4516129032258,\n",
+       "  400: -142.74809160305333},\n",
+       " ('Fri', 100): {0: 700.0,\n",
+       "  100: 790.5049146968516,\n",
+       "  200: 607.3234524847425,\n",
+       "  300: 267.3796791443842,\n",
+       "  400: -110.91954022988506},\n",
+       " ('Fri', 200): {0: 1190.3311990960892,\n",
+       "  100: 988.7775551102206,\n",
+       "  200: 640.5092592592597,\n",
+       "  300: 267.4418604651163,\n",
+       "  400: -112.39263803680979},\n",
+       " ('Fri', 300): {0: 1399.4254760341432,\n",
+       "  100: 1152.7272727272725,\n",
+       "  200: 742.1875,\n",
+       "  300: 284.4827586206896,\n",
+       "  400: -120.0},\n",
+       " ('Weekend', 0): {0: 0, 100: 0, 200: 0, 300: 0, 400: 0},\n",
+       " ('Weekend', 100): {0: 0, 100: 0, 200: 0, 300: 0, 400: 0},\n",
+       " ('Weekend', 200): {0: 0, 100: 0, 200: 0, 300: 0, 400: 0},\n",
+       " ('Weekend', 300): {0: 0, 100: 0, 200: 0, 300: 0, 400: 0}}"
+      ]
+     },
+     "execution_count": 95,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "Q"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Off-policy Monte Carlo Control"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 108,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def off_policy_mc(env, n_iter, eps, gamma):\n",
+    "    \"\"\"Off-policy Monte Carlo control with weighted importance sampling.\n",
+    "\n",
+    "    Episodes are produced by `get_trajectory` under a behavior policy\n",
+    "    that starts from `get_random_policy` and is replaced, state by\n",
+    "    state, with the result of `get_eps_greedy` around the current\n",
+    "    greedy action. The target policy is greedy with respect to `Q`.\n",
+    "\n",
+    "    Args:\n",
+    "        env: environment exposing `state_space` and `action_space`.\n",
+    "        n_iter: number of episodes to generate.\n",
+    "        eps: exploration rate forwarded to `get_eps_greedy`.\n",
+    "        gamma: discount factor applied when accumulating the return.\n",
+    "\n",
+    "    Returns:\n",
+    "        (target_policy, Q): `target_policy` maps every visited state\n",
+    "        to its greedy action (ordered like `env.state_space`); `Q`\n",
+    "        maps state -> action -> estimated action value.\n",
+    "    \"\"\"\n",
+    "    np.random.seed(0)\n",
+    "    states = env.state_space\n",
+    "    actions = env.action_space\n",
+    "    Q = {s: {a: 0 for a in actions} for s in states}\n",
+    "    # C holds the cumulative importance weight per (state, action).\n",
+    "    C = {s: {a: 0 for a in actions} for s in states}\n",
+    "    target_policy = {}\n",
+    "    behavior_policy = get_random_policy(states, \n",
+    "                                        actions)\n",
+    "    for i in range(n_iter):\n",
+    "        if i % 10000 == 0:\n",
+    "            print(\"Iteration:\", i)\n",
+    "        trajectory = get_trajectory(env, \n",
+    "                                    behavior_policy)\n",
+    "        # Snapshot the probability the behavior policy gave to each\n",
+    "        # action actually taken BEFORE the policy is updated below:\n",
+    "        # the importance ratio must use the policy that generated\n",
+    "        # this episode, not the freshly improved one.\n",
+    "        b_probs = [behavior_policy[s][a] \n",
+    "                   for s, a, _ in trajectory]\n",
+    "        G = 0\n",
+    "        W = 1\n",
+    "        # Walk the episode backwards so G is the return from step t.\n",
+    "        for (s, a, r), b_prob in zip(reversed(trajectory), \n",
+    "                                     reversed(b_probs)):\n",
+    "            G = r + gamma * G\n",
+    "            C[s][a] += W\n",
+    "            Q[s][a] += (W/C[s][a]) * (G - Q[s][a])\n",
+    "            a_best = max(Q[s].items(), \n",
+    "                         key=operator.itemgetter(1))[0]\n",
+    "            target_policy[s] = a_best\n",
+    "            behavior_policy[s] = get_eps_greedy(actions, \n",
+    "                                                eps, \n",
+    "                                                a_best)\n",
+    "            # A non-greedy action has zero probability under the\n",
+    "            # target policy, so earlier steps get zero weight.\n",
+    "            if a != a_best:\n",
+    "                break\n",
+    "            W = W / b_prob\n",
+    "    target_policy = {s: target_policy[s] for s in states\n",
+    "                                   if s in target_policy}\n",
+    "    return target_policy, Q"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Iteration: 0\n",
+      "Iteration: 10000\n",
+      "Iteration: 20000\n",
+      "Iteration: 30000\n",
+      "Iteration: 40000\n",
+      "Iteration: 50000\n",
+      "Iteration: 60000\n",
+      "Iteration: 70000\n",
+      "Iteration: 80000\n",
+      "Iteration: 90000\n",
+      "Iteration: 100000\n",
+      "Iteration: 110000\n",
+      "Iteration: 120000\n",
+      "Iteration: 130000\n",
+      "Iteration: 140000\n",
+      "Iteration: 150000\n",
+      "Iteration: 160000\n",
+      "Iteration: 170000\n",
+      "Iteration: 180000\n",
+      "Iteration: 190000\n",
+      "Iteration: 200000\n",
+      "Iteration: 210000\n",
+      "Iteration: 220000\n",
+      "Iteration: 230000\n",
+      "Iteration: 240000\n",
+      "Iteration: 250000\n",
+      "Iteration: 260000\n",
+      "Iteration: 270000\n",
+      "Iteration: 280000\n",
+      "Iteration: 290000\n"
+     ]
+    }
+   ],
+   "source": [
+    "policy, Q = off_policy_mc(foodtruck, 300000, 0.05, 1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{('Mon', 0): 400,\n",
+       " ('Tue', 0): 400,\n",
+       " ('Tue', 100): 300,\n",
+       " ('Tue', 200): 200,\n",
+       " ('Tue', 300): 100,\n",
+       " ('Wed', 0): 400,\n",
+       " ('Wed', 100): 300,\n",
+       " ('Wed', 200): 200,\n",
+       " ('Wed', 300): 100,\n",
+       " ('Thu', 0): 300,\n",
+       " ('Thu', 100): 200,\n",
+       " ('Thu', 200): 100,\n",
+       " ('Thu', 300): 0,\n",
+       " ('Fri', 0): 200,\n",
+       " ('Fri', 100): 100,\n",
+       " ('Fri', 200): 0,\n",
+       " ('Fri', 300): 0}"
+      ]
+     },
+     "execution_count": 110,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "policy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 111,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{('Mon', 0): {0: 2232.674050632915,\n",
+       "  100: 2539.364696421396,\n",
+       "  200: 2725.681570338065,\n",
+       "  300: 2822.8136882129284,\n",
+       "  400: 2878.458190025779},\n",
+       " ('Tue', 0): {0: 1594.8051948051952,\n",
+       "  100: 1928.976034858388,\n",
+       "  200: 2067.4576271186465,\n",
+       "  300: 2207.8512396694205,\n",
+       "  400: 2239.8886329583893},\n",
+       " ('Tue', 100): {0: 2318.9435336976317,\n",
+       "  100: 2536.8012422360302,\n",
+       "  200: 2549.486301369862,\n",
+       "  300: 2650.193090274893,\n",
+       "  400: 2256.120527306967},\n",
+       " ('Tue', 200): {0: 2922.175290390706,\n",
+       "  100: 3012.8990770161868,\n",
+       "  200: 3052.4769607403373,\n",
+       "  300: 2689.515219842163,\n",
+       "  400: 2293.305439330548},\n",
+       " ('Tue', 300): {0: 3420.032031538755,\n",
+       "  100: 3453.749726573689,\n",
+       "  200: 3014.1210374639763,\n",
+       "  300: 2635.802469135803,\n",
+       "  400: 2233.3333333333344},\n",
+       " ('Wed', 0): {0: 927.9702970297026,\n",
+       "  100: 1303.1026252983302,\n",
+       "  200: 1428.831168831168,\n",
+       "  300: 1566.1498708010329,\n",
+       "  400: 1616.5133331502423},\n",
+       " ('Wed', 100): {0: 1683.8652482269495,\n",
+       "  100: 1896.0360360360366,\n",
+       "  200: 1976.8450184501858,\n",
+       "  300: 2024.3386976631361,\n",
+       "  400: 1650.87440381558},\n",
+       " ('Wed', 200): {0: 2277.8664007976076,\n",
+       "  100: 2405.7504873294333,\n",
+       "  200: 2419.006699098848,\n",
+       "  300: 2000.3857280617174,\n",
+       "  400: 1608.8068181818178},\n",
+       " ('Wed', 300): {0: 2779.4180573384715,\n",
+       "  100: 2818.4754229486366,\n",
+       "  200: 2422.7878787878817,\n",
+       "  300: 2017.7989130434773,\n",
+       "  400: 1660.6602475928496},\n",
+       " ('Thu', 0): {0: 369.164265129683,\n",
+       "  100: 684.9275362318838,\n",
+       "  200: 912.9056047197645,\n",
+       "  300: 988.2722582352171,\n",
+       "  400: 926.3157894736842},\n",
+       " ('Thu', 100): {0: 1090.7738095238096,\n",
+       "  100: 1329.5566502463064,\n",
+       "  200: 1392.0507055220148,\n",
+       "  300: 1373.6577181208052,\n",
+       "  400: 961.7241379310348},\n",
+       " ('Thu', 200): {0: 1699.1087344028504,\n",
+       "  100: 1789.1752957897363,\n",
+       "  200: 1760.222222222221,\n",
+       "  300: 1342.7149321266952,\n",
+       "  400: 965.2557319223995},\n",
+       " ('Thu', 300): {0: 2190.6271182185546,\n",
+       "  100: 2176.451612903226,\n",
+       "  200: 1780.9290953545235,\n",
+       "  300: 1360.7017543859658,\n",
+       "  400: 964.203233256352},\n",
+       " ('Fri', 0): {0: 0.0,\n",
+       "  100: 300.0,\n",
+       "  200: 388.8413403310466,\n",
+       "  300: 189.6405919661735,\n",
+       "  400: -146.61016949152557},\n",
+       " ('Fri', 100): {0: 700.0,\n",
+       "  100: 790.0458861880747,\n",
+       "  200: 608.920985556499,\n",
+       "  300: 265.5866900175128,\n",
+       "  400: -103.34967320261451},\n",
+       " ('Fri', 200): {0: 1190.388245916431,\n",
+       "  100: 1009.6551724137929,\n",
+       "  200: 651.612903225807,\n",
+       "  300: 266.99669966996686,\n",
+       "  400: -116.64641555285537},\n",
+       " ('Fri', 300): {0: 1404.084014002334,\n",
+       "  100: 1116.6666666666667,\n",
+       "  200: 702.9411764705883,\n",
+       "  300: 282.3529411764706,\n",
+       "  400: -175.86206896551724},\n",
+       " ('Weekend', 0): {0: 0, 100: 0, 200: 0, 300: 0, 400: 0},\n",
+       " ('Weekend', 100): {0: 0, 100: 0, 200: 0, 300: 0, 400: 0},\n",
+       " ('Weekend', 200): {0: 0, 100: 0, 200: 0, 300: 0, 400: 0},\n",
+       " ('Weekend', 300): {0: 0, 100: 0, 200: 0, 300: 0, 400: 0}}"
+      ]
+     },
+     "execution_count": 111,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "Q"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# TD Learning"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## TD Prediction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 116,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def one_step_td_prediction(env, policy, gamma, alpha, n_iter):\n",
+    "    \"\"\"Estimate state values of `policy` with one-step TD, i.e. TD(0).\n",
+    "\n",
+    "    Args:\n",
+    "        env: environment exposing `state_space`, `reset()` and `step()`.\n",
+    "        policy: policy handed to `choose_action` to pick each action.\n",
+    "        gamma: discount factor.\n",
+    "        alpha: step size of the TD update.\n",
+    "        n_iter: total number of environment steps (not episodes).\n",
+    "\n",
+    "    Returns:\n",
+    "        dict mapping each state to its estimated value. States that\n",
+    "        are never visited keep their initial value of 0.\n",
+    "    \"\"\"\n",
+    "    np.random.seed(0)\n",
+    "    values = dict.fromkeys(env.state_space, 0)\n",
+    "    state = env.reset()\n",
+    "    for _ in range(n_iter):\n",
+    "        action = choose_action(state, policy)\n",
+    "        next_state, reward, done, _info = env.step(action)\n",
+    "        # Bootstrap from the current estimate of the successor state.\n",
+    "        td_target = reward + gamma * values[next_state]\n",
+    "        values[state] += alpha * (td_target - values[state])\n",
+    "        # Start a new episode once the current one has finished.\n",
+    "        state = env.reset() if done else next_state\n",
+    "    return values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 117,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{('Mon', 0): 2506.576417395407,\n",
+       " ('Tue', 0): 1956.077876400167,\n",
+       " ('Tue', 100): 2368.7400039407535,\n",
+       " ('Tue', 200): 2767.5069659225423,\n",
+       " ('Tue', 300): 0,\n",
+       " ('Wed', 0): 1413.0055559001296,\n",
+       " ('Wed', 100): 1813.546186490315,\n",
+       " ('Wed', 200): 2200.8873259700867,\n",
+       " ('Wed', 300): 0,\n",
+       " ('Thu', 0): 828.2915189850011,\n",
+       " ('Thu', 100): 1280.424626614422,\n",
+       " ('Thu', 200): 1675.8661846955831,\n",
+       " ('Thu', 300): 0,\n",
+       " ('Fri', 0): 345.52991944823583,\n",
+       " ('Fri', 100): 677.4358179389413,\n",
+       " ('Fri', 200): 1094.8263154150825,\n",
+       " ('Fri', 300): 0,\n",
+       " ('Weekend', 0): 0,\n",
+       " ('Weekend', 100): 0,\n",
+       " ('Weekend', 200): 0,\n",
+       " ('Weekend', 300): 0}"
+      ]
+     },
+     "execution_count": 117,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "policy = base_policy(foodtruck.state_space)\n",
+    "v = one_step_td_prediction(foodtruck, policy, 1, 0.01, 100000)\n",
+    "v"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print({s: np.round(v[s]) for s in v})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "True state values, for comparison with the TD estimates above:\n",
+    "\n",
+    "```python\n",
+    "{('Mon', 0): 2515.0,\n",
+    " ('Tue', 0): 1960.0,\n",
+    " ('Tue', 100): 2360.0,\n",
+    " ('Tue', 200): 2760.0,\n",
+    " ('Tue', 300): 3205.0,\n",
+    " ('Wed', 0): 1405.0,\n",
+    " ('Wed', 100): 1805.0,\n",
+    " ('Wed', 200): 2205.0,\n",
+    " ('Wed', 300): 2650.0,\n",
+    " ('Thu', 0): 850.0000000000001,\n",
+    " ('Thu', 100): 1250.0,\n",
+    " ('Thu', 200): 1650.0,\n",
+    " ('Thu', 300): 2095.0,\n",
+    " ('Fri', 0): 295.00000000000006,\n",
+    " ('Fri', 100): 695.0000000000001,\n",
+    " ('Fri', 200): 1095.0,\n",
+    " ('Fri', 300): 1400.0,\n",
+    " ('Weekend', 0): 0,\n",
+    " ('Weekend', 100): 0,\n",
+    " ('Weekend', 200): 0,\n",
+    " ('Weekend', 300): 0}\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 118,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def sarsa(env, gamma, eps, alpha, n_iter):\n",
+    "    \"\"\"On-policy SARSA control with an epsilon-greedy policy.\n",
+    "\n",
+    "    Args:\n",
+    "        env: environment exposing `state_space`, `action_space`,\n",
+    "            `reset()` and `step()`.\n",
+    "        gamma: discount factor.\n",
+    "        eps: exploration rate forwarded to `get_eps_greedy`.\n",
+    "        alpha: step size of the TD update.\n",
+    "        n_iter: total number of environment steps (not episodes).\n",
+    "\n",
+    "    Returns:\n",
+    "        (policy, Q): the per-state action distributions built during\n",
+    "        learning and the state -> action -> value estimates.\n",
+    "    \"\"\"\n",
+    "    np.random.seed(0)\n",
+    "    states = env.state_space\n",
+    "    actions = env.action_space\n",
+    "    Q = {st: {ac: 0 for ac in actions} for st in states}\n",
+    "    policy = get_random_policy(states, actions)\n",
+    "\n",
+    "    def improve_and_sample(state):\n",
+    "        # Make policy[state] eps-greedy around the current best\n",
+    "        # action (first maximum wins ties), then draw an action.\n",
+    "        greedy = max(Q[state], key=Q[state].get)\n",
+    "        policy[state] = get_eps_greedy(actions, eps, greedy)\n",
+    "        return choose_action(state, policy)\n",
+    "\n",
+    "    state = env.reset()\n",
+    "    action = choose_action(state, policy)\n",
+    "    for step in range(n_iter):\n",
+    "        if step % 100000 == 0:\n",
+    "            print(\"Iteration:\", step)\n",
+    "        next_state, reward, done, _info = env.step(action)\n",
+    "        next_action = improve_and_sample(next_state)\n",
+    "        # The SARSA target uses the action actually chosen next.\n",
+    "        target = reward + gamma * Q[next_state][next_action]\n",
+    "        Q[state][action] += alpha * (target - Q[state][action])\n",
+    "        if done:\n",
+    "            # New episode: restart and pick its first action.\n",
+    "            state = env.reset()\n",
+    "            action = improve_and_sample(state)\n",
+    "        else:\n",
+    "            state, action = next_state, next_action\n",
+    "    return policy, Q"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 119,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Iteration: 0\n",
+      "Iteration: 100000\n",
+      "Iteration: 200000\n",
+      "Iteration: 300000\n",
+      "Iteration: 400000\n",
+      "Iteration: 500000\n",
+      "Iteration: 600000\n",
+      "Iteration: 700000\n",
+      "Iteration: 800000\n",
+      "Iteration: 900000\n"
+     ]
+    }
+   ],
+   "source": [
+    "policy, Q = sarsa(foodtruck, 1, 0.1, 0.01, 1000000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 120,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{('Mon', 0): {0: 0.02, 100: 0.02, 200: 0.02, 300: 0.92, 400: 0.02},\n",
+       " ('Tue', 0): {0: 0.02, 100: 0.02, 200: 0.02, 300: 0.92, 400: 0.02},\n",
+       " ('Tue', 100): {0: 0.02, 100: 0.02, 200: 0.92, 300: 0.02, 400: 0.02},\n",
+       " ('Tue', 200): {0: 0.02, 100: 0.92, 200: 0.02, 300: 0.02, 400: 0.02},\n",
+       " ('Tue', 300): {0: 0.92, 100: 0.02, 200: 0.02, 300: 0.02, 400: 0.02},\n",
+       " ('Wed', 0): {0: 0.02, 100: 0.02, 200: 0.02, 300: 0.92, 400: 0.02},\n",
+       " ('Wed', 100): {0: 0.02, 100: 0.02, 200: 0.02, 300: 0.92, 400: 0.02},\n",
+       " ('Wed', 200): {0: 0.02, 100: 0.02, 200: 0.92, 300: 0.02, 400: 0.02},\n",
+       " ('Wed', 300): {0: 0.92, 100: 0.02, 200: 0.02, 300: 0.02, 400: 0.02},\n",
+       " ('Thu', 0): {0: 0.02, 100: 0.02, 200: 0.02, 300: 0.92, 400: 0.02},\n",
+       " ('Thu', 100): {0: 0.02, 100: 0.02, 200: 0.92, 300: 0.02, 400: 0.02},\n",
+       " ('Thu', 200): {0: 0.02, 100: 0.92, 200: 0.02, 300: 0.02, 400: 0.02},\n",
+       " ('Thu', 300): {0: 0.92, 100: 0.02, 200: 0.02, 300: 0.02, 400: 0.02},\n",
+       " ('Fri', 0): {0: 0.02, 100: 0.02, 200: 0.92, 300: 0.02, 400: 0.02},\n",
+       " ('Fri', 100): {0: 0.02, 100: 0.92, 200: 0.02, 300: 0.02, 400: 0.02},\n",
+       " ('Fri', 200): {0: 0.92, 100: 0.02, 200: 0.02, 300: 0.02, 400: 0.02},\n",
+       " ('Fri', 300): {0: 0.92, 100: 0.02, 200: 0.02, 300: 0.02, 400: 0.02},\n",
+       " ('Weekend', 0): {0: 0.92, 100: 0.02, 200: 0.02, 300: 0.02, 400: 0.02},\n",
+       " ('Weekend', 100): {0: 0.92, 100: 0.02, 200: 0.02, 300: 0.02, 400: 0.02},\n",
+       " ('Weekend', 200): {0: 0.92, 100: 0.02, 200: 0.02, 300: 0.02, 400: 0.02},\n",
+       " ('Weekend', 300): {0: 0.92, 100: 0.02, 200: 0.02, 300: 0.02, 400: 0.02}}"
+      ]
+     },
+     "execution_count": 120,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "policy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 121,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{0: 2099.8661156763687,\n",
+       " 100: 2399.8190742726747,\n",
+       " 200: 2604.6629056622382,\n",
+       " 300: 2670.098987213351,\n",
+       " 400: 2632.8387133517112}"
+      ]
+     },
+     "execution_count": 121,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "Q[('Mon', 0)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 122,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def q_learning(env, gamma, eps, alpha, n_iter):\n",
+    "    \"\"\"Off-policy Q-learning with epsilon-greedy exploration.\n",
+    "\n",
+    "    Args:\n",
+    "        env: environment exposing `state_space`, `action_space`,\n",
+    "            `reset()` and `step()`.\n",
+    "        gamma: discount factor.\n",
+    "        eps: exploration rate forwarded to `get_eps_greedy`.\n",
+    "        alpha: step size of the TD update.\n",
+    "        n_iter: total number of environment steps (not episodes).\n",
+    "\n",
+    "    Returns:\n",
+    "        (policy, Q): `policy` maps each state to `{greedy_action: 1}`\n",
+    "        computed from the final `Q`; `Q` maps state -> action ->\n",
+    "        estimated action value.\n",
+    "    \"\"\"\n",
+    "    np.random.seed(0)\n",
+    "    states =  env.state_space\n",
+    "    actions = env.action_space\n",
+    "    Q = {s: {a: 0 for a in actions} for s in states}\n",
+    "    policy = get_random_policy(states, actions)\n",
+    "    s = env.reset()\n",
+    "    for i in range(n_iter):\n",
+    "        if i % 100000 == 0:\n",
+    "            print(\"Iteration:\", i)\n",
+    "        # Behave eps-greedily around the current best action.\n",
+    "        a_best = max(Q[s].items(), \n",
+    "                     key=operator.itemgetter(1))[0]\n",
+    "        policy[s] = get_eps_greedy(actions, eps, a_best)\n",
+    "        a = choose_action(s, policy)\n",
+    "        s_next, reward, done, info = env.step(a)\n",
+    "        # The target bootstraps from the best next action, not the\n",
+    "        # one the behavior policy will actually take.\n",
+    "        Q[s][a] += alpha * (reward \n",
+    "                            + gamma * max(Q[s_next].values()) \n",
+    "                            - Q[s][a])\n",
+    "        if done:\n",
+    "            s = env.reset()\n",
+    "        else:\n",
+    "            s = s_next\n",
+    "    # Take the greedy action from the final Q itself: policy[s] was\n",
+    "    # last refreshed before the final update to Q[s], so it can be\n",
+    "    # stale relative to the Q that is returned.\n",
+    "    policy = {s: {max(Q[s].items(), \n",
+    "                 key=operator.itemgetter(1))[0]: 1}\n",
+    "                 for s in states}\n",
+    "    return policy, Q"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Iteration: 0\n",
+      "Iteration: 100000\n",
+      "Iteration: 200000\n",
+      "Iteration: 300000\n",
+      "Iteration: 400000\n",
+      "Iteration: 500000\n",
+      "Iteration: 600000\n",
+      "Iteration: 700000\n",
+      "Iteration: 800000\n",
+      "Iteration: 900000\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{('Mon', 0): {400: 1},\n",
+       " ('Tue', 0): {400: 1},\n",
+       " ('Tue', 100): {300: 1},\n",
+       " ('Tue', 200): {200: 1},\n",
+       " ('Tue', 300): {100: 1},\n",
+       " ('Wed', 0): {400: 1},\n",
+       " ('Wed', 100): {300: 1},\n",
+       " ('Wed', 200): {200: 1},\n",
+       " ('Wed', 300): {100: 1},\n",
+       " ('Thu', 0): {300: 1},\n",
+       " ('Thu', 100): {200: 1},\n",
+       " ('Thu', 200): {100: 1},\n",
+       " ('Thu', 300): {0: 1},\n",
+       " ('Fri', 0): {200: 1},\n",
+       " ('Fri', 100): {100: 1},\n",
+       " ('Fri', 200): {0: 1},\n",
+       " ('Fri', 300): {0: 1},\n",
+       " ('Weekend', 0): {0: 1},\n",
+       " ('Weekend', 100): {0: 1},\n",
+       " ('Weekend', 200): {0: 1},\n",
+       " ('Weekend', 300): {0: 1}}"
+      ]
+     },
+     "execution_count": 123,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "policy, Q = q_learning(foodtruck, 1, 0.1, 0.01, 1000000)\n",
+    "policy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 124,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Iteration: 0\n",
+      "Iteration: 100000\n",
+      "Iteration: 200000\n",
+      "Iteration: 300000\n",
+      "Iteration: 400000\n",
+      "Iteration: 500000\n",
+      "Iteration: 600000\n",
+      "Iteration: 700000\n",
+      "Iteration: 800000\n",
+      "Iteration: 900000\n",
+      "Iteration: 1000000\n",
+      "Iteration: 1100000\n",
+      "Iteration: 1200000\n",
+      "Iteration: 1300000\n",
+      "Iteration: 1400000\n",
+      "Iteration: 1500000\n",
+      "Iteration: 1600000\n",
+      "Iteration: 1700000\n",
+      "Iteration: 1800000\n",
+      "Iteration: 1900000\n",
+      "Iteration: 2000000\n",
+      "Iteration: 2100000\n",
+      "Iteration: 2200000\n",
+      "Iteration: 2300000\n",
+      "Iteration: 2400000\n",
+      "Iteration: 2500000\n",
+      "Iteration: 2600000\n",
+      "Iteration: 2700000\n",
+      "Iteration: 2800000\n",
+      "Iteration: 2900000\n",
+      "Iteration: 3000000\n",
+      "Iteration: 3100000\n",
+      "Iteration: 3200000\n",
+      "Iteration: 3300000\n",
+      "Iteration: 3400000\n",
+      "Iteration: 3500000\n",
+      "Iteration: 3600000\n",
+      "Iteration: 3700000\n",
+      "Iteration: 3800000\n",
+      "Iteration: 3900000\n",
+      "Iteration: 4000000\n",
+      "Iteration: 4100000\n",
+      "Iteration: 4200000\n",
+      "Iteration: 4300000\n",
+      "Iteration: 4400000\n",
+      "Iteration: 4500000\n",
+      "Iteration: 4600000\n",
+      "Iteration: 4700000\n",
+      "Iteration: 4800000\n",
+      "Iteration: 4900000\n",
+      "Iteration: 5000000\n",
+      "Iteration: 5100000\n",
+      "Iteration: 5200000\n",
+      "Iteration: 5300000\n",
+      "Iteration: 5400000\n",
+      "Iteration: 5500000\n",
+      "Iteration: 5600000\n",
+      "Iteration: 5700000\n",
+      "Iteration: 5800000\n",
+      "Iteration: 5900000\n",
+      "Iteration: 6000000\n",
+      "Iteration: 6100000\n",
+      "Iteration: 6200000\n",
+      "Iteration: 6300000\n",
+      "Iteration: 6400000\n",
+      "Iteration: 6500000\n",
+      "Iteration: 6600000\n",
+      "Iteration: 6700000\n",
+      "Iteration: 6800000\n",
+      "Iteration: 6900000\n",
+      "Iteration: 7000000\n",
+      "Iteration: 7100000\n",
+      "Iteration: 7200000\n",
+      "Iteration: 7300000\n",
+      "Iteration: 7400000\n",
+      "Iteration: 7500000\n",
+      "Iteration: 7600000\n",
+      "Iteration: 7700000\n",
+      "Iteration: 7800000\n",
+      "Iteration: 7900000\n",
+      "Iteration: 8000000\n",
+      "Iteration: 8100000\n",
+      "Iteration: 8200000\n",
+      "Iteration: 8300000\n",
+      "Iteration: 8400000\n",
+      "Iteration: 8500000\n",
+      "Iteration: 8600000\n",
+      "Iteration: 8700000\n",
+      "Iteration: 8800000\n",
+      "Iteration: 8900000\n",
+      "Iteration: 9000000\n",
+      "Iteration: 9100000\n",
+      "Iteration: 9200000\n",
+      "Iteration: 9300000\n",
+      "Iteration: 9400000\n",
+      "Iteration: 9500000\n",
+      "Iteration: 9600000\n",
+      "Iteration: 9700000\n",
+      "Iteration: 9800000\n",
+      "Iteration: 9900000\n",
+      "Iteration: 10000000\n",
+      "Iteration: 10100000\n",
+      "Iteration: 10200000\n",
+      "Iteration: 10300000\n",
+      "Iteration: 10400000\n",
+      "Iteration: 10500000\n",
+      "Iteration: 10600000\n",
+      "Iteration: 10700000\n",
+      "Iteration: 10800000\n",
+      "Iteration: 10900000\n",
+      "Iteration: 11000000\n",
+      "Iteration: 11100000\n",
+      "Iteration: 11200000\n",
+      "Iteration: 11300000\n",
+      "Iteration: 11400000\n",
+      "Iteration: 11500000\n",
+      "Iteration: 11600000\n",
+      "Iteration: 11700000\n",
+      "Iteration: 11800000\n",
+      "Iteration: 11900000\n",
+      "Iteration: 12000000\n",
+      "Iteration: 12100000\n",
+      "Iteration: 12200000\n",
+      "Iteration: 12300000\n",
+      "Iteration: 12400000\n",
+      "Iteration: 12500000\n",
+      "Iteration: 12600000\n",
+      "Iteration: 12700000\n",
+      "Iteration: 12800000\n",
+      "Iteration: 12900000\n",
+      "Iteration: 13000000\n",
+      "Iteration: 13100000\n",
+      "Iteration: 13200000\n",
+      "Iteration: 13300000\n",
+      "Iteration: 13400000\n",
+      "Iteration: 13500000\n",
+      "Iteration: 13600000\n",
+      "Iteration: 13700000\n",
+      "Iteration: 13800000\n",
+      "Iteration: 13900000\n",
+      "Iteration: 14000000\n",
+      "Iteration: 14100000\n",
+      "Iteration: 14200000\n",
+      "Iteration: 14300000\n",
+      "Iteration: 14400000\n",
+      "Iteration: 14500000\n",
+      "Iteration: 14600000\n",
+      "Iteration: 14700000\n",
+      "Iteration: 14800000\n",
+      "Iteration: 14900000\n",
+      "Iteration: 15000000\n",
+      "Iteration: 15100000\n",
+      "Iteration: 15200000\n",
+      "Iteration: 15300000\n",
+      "Iteration: 15400000\n",
+      "Iteration: 15500000\n",
+      "Iteration: 15600000\n",
+      "Iteration: 15700000\n",
+      "Iteration: 15800000\n",
+      "Iteration: 15900000\n",
+      "Iteration: 16000000\n",
+      "Iteration: 16100000\n",
+      "Iteration: 16200000\n",
+      "Iteration: 16300000\n",
+      "Iteration: 16400000\n",
+      "Iteration: 16500000\n",
+      "Iteration: 16600000\n",
+      "Iteration: 16700000\n",
+      "Iteration: 16800000\n",
+      "Iteration: 16900000\n",
+      "Iteration: 17000000\n",
+      "Iteration: 17100000\n",
+      "Iteration: 17200000\n",
+      "Iteration: 17300000\n",
+      "Iteration: 17400000\n",
+      "Iteration: 17500000\n",
+      "Iteration: 17600000\n",
+      "Iteration: 17700000\n",
+      "Iteration: 17800000\n",
+      "Iteration: 17900000\n",
+      "Iteration: 18000000\n",
+      "Iteration: 18100000\n",
+      "Iteration: 18200000\n",
+      "Iteration: 18300000\n",
+      "Iteration: 18400000\n",
+      "Iteration: 18500000\n",
+      "Iteration: 18600000\n",
+      "Iteration: 18700000\n",
+      "Iteration: 18800000\n",
+      "Iteration: 18900000\n",
+      "Iteration: 19000000\n",
+      "Iteration: 19100000\n",
+      "Iteration: 19200000\n",
+      "Iteration: 19300000\n",
+      "Iteration: 19400000\n",
+      "Iteration: 19500000\n",
+      "Iteration: 19600000\n",
+      "Iteration: 19700000\n",
+      "Iteration: 19800000\n",
+      "Iteration: 19900000\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "({('Mon', 0): {400: 1},\n",
+       "  ('Tue', 0): {300: 1},\n",
+       "  ('Tue', 100): {300: 1},\n",
+       "  ('Tue', 200): {200: 1},\n",
+       "  ('Tue', 300): {100: 1},\n",
+       "  ('Wed', 0): {400: 1},\n",
+       "  ('Wed', 100): {300: 1},\n",
+       "  ('Wed', 200): {200: 1},\n",
+       "  ('Wed', 300): {100: 1},\n",
+       "  ('Thu', 0): {300: 1},\n",
+       "  ('Thu', 100): {200: 1},\n",
+       "  ('Thu', 200): {100: 1},\n",
+       "  ('Thu', 300): {0: 1},\n",
+       "  ('Fri', 0): {200: 1},\n",
+       "  ('Fri', 100): {100: 1},\n",
+       "  ('Fri', 200): {0: 1},\n",
+       "  ('Fri', 300): {0: 1},\n",
+       "  ('Weekend', 0): {0: 1},\n",
+       "  ('Weekend', 100): {0: 1},\n",
+       "  ('Weekend', 200): {0: 1},\n",
+       "  ('Weekend', 300): {0: 1}},\n",
+       " {('Mon', 0): {0: 2225.749496682385,\n",
+       "   100: 2528.178263359892,\n",
+       "   200: 2752.245336408776,\n",
+       "   300: 2833.598086662411,\n",
+       "   400: 2865.5080973287336},\n",
+       "  ('Tue', 0): {0: 1627.1674196319675,\n",
+       "   100: 1926.185189822399,\n",
+       "   200: 2130.63971600556,\n",
+       "   300: 2235.794644930646,\n",
+       "   400: 2202.700921685597},\n",
+       "  ('Tue', 100): {0: 2323.642847941712,\n",
+       "   100: 2546.146008882256,\n",
+       "   200: 2622.2014944709003,\n",
+       "   300: 2704.7958165719538,\n",
+       "   400: 2254.9865435917945},\n",
+       "  ('Tue', 200): {0: 2938.47212708529,\n",
+       "   100: 2985.6763069672907,\n",
+       "   200: 3045.55602709444,\n",
+       "   300: 2660.793750889116,\n",
+       "   400: 2244.116679273476},\n",
+       "  ('Tue', 300): {0: 3397.5493842636856,\n",
+       "   100: 3431.1584693328227,\n",
+       "   200: 3047.963831661575,\n",
+       "   300: 2689.0905262507554,\n",
+       "   400: 2246.0842993310807},\n",
+       "  ('Wed', 0): {0: 991.8337704155527,\n",
+       "   100: 1294.3979155570473,\n",
+       "   200: 1499.2384682910836,\n",
+       "   300: 1560.5737610953374,\n",
+       "   400: 1656.9354742311311},\n",
+       "  ('Wed', 100): {0: 1693.4281528809047,\n",
+       "   100: 1890.5014859779849,\n",
+       "   200: 1967.5195056845337,\n",
+       "   300: 2030.2875396109434,\n",
+       "   400: 1624.7053132984788},\n",
+       "  ('Wed', 200): {0: 2307.350730239611,\n",
+       "   100: 2368.121947542663,\n",
+       "   200: 2439.4451003135055,\n",
+       "   300: 2028.5567501077871,\n",
+       "   400: 1602.4490693607893},\n",
+       "  ('Wed', 300): {0: 2766.5347359553866,\n",
+       "   100: 2817.6576462084945,\n",
+       "   200: 2397.5480427541106,\n",
+       "   300: 2028.9023931048234,\n",
+       "   400: 1610.7042344178608},\n",
+       "  ('Thu', 0): {0: 388.45762020693445,\n",
+       "   100: 689.9741046142422,\n",
+       "   200: 886.8292737374425,\n",
+       "   300: 1008.8462346115972,\n",
+       "   400: 970.599703355806},\n",
+       "  ('Thu', 100): {0: 1086.9408520148895,\n",
+       "   100: 1301.332777514599,\n",
+       "   200: 1405.1825937805977,\n",
+       "   300: 1348.6418726014172,\n",
+       "   400: 992.3726336890564},\n",
+       "  ('Thu', 200): {0: 1715.4114166813265,\n",
+       "   100: 1833.5195722683234,\n",
+       "   200: 1741.2757203880324,\n",
+       "   300: 1376.7551643904483,\n",
+       "   400: 957.9607707339657},\n",
+       "  ('Thu', 300): {0: 2190.19649451877,\n",
+       "   100: 2125.740810274669,\n",
+       "   200: 1776.8132567876999,\n",
+       "   300: 1408.5495730824664,\n",
+       "   400: 990.4018172404869},\n",
+       "  ('Fri', 0): {0: 0.0,\n",
+       "   100: 299.99999999999716,\n",
+       "   200: 406.0476593550646,\n",
+       "   300: 170.46122765548887,\n",
+       "   400: -153.64846976857817},\n",
+       "  ('Fri', 100): {0: 699.9999999999943,\n",
+       "   100: 842.0141106267022,\n",
+       "   200: 610.5115569281422,\n",
+       "   300: 292.160669622827,\n",
+       "   400: -113.12406224669776},\n",
+       "  ('Fri', 200): {0: 1172.1819744094662,\n",
+       "   100: 1070.907334160906,\n",
+       "   200: 687.7773470555264,\n",
+       "   300: 330.44001007014674,\n",
+       "   400: -75.1010831966216},\n",
+       "  ('Fri', 300): {0: 1427.1955441761681,\n",
+       "   100: 1007.1503466766485,\n",
+       "   200: 674.4671172275836,\n",
+       "   300: 278.1467797475504,\n",
+       "   400: -99.78074377598806},\n",
+       "  ('Weekend', 0): {0: 0, 100: 0, 200: 0, 300: 0, 400: 0},\n",
+       "  ('Weekend', 100): {0: 0, 100: 0, 200: 0, 300: 0, 400: 0},\n",
+       "  ('Weekend', 200): {0: 0, 100: 0, 200: 0, 300: 0, 400: 0},\n",
+       "  ('Weekend', 300): {0: 0, 100: 0, 200: 0, 300: 0, 400: 0}})"
+      ]
+     },
+     "execution_count": 124,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "q_learning(foodtruck, 1, 0.1, 0.01, 20000000)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Q"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```python\n",
+    "{('Mon', 0): 2880.0,\n",
+    " ('Tue', 0): 2250.0,\n",
+    " ('Tue', 100): 2650.0,\n",
+    " ('Tue', 200): 3050.0,\n",
+    " ('Tue', 300): 3450.0,\n",
+    " ('Wed', 0): 1620.0,\n",
+    " ('Wed', 100): 2020.0,\n",
+    " ('Wed', 200): 2420.0,\n",
+    " ('Wed', 300): 2820.0,\n",
+    " ('Thu', 0): 990.0,\n",
+    " ('Thu', 100): 1390.0,\n",
+    " ('Thu', 200): 1790.0,\n",
+    " ('Thu', 300): 2190.0,\n",
+    " ('Fri', 0): 390.00000000000006,\n",
+    " ('Fri', 100): 790.0000000000001,\n",
+    " ('Fri', 200): 1190.0,\n",
+    " ('Fri', 300): 1400.0,\n",
+    " ('Weekend', 0): 0,\n",
+    " ('Weekend', 100): 0,\n",
+    " ('Weekend', 200): 0,\n",
+    " ('Weekend', 300): 0}\n",
+    "```"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "py37ml",
+   "language": "python",
+   "name": "py37ml"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}