-
Notifications
You must be signed in to change notification settings - Fork 0
/
epsilon_greedy_agent.py
69 lines (52 loc) · 1.78 KB
/
epsilon_greedy_agent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
from Ten_Armed_Testbed import TenArmedTestbed
def argmax(q_values):
    """Return an index of the largest entry in ``q_values``, breaking ties
    uniformly at random.

    Plain ``np.argmax`` always returns the *first* maximal index, which
    biases action selection toward low-indexed arms; random tie-breaking
    avoids that bias (this matters early on, when many estimates are equal).

    Args:
        q_values: 1-D sequence of action-value estimates (non-empty).

    Returns:
        int: an index of a maximal entry, chosen uniformly among all ties.
    """
    values = np.asarray(q_values)
    # All indices whose value equals the maximum — the set of "ties".
    ties = np.flatnonzero(values == values.max())
    return np.random.choice(ties)
class EpsilonGreedyAgent(object):
    """Epsilon-greedy multi-armed-bandit agent with sample-average
    action-value estimates.

    With probability ``epsilon`` the agent explores (uniform random arm);
    otherwise it exploits the arm with the highest current estimate.
    """

    def __init__(self, arms, epsilon=0.1):
        """Initialize the agent.

        Args:
            arms: number of arms (actions) available.
            epsilon: exploration probability in [0, 1].
        """
        # Optimistic initial values (+100) drive early exploration of every
        # arm — presumably intentional; confirm against experiment design.
        self.q_values = np.zeros(arms) + 100
        self.arm_count = np.zeros(arms)  # number of pulls per arm
        self.last_action = 0
        # Bug fix: the epsilon argument was previously ignored and
        # hardcoded to 0.1, so callers could not configure exploration.
        self.epsilon = epsilon
        self.num_actions = arms

    def agent_step(self, reward):
        """Consume the reward for the last action and choose the next one.

        Args:
            reward: reward observed for ``self.last_action``.

        Returns:
            int: index of the next action to take.
        """
        self.arm_count[self.last_action] += 1
        # Explore with probability epsilon, otherwise act greedily.
        if np.random.random() < self.epsilon:
            current_action = np.random.randint(0, self.num_actions)
        else:
            current_action = argmax(self.q_values)
        # Incremental sample-average update: Q <- Q + (1/N) * (R - Q).
        self.q_values[self.last_action] += (
            1.0 / self.arm_count[self.last_action]
        ) * (reward - self.q_values[self.last_action])
        self.last_action = current_action
        return current_action
# ---- Experiment: average epsilon-greedy reward over many independent runs ----
NUM_RUNS = 500    # independent bandit runs to average over
NUM_STEPS = 1000  # agent-environment interactions per run

all_averages = []
for run in tqdm(range(NUM_RUNS)):
    # Bug fix: a single agent/testbed was previously created once outside
    # this loop, so learned q_values and arm_count leaked across runs and
    # the plotted curve was not an average over independent runs. Construct
    # fresh instances per run instead.
    agent = EpsilonGreedyAgent(10, 0.5)
    problem = TenArmedTestbed()
    action = 0
    scores = [0]      # cumulative reward; scores[k] = total reward after k steps
    averages = []     # running average reward at each step
    for step in range(NUM_STEPS):
        current_reward = problem.reward(action)
        action = agent.agent_step(current_reward)
        scores.append(scores[-1] + current_reward)
        averages.append(scores[-1] / (step + 1))
    all_averages.append(averages)
    # NOTE(review): return value discarded — confirm whether best_reward()
    # should be recorded (e.g. to plot an optimal-reward baseline).
    problem.best_reward()

# Mean learning curve across runs.
plt.plot(np.mean(all_averages, axis=0))
plt.xlabel('STEPS')
plt.ylabel("AVERAGE REWARD")
plt.show()