# 02b_q_learning_deep_net.py
# Based on code from https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0
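# Q-learning on FrozenLake with a small neural network as the Q-function
# approximator: states are fed to the net as one-hot vectors and the net
# outputs one Q-value per action.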
import gym
import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import util.eval
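# util.eval is a local helper module in this repository; eval_agent(policy, env, ...)
# is used at the end of the script to measure the greedy policy's average return.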

class DeepNet(nn.Module):
    """A small fully connected net mapping a one-hot state encoding to 4 action values."""

    def __init__(self, n_input, lr):
        super(DeepNet, self).__init__()
        self.n_input = n_input
        self.lr = lr
        self.fc_h1 = nn.Linear(n_input, 16, bias=False)
        torch.nn.init.uniform_(self.fc_h1.weight, 0., 0.)  # TODO: choose the initialization range (all zeros here)
        self.fc_out = nn.Linear(16, 4, bias=False)
        torch.nn.init.uniform_(self.fc_out.weight, 0., 0.)  # TODO: choose the initialization range (all zeros here)
        self.optimizer = optim.SGD(self.parameters(), lr=lr)

    def forward(self, x):
        x = F.relu(self.fc_h1(x))
        x = self.fc_out(x)
        return x
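# Note on the TODOs above: with both layers initialized to all zeros, the ReLU
# hidden activations and all gradients are zero, so the network cannot learn
# until the initialization is changed. One plausible fill-in (my assumption,
# not prescribed by this exercise) is a small symmetric uniform range, e.g.
#   torch.nn.init.uniform_(self.fc_h1.weight, -0.01, 0.01)
#   torch.nn.init.uniform_(self.fc_out.weight, -0.01, 0.01)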

def int_to_onehot(x, dim):
    x_onehot = torch.zeros([1, dim])
    x_onehot[0, x] = 1.
    return x_onehot
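# Example: int_to_onehot(2, 4) returns tensor([[0., 0., 1., 0.]]).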
env = gym.make('FrozenLake-v0')
# env = gym.make('FrozenLake8x8-v0')
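# Note: this script uses the classic gym API, where env.reset() returns only the
# observation and env.step() returns a 4-tuple (obs, reward, done, info); newer
# gym/gymnasium releases changed both signatures.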
t0 = time.time()
# TODO: set the learning parameters
lr = 0.0  # TODO: with a learning rate of 0.0 the optimizer never updates the weights
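# A plausible fill-in for lr (my assumption, not prescribed by this exercise):
# a value around 0.1 is a common choice for plain SGD on this small problem.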
gamma = 0.95
eps_schedule = lambda x: 500./(float(x)+2000.)
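# Epsilon starts at 500/2000 = 0.25 and decays towards zero, reaching roughly
# 500/12000 ~ 0.04 by the last of the 10000 episodes.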
max_steps_per_episode = 99
num_episodes = 10000
net = DeepNet(env.observation_space.n, lr)
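# FrozenLake-v0 is a 4x4 grid, so the network maps a 16-dimensional one-hot
# state to one Q-value for each of the 4 actions.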
returns_list = []
for episode in range(num_episodes):
    # Reset the environment and get the first observation
    obs = env.reset()
    cumulative_reward = 0
    terminal = False
    terminate_episode = False
    # Q-learning with function approximation
    for step in range(max_steps_per_episode):
        # Convert the observation to a form the network can process
        obs_onehot = int_to_onehot(obs, env.observation_space.n)
        pred = net(obs_onehot)
        # Action selection with exploration.
        # Option 1 (commented out): add decaying random noise to the Q-values.
        # act = np.argmax(pred.detach().numpy() + np.random.randn(1, env.action_space.n)*(1./(episode+1)))
        # Option 2 (used here): epsilon-greedy exploration.
        if np.random.rand() < eps_schedule(episode):
            act = env.action_space.sample()
        else:
            act = np.argmax(pred.detach().numpy())
        # The target depends on whether the current state is terminal
        if not terminal:
            # Get the new observation and reward from the environment
            obs_new, reward, terminal, _ = env.step(act)
            # Process the new observation with the net
            obs_new_onehot = int_to_onehot(obs_new, env.observation_space.n)
            pred_new = net(obs_new_onehot).detach()
            # Q-learning target: r + gamma * max_a' Q(s', a')
            max_q_new = torch.max(pred_new, dim=1)[0]
            target = reward + gamma * max_q_new
            # Update the cumulative reward, prepare the observation for the next step
            cumulative_reward += reward
            obs = obs_new
        else:
            # obs is now a terminal state: regress its Q-value towards 0 and end the episode
            target = 0.
            terminate_episode = True
        predicted = pred[:, act]
        loss = (predicted - target).pow(2).sum()
        net.optimizer.zero_grad()
        loss.backward()
        net.optimizer.step()
        if terminate_episode:
            break
    # Periodically print a running average of recent returns
    if time.time() - t0 > 1 and len(returns_list) > 0:
        num_rwrds = min(100, len(returns_list))
        print('Episode', episode, 'Smoothed average return', sum(returns_list[-num_rwrds:]) / num_rwrds)
        t0 = time.time()
    returns_list.append(cumulative_reward)
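# Mean training return over consecutive blocks of 250 episodes (10000/250 = 40 values)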
print("Smoothed training reward", np.mean(np.reshape(np.array(returns_list), [-1,250]), axis=1))
print('Evaluating the learned policy')


def policy(obs):
    # Greedy policy: pick the action with the highest predicted Q-value
    obs_onehot = int_to_onehot(obs, env.observation_space.n)
    pred = net(obs_onehot)
    return np.argmax(pred.detach().numpy())


avg_test_return = util.eval.eval_agent(policy, env, num_episodes=10000, max_steps_per_episode=100)
print("Avg eval return: ", avg_test_return)