
Commit 96ed8ec

Author: borninfreedom (committed)
Commit message: paper/alphazero
1 parent 0041760 commit 96ed8ec


60 files changed: +12840 −156 lines
Binary file not shown.
File renamed without changes.
File renamed without changes.

Code/tensorflow/version1/.idea/misc.xml → Code/tensorflow/DQN/.idea/misc.xml

+1 −1

Code/tensorflow/version1/.idea/modules.xml → Code/tensorflow/DQN/.idea/modules.xml

+1 −1
File renamed without changes.
File renamed without changes.

Code/tensorflow/DQN/README.md

Whitespace-only changes.

Code/tensorflow/DQN/RL_brain.py

+193
@@ -0,0 +1,193 @@
import numpy as np
import pandas as pd
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

np.random.seed(1)
tf.set_random_seed(1)


# Deep Q Network, off-policy
class DeepQNetwork:
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=True,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # total learning steps
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

        # consists of [target_net, evaluate_net]
        self._build_net()
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.sess = tf.Session()

        if output_graph:
            # $ tensorboard --logdir=logs
            # tf.train.SummaryWriter is deprecated; use tf.summary.FileWriter instead
            tf.summary.FileWriter("logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        self.cost_his = []

    def _build_net(self):
        # ------------------ build evaluate_net ------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
        with tf.variable_scope('eval_net'):
            # c_names (collections_names) are the collections used to store variables
            c_names, n_l1, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \
                tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers

            # first layer. the collections are used later when assigning to the target net
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1)

            # second layer. the collections are used later when assigning to the target net
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_eval = tf.matmul(l1, w2) + b2

        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        # ------------------ build target_net ------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input
        with tf.variable_scope('target_net'):
            # c_names (collections_names) are the collections used to store variables
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]

            # first layer. the collections are used later when assigning to the target net
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)

            # second layer. the collections are used later when assigning to the target net
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_next = tf.matmul(l1, w2) + b2

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0

        transition = np.hstack((s, [a, r], s_))

        # replace the oldest memory with the new memory
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition

        self.memory_counter += 1

    def choose_action(self, observation):
        # add a batch dimension before feeding into the tf placeholder
        observation = observation[np.newaxis, :]

        if np.random.uniform() < self.epsilon:
            # forward feed the observation and get the q value for every action
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def learn(self):
        # check whether to replace the target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            print('\ntarget_params_replaced\n')

        # sample a batch from memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                self.s_: batch_memory[:, -self.n_features:],  # fixed params (target net)
                self.s: batch_memory[:, :self.n_features],    # newest params (eval net)
            })

        # change q_target w.r.t. q_eval's action
        q_target = q_eval.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)
        reward = batch_memory[:, self.n_features + 1]

        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        """
        For example, suppose this batch has 2 samples and 3 actions:
        q_eval =
        [[1, 2, 3],
         [4, 5, 6]]
        q_target = q_eval =
        [[1, 2, 3],
         [4, 5, 6]]
        Then overwrite q_target with the real target value at the action q_eval took.
        For example:
        in sample 0 action 0 was taken, and its computed target value is -1;
        in sample 1 action 2 was taken, and its computed target value is -2:
        q_target =
        [[-1, 2, 3],
         [4, 5, -2]]
        So (q_target - q_eval) becomes:
        [[(-1)-(1), 0, 0],
         [0, 0, (-2)-(6)]]
        We then backpropagate this error through the network only for the corresponding
        action; the error for the other actions stays 0 because they were not chosen.
        """

        # train eval network
        _, self.cost = self.sess.run([self._train_op, self.loss],
                                     feed_dict={self.s: batch_memory[:, :self.n_features],
                                                self.q_target: q_target})
        self.cost_his.append(self.cost)

        # increase epsilon
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

    def plot_cost(self):
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.show()
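The docstring in learn() above walks through the q_target construction for a 2-sample, 3-action batch. Below is a minimal standalone NumPy sketch of that same update; the values are the docstring's illustrative numbers (q_next is set to zero here so the targets reduce to the rewards), not output from a real run.

import numpy as np

gamma = 0.9                                # reward_decay in DeepQNetwork
q_eval = np.array([[1., 2., 3.],           # Q(s, a) from eval_net: 2 samples, 3 actions
                   [4., 5., 6.]])
q_next = np.zeros((2, 3))                  # Q(s_, a) from target_net; zeros for illustration
actions = np.array([0, 2])                 # action taken in each sample
rewards = np.array([-1., -2.])             # reward received in each sample

q_target = q_eval.copy()
batch_index = np.arange(len(actions))
# Only the entry of the action actually taken is overwritten with r + gamma * max_a' Q_next(s_, a').
q_target[batch_index, actions] = rewards + gamma * q_next.max(axis=1)

print(q_target)            # [[-1.  2.  3.]
                           #  [ 4.  5. -2.]]
print(q_target - q_eval)   # error is non-zero only in the taken-action columns:
                           # [[-2.  0.  0.]
                           #  [ 0.  0. -8.]]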
Binary file not shown.
Binary file not shown.
Binary file not shown.

Code/tensorflow/DQN/main.py

+52
@@ -0,0 +1,52 @@
from maze_env import Maze
from RL_brain import DeepQNetwork


def run_maze():
    step = 0
    for episode in range(300):
        # initial observation
        observation = env.reset()

        while True:
            # refresh the environment display
            env.render()

            # RL chooses an action based on the observation
            action = RL.choose_action(observation)

            # RL takes the action and gets the next observation and reward
            observation_, reward, done = env.step(action)

            RL.store_transition(observation, action, reward, observation_)

            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            # break out of the while loop at the end of this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
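main.py leaves e_greedy_increment at its default of None, so epsilon is fixed at 0.9 from the first step. As a hedged variant (the 0.001 rate is an illustrative value, not something set in this commit), the existing parameter could be used to anneal exploration instead:

from maze_env import Maze
from RL_brain import DeepQNetwork

env = Maze()
# With e_greedy_increment set, __init__ starts self.epsilon at 0 and each call to
# learn() raises it by the increment until it reaches e_greedy (0.9), so the agent
# explores heavily at first and exploits more as training progresses.
RL = DeepQNetwork(env.n_actions, env.n_features,
                  learning_rate=0.01,
                  reward_decay=0.9,
                  e_greedy=0.9,
                  replace_target_iter=200,
                  memory_size=2000,
                  e_greedy_increment=0.001)   # hypothetical annealing rate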

Code/tensorflow/DQN/maze_env.py

+116
@@ -0,0 +1,116 @@
import numpy as np
import time
import sys
if sys.version_info.major == 2:
    import Tkinter as tk
else:
    import tkinter as tk

UNIT = 40   # pixels
MAZE_H = 4  # grid height
MAZE_W = 4  # grid width


class Maze(tk.Tk, object):
    def __init__(self):
        super(Maze, self).__init__()
        self.action_space = ['u', 'd', 'l', 'r']
        self.n_actions = len(self.action_space)
        self.n_features = 2
        self.title('maze')
        self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT))
        self._build_maze()

    def _build_maze(self):
        self.canvas = tk.Canvas(self, bg='white',
                                height=MAZE_H * UNIT,
                                width=MAZE_W * UNIT)

        # create grids
        for c in range(0, MAZE_W * UNIT, UNIT):
            x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
            self.canvas.create_line(x0, y0, x1, y1)
        for r in range(0, MAZE_H * UNIT, UNIT):
            x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
            self.canvas.create_line(x0, y0, x1, y1)

        # create origin
        origin = np.array([20, 20])

        # hell
        hell1_center = origin + np.array([UNIT * 2, UNIT])
        self.hell1 = self.canvas.create_rectangle(
            hell1_center[0] - 15, hell1_center[1] - 15,
            hell1_center[0] + 15, hell1_center[1] + 15,
            fill='black')
        # hell
        # hell2_center = origin + np.array([UNIT, UNIT * 2])
        # self.hell2 = self.canvas.create_rectangle(
        #     hell2_center[0] - 15, hell2_center[1] - 15,
        #     hell2_center[0] + 15, hell2_center[1] + 15,
        #     fill='black')

        # create oval
        oval_center = origin + UNIT * 2
        self.oval = self.canvas.create_oval(
            oval_center[0] - 15, oval_center[1] - 15,
            oval_center[0] + 15, oval_center[1] + 15,
            fill='yellow')

        # create red rect
        self.rect = self.canvas.create_rectangle(
            origin[0] - 15, origin[1] - 15,
            origin[0] + 15, origin[1] + 15,
            fill='red')

        # pack all
        self.canvas.pack()

    def reset(self):
        self.update()
        time.sleep(0.1)
        self.canvas.delete(self.rect)
        origin = np.array([20, 20])
        self.rect = self.canvas.create_rectangle(
            origin[0] - 15, origin[1] - 15,
            origin[0] + 15, origin[1] + 15,
            fill='red')
        # return observation
        return (np.array(self.canvas.coords(self.rect)[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT)

    def step(self, action):
        s = self.canvas.coords(self.rect)
        base_action = np.array([0, 0])
        if action == 0:     # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:   # down
            if s[1] < (MAZE_H - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:   # right
            if s[0] < (MAZE_W - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:   # left
            if s[0] > UNIT:
                base_action[0] -= UNIT

        self.canvas.move(self.rect, base_action[0], base_action[1])  # move agent

        next_coords = self.canvas.coords(self.rect)  # next state

        # reward function
        if next_coords == self.canvas.coords(self.oval):
            reward = 1
            done = True
        elif next_coords in [self.canvas.coords(self.hell1)]:
            reward = -1
            done = True
        else:
            reward = 0
            done = False
        s_ = (np.array(next_coords[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT)
        return s_, reward, done

    def render(self):
        # time.sleep(0.01)
        self.update()
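For reference, the observation returned by reset() and step() is the (x, y) offset of the agent's top-left canvas corner from the goal oval's, divided by the maze size in pixels (MAZE_H * UNIT = 160). A small sketch, using only the geometry defined above, reproduces the start-state observation without opening a Tk window:

import numpy as np

UNIT, MAZE_H = 40, 4
origin = np.array([20, 20])          # agent (red rect) center at reset
oval_center = origin + UNIT * 2      # goal (yellow oval) center, (100, 100)

rect_top_left = origin - 15          # canvas.coords(self.rect)[:2] at reset, (5, 5)
oval_top_left = oval_center - 15     # canvas.coords(self.oval)[:2], (85, 85)

observation = (rect_top_left - oval_top_left) / (MAZE_H * UNIT)
print(observation)                   # [-0.5 -0.5]: half the maze left of and above the goal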

Code/tensorflow/DQN/requirements.txt

Whitespace-only changes.
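requirements.txt is empty in this commit. Based on the imports in RL_brain.py, main.py, and maze_env.py, a plausible set of entries would be the following (unpinned; these are inferred, not specified by the repository, and tkinter ships with the standard CPython installers):

# Inferred from the imports in this commit; not part of the original file
numpy
pandas
tensorflow    # the code uses tensorflow.compat.v1 with v2 behavior disabled
matplotlib    # only needed for DeepQNetwork.plot_cost()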

Code/tensorflow/RL/.idea/.gitignore

+2
