Commit 5b59c20
update code
1 parent 447885e  commit 5b59c20
10 files changed: +343 -641 lines
Reinforcement_learning_TUT/5_Deep_Q_Network/RL_brain.py (+4 -2)

@@ -1,6 +1,7 @@
 """
 This part of code is the Q learning brain, which is a brain of the agent.
 All decisions are made in here.
+Using Tensorflow to build the neural network.

 View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
 """
@@ -197,8 +198,8 @@ def learn(self):
         Then change q_target with the real q_target value w.r.t the q_eval's action.
         For example in:
-        sample 0, I took action 0, and the q_target value is -1;
-        sample 1, I took action 2, and the q_target value is -2:
+        sample 0, I took action 0, and the max q_target value is -1;
+        sample 1, I took action 2, and the max q_target value is -2:
         q_target =
         [[-1, 2, 3],
          [4, 5, -2]]
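
Note: the comment above describes the indexing trick used in learn(): q_target starts as a copy of q_eval, and only the entry for the action actually taken in each sample is overwritten with reward + gamma * max(q_next), so every other entry contributes zero error. A minimal standalone NumPy sketch of that step, reproducing the two-sample example from the comment (all values here are illustrative, not taken from the repo):

import numpy as np

# toy batch: 2 samples, 3 actions, matching the [[-1, 2, 3], [4, 5, -2]] example
q_eval = np.array([[1., 2., 3.], [4., 5., 6.]])
q_next = np.array([[0., -1., -2.], [-3., 0., -4.]])   # target-net output for s_
actions = np.array([0, 2])                            # action taken in each sample
rewards = np.array([-1., -2.])                        # reward received in each sample
gamma = 0.9

q_target = q_eval.copy()                              # copy q_eval so unchosen actions give zero error
batch_index = np.arange(len(actions))
q_target[batch_index, actions] = rewards + gamma * q_next.max(axis=1)
print(q_target)                                       # [[-1, 2, 3], [4, 5, -2]], as in the comment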
@@ -221,6 +222,7 @@ def learn(self):
         self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
         self.learn_step_counter += 1

+
     def plot_cost(self):
         import matplotlib.pyplot as plt
         plt.plot(np.arange(len(self.cost_his)), self.cost_his)

Reinforcement_learning_TUT/6_OpenAI_gym/RL_brain.py (+94 -90)
@@ -1,6 +1,7 @@
 """
 This part of code is the Q learning brain, which is a brain of the agent.
 All decisions are made in here.
+Using Tensorflow to build the neural network.

 View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
 """
@@ -60,6 +61,90 @@ def __init__(
         self.sess.run(tf.global_variables_initializer())
         self.cost_his = []

+    def _build_net(self):
+        # create eval and target net weights and biases separately
+        self._eval_net_params = []
+        self._target_net_params = []
+
+        # build evaluate_net
+        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
+        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')
+        with tf.variable_scope('eval_net'):
+            self.q_eval = self._build_layers(self.s, self.n_actions, trainable=True)
+        with tf.name_scope('loss'):
+            self.loss = tf.reduce_sum(tf.square(self.q_target - self.q_eval))
+        with tf.name_scope('train'):
+            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
+
+        # build target_net
+        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')
+        with tf.variable_scope('target_net'):
+            self.q_next = self._build_layers(self.s_, self.n_actions, trainable=False)
+
+    def _build_layers(self, inputs, action_size, trainable):
+        layers_output = [inputs]
+        for i, n_unit in enumerate(self.hidden_layers):
+            with tf.variable_scope('layer%i' % i):
+                output = self._add_layer(
+                    layers_output[i],
+                    in_size=layers_output[i].get_shape()[1].value,
+                    out_size=n_unit,
+                    activation_function=tf.nn.relu,
+                    trainable=trainable,
+                )
+            layers_output.append(output)
+        with tf.variable_scope('output_layer'):
+            output = self._add_layer(
+                layers_output[-1],
+                in_size=layers_output[-1].get_shape()[1].value,
+                out_size=action_size,
+                activation_function=None,
+                trainable=trainable
+            )
+        return output
+
+    def _add_layer(self, inputs, in_size, out_size, activation_function=None, trainable=True):
+        # create weights and biases
+        Weights = tf.get_variable(
+            name='weights',
+            shape=[in_size, out_size],
+            trainable=trainable,
+            initializer=tf.truncated_normal_initializer(mean=0., stddev=0.3)
+        )
+        biases = tf.get_variable(
+            name='biases',
+            shape=[out_size],
+            initializer=tf.constant_initializer(0.1),
+            trainable=trainable
+        )
+
+        # record parameters
+        if trainable is True:
+            self._eval_net_params.append([Weights, biases])
+        else:
+            self._target_net_params.append([Weights, biases])
+
+        Wx_plus_b = tf.matmul(inputs, Weights) + biases
+
+        # activation function
+        if activation_function is None:
+            outputs = Wx_plus_b
+        else:
+            outputs = activation_function(Wx_plus_b)
+        return outputs
+
+    def store_transition(self, s, a, r, s_):
+        if not hasattr(self, 'memory_counter'):
+            self.memory_counter = 0
+
+        transition = np.hstack((s, [a, r], s_))
+
+        # replace the old memory with new memory
+        index = self.memory_counter % self.memory_size
+        self.memory.iloc[index, :] = transition
+
+        self.memory_counter += 1
+
     def choose_action(self, observation):
         # to have batch dimension when feed into tf placeholder
         observation = observation[np.newaxis, :]
@@ -72,6 +157,13 @@ def choose_action(self, observation):
             action = np.random.randint(0, self.n_actions)
         return action

+    def _replace_target_params(self):
+        replace_ops = []
+        for layer, params in enumerate(self._eval_net_params):
+            replace_op = [tf.assign(self._target_net_params[layer][W_b], params[W_b]) for W_b in range(2)]
+            replace_ops.append(replace_op)
+        self.sess.run(replace_ops)
+
     def learn(self):
         # check to replace target parameters
         if self.learn_step_counter % self.replace_target_iter == 0:
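
Note: _replace_target_params above is the hard target-network update: every replace_target_iter learning steps the eval-net weights and biases are copied wholesale into the frozen target net. Not part of this commit, but an equivalent way to write the same method body using the 'eval_net'/'target_net' variable scopes created in _build_net (a sketch assuming a TF1-style graph and that both scopes create their variables in the same order, which the shared _build_layers guarantees here):

t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')
# copy each eval-net variable into the matching target-net variable
self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])

Either way, building tf.assign ops inside the method adds new nodes to the graph on every call; constructing the assign ops once (for example in _build_net) and rerunning the cached ops avoids graph growth over long training runs.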
@@ -106,8 +198,8 @@ def learn(self):
         Then change q_target with the real q_target value w.r.t the q_eval's action.
         For example in:
-        sample 0, I took action 0, and the max q_target value is -1;
-        sample 1, I took action 2, and the max q_target value is -2:
+        sample 0, I took action 0, and the q_target value is -1;
+        sample 1, I took action 2, and the q_target value is -2:
         q_target =
         [[-1, 2, 3],
          [4, 5, -2]]
@@ -130,98 +222,10 @@ def learn(self):
         self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
         self.learn_step_counter += 1

-    def store_transition(self, s, a, r, s_):
-        if not hasattr(self, 'memory_counter'):
-            self.memory_counter = 0
-
-        transition = np.hstack((s, [a, r], s_))
-
-        # replace the old memory with new memory
-        index = self.memory_counter % self.memory_size
-        self.memory.iloc[index, :] = transition
-
-        self.memory_counter += 1
-
     def plot_cost(self):
         import matplotlib.pyplot as plt
         plt.plot(np.arange(len(self.cost_his)), self.cost_his)
         plt.show()

-    def _replace_target_params(self):
-        replace_ops = []
-        for layer, params in enumerate(self._eval_net_params):
-            replace_op = [tf.assign(self._target_net_params[layer][W_b], params[W_b]) for W_b in range(2)]
-            replace_ops.append(replace_op)
-        self.sess.run(replace_ops)

-    def _build_net(self):
-        # create eval and target net weights and biases separately
-        self._eval_net_params = []
-        self._target_net_params = []

-        # build evaluate_net
-        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
-        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')
-        with tf.variable_scope('eval_net'):
-            self.q_eval = self._build_layers(self.s, self.n_actions, trainable=True)
-        with tf.name_scope('loss'):
-            self.loss = tf.reduce_sum(tf.square(self.q_target - self.q_eval))
-        with tf.name_scope('train'):
-            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
-
-        # build target_net
-        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')
-        with tf.variable_scope('target_net'):
-            self.q_next = self._build_layers(self.s_, self.n_actions, trainable=False)
-
-    def _build_layers(self, inputs, action_size, trainable):
-        layers_output = [inputs]
-        for i, n_unit in enumerate(self.hidden_layers):
-            with tf.variable_scope('layer%i' % i):
-                output = self._add_layer(
-                    layers_output[i],
-                    in_size=layers_output[i].get_shape()[1].value,
-                    out_size=n_unit,
-                    activation_function=tf.nn.relu,
-                    trainable=trainable,
-                )
-            layers_output.append(output)
-        with tf.variable_scope('output_layer'):
-            output = self._add_layer(
-                layers_output[-1],
-                in_size=layers_output[-1].get_shape()[1].value,
-                out_size=action_size,
-                activation_function=None,
-                trainable=trainable
-            )
-        return output
-
-    def _add_layer(self, inputs, in_size, out_size, activation_function=None, trainable=True):
-        # create weights and biases
-        Weights = tf.get_variable(
-            name='weights',
-            shape=[in_size, out_size],
-            trainable=trainable,
-            initializer=tf.truncated_normal_initializer(mean=0., stddev=0.3)
-        )
-        biases = tf.get_variable(
-            name='biases',
-            shape=[out_size],
-            initializer=tf.constant_initializer(0.1),
-            trainable=trainable
-        )
-
-        # record parameters
-        if trainable is True:
-            self._eval_net_params.append([Weights, biases])
-        else:
-            self._target_net_params.append([Weights, biases])
-
-        Wx_plus_b = tf.matmul(inputs, Weights) + biases
-
-        # activation function
-        if activation_function is None:
-            outputs = Wx_plus_b
-        else:
-            outputs = activation_function(Wx_plus_b)
-        return outputs

Reinforcement_learning_TUT/6_OpenAI_gym/run_CartPole.py (+3 -4)
@@ -7,7 +7,6 @@

 import gym
 from RL_brain import DeepQNetwork
-import time

 env = gym.make('CartPole-v0')
 print(env.action_space)
@@ -17,7 +16,7 @@

 RL = DeepQNetwork(n_actions=env.action_space.n,
                   n_features=len(env.observation_space.high),
-                  learning_rate=0.01, e_greedy=0.99,
+                  learning_rate=0.01, e_greedy=0.9,
                   replace_target_iter=100, memory_size=2000,
                   e_greedy_increment=0.001,
                   hidden_layers=[20, 20],)
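
Note on the e_greedy change: in RL_brain.py epsilon grows by e_greedy_increment = 0.001 on every learn() call and is capped at e_greedy, and here epsilon is (by the repo's convention) the probability of picking the greedy action. Assuming epsilon starts at 0, the new cap of 0.9 is reached after roughly 0.9 / 0.001 = 900 learning steps, after which the agent still takes a random action about 10% of the time, rather than the 1% left by the old 0.99 cap.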
@@ -40,8 +39,8 @@

         # the smaller theta and closer to center the better

-        r1 = (env.x_threshold - abs(x))/env.x_threshold - 2
-        r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians
+        r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.8
+        r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians - 0.5
         reward = r1 + r2

         RL.store_transition(observation, action, reward, observation_)
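
Note on the reward shaping change: both terms are normalized to [0, 1] before the constants are subtracted, so with the cart centred and the pole upright the shaped reward is (1 - 0.8) + (1 - 0.5) = 0.7, and at either failure threshold it falls to -0.8 + (-0.5) = -1.3, instead of the old range that was shifted down by the flat -2 on r1. A standalone sketch of the arithmetic, using values assumed to match gym's CartPole-v0 defaults (the real script reads them from env.x_threshold and env.theta_threshold_radians):

# assumed CartPole-v0 thresholds
x_threshold = 2.4
theta_threshold_radians = 12 * 2 * 3.141592653589793 / 360   # ~0.21 rad

def shaped_reward(x, theta):
    # closer to the centre and more upright -> larger reward
    r1 = (x_threshold - abs(x)) / x_threshold - 0.8
    r2 = (theta_threshold_radians - abs(theta)) / theta_threshold_radians - 0.5
    return r1 + r2

print(shaped_reward(0.0, 0.0))                              # about 0.7, best case
print(shaped_reward(x_threshold, theta_threshold_radians))  # about -1.3, at the failure boundary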
