"""
This part of the code is the Q-learning brain, which is the brain of the agent.
All decisions are made in here.
+ Using Tensorflow to build the neural network.

View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
"""
@@ -60,6 +61,90 @@ def __init__(
        self.sess.run(tf.global_variables_initializer())
        self.cost_his = []

+    def _build_net(self):
+        # create eval and target net weights and biases separately
+        self._eval_net_params = []
+        self._target_net_params = []
+
+        # build evaluate_net
+        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
+        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')
+        with tf.variable_scope('eval_net'):
+            self.q_eval = self._build_layers(self.s, self.n_actions, trainable=True)
+        with tf.name_scope('loss'):
+            self.loss = tf.reduce_sum(tf.square(self.q_target - self.q_eval))
+        with tf.name_scope('train'):
+            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
+
+        # build target_net
+        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')
+        with tf.variable_scope('target_net'):
+            self.q_next = self._build_layers(self.s_, self.n_actions, trainable=False)
+
+    def _build_layers(self, inputs, action_size, trainable):
+        layers_output = [inputs]
+        for i, n_unit in enumerate(self.hidden_layers):
+            with tf.variable_scope('layer%i' % i):
+                output = self._add_layer(
+                    layers_output[i],
+                    in_size=layers_output[i].get_shape()[1].value,
+                    out_size=n_unit,
+                    activation_function=tf.nn.relu,
+                    trainable=trainable,
+                )
+            layers_output.append(output)
+        with tf.variable_scope('output_layer'):
+            output = self._add_layer(
+                layers_output[-1],
+                in_size=layers_output[-1].get_shape()[1].value,
+                out_size=action_size,
+                activation_function=None,
+                trainable=trainable
+            )
+        return output
+
+    def _add_layer(self, inputs, in_size, out_size, activation_function=None, trainable=True):
+        # create weights and biases
+        Weights = tf.get_variable(
+            name='weights',
+            shape=[in_size, out_size],
+            trainable=trainable,
+            initializer=tf.truncated_normal_initializer(mean=0., stddev=0.3)
+        )
+        biases = tf.get_variable(
+            name='biases',
+            shape=[out_size],
+            initializer=tf.constant_initializer(0.1),
+            trainable=trainable
+        )
+
+        # record parameters
+        if trainable is True:
+            self._eval_net_params.append([Weights, biases])
+        else:
+            self._target_net_params.append([Weights, biases])
+
+        Wx_plus_b = tf.matmul(inputs, Weights) + biases
+
+        # activation function
+        if activation_function is None:
+            outputs = Wx_plus_b
+        else:
+            outputs = activation_function(Wx_plus_b)
+        return outputs
+
+    def store_transition(self, s, a, r, s_):
+        if not hasattr(self, 'memory_counter'):
+            self.memory_counter = 0
+
+        transition = np.hstack((s, [a, r], s_))
+
+        # replace the old memory with new memory
+        index = self.memory_counter % self.memory_size
+        self.memory.iloc[index, :] = transition
+
+        self.memory_counter += 1
+
    def choose_action(self, observation):
        # to have batch dimension when feed into tf placeholder
        observation = observation[np.newaxis, :]
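store_transition packs each experience into a single flat row with np.hstack((s, [a, r], s_)). Below is a self-contained NumPy sketch of that row layout, plus one plausible way a sampled batch would be split back apart later; the exact unpacking used in learn() is not shown in this hunk, so the indices here are an assumption based on the layout.

import numpy as np

n_features = 2
s = np.array([0.1, 0.2])    # current state
a, r = 1, -1.0              # action taken, reward received
s_ = np.array([0.3, 0.4])   # next state

# One memory row: [s | a, r | s_]  ->  length n_features * 2 + 2
transition = np.hstack((s, [a, r], s_))
print(transition)           # [ 0.1  0.2  1.  -1.   0.3  0.4]

# Unpacking a sampled batch back into its parts (index layout is assumed):
batch = transition[np.newaxis, :]              # pretend batch of size 1
batch_s = batch[:, :n_features]                # states
batch_a = batch[:, n_features].astype(int)     # actions
batch_r = batch[:, n_features + 1]             # rewards
batch_s_ = batch[:, -n_features:]              # next states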
@@ -72,6 +157,13 @@ def choose_action(self, observation):
            action = np.random.randint(0, self.n_actions)
        return action

+    def _replace_target_params(self):
+        replace_ops = []
+        for layer, params in enumerate(self._eval_net_params):
+            replace_op = [tf.assign(self._target_net_params[layer][W_b], params[W_b]) for W_b in range(2)]
+            replace_ops.append(replace_op)
+        self.sess.run(replace_ops)
+
    def learn(self):
        # check to replace target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
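_replace_target_params syncs the target net by running tf.assign ops that copy each eval-net tensor into its target-net counterpart. A self-contained TensorFlow 1.x sketch of that copy pattern on two standalone variables (not the class's own parameter lists):

import tensorflow as tf

eval_w = tf.Variable([[1.0, 2.0]], name='eval_w')
target_w = tf.Variable([[0.0, 0.0]], name='target_w', trainable=False)

# Building the op once is enough; every sess.run(replace_op) re-copies the current value.
replace_op = tf.assign(target_w, eval_w)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(target_w))   # [[0. 0.]]
    sess.run(replace_op)
    print(sess.run(target_w))   # [[1. 2.]]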
@@ -106,8 +198,8 @@ def learn(self):

        Then change q_target with the real q_target value w.r.t the q_eval's action.
        For example in:
-            sample 0, I took action 0, and the max q_target value is -1;
-            sample 1, I took action 2, and the max q_target value is -2:
+            sample 0, I took action 0, and the q_target value is -1;
+            sample 1, I took action 2, and the q_target value is -2:
        q_target =
        [[-1, 2, 3],
         [4, 5, -2]]
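The example above can be reproduced with a few lines of NumPy: copy q_eval, then overwrite only the entries of the actions that were actually taken with reward + gamma * max(q_next). The q_next values and rewards below are made up so the result matches the [[-1, 2, 3], [4, 5, -2]] example, and the variable names are illustrative rather than the exact ones used in learn().

import numpy as np

gamma = 0.9
q_eval = np.array([[1.0, 2.0, 3.0],
                   [4.0, 5.0, 6.0]])       # eval net output for the sampled batch
q_next = np.array([[0.0, 1.0, 2.0],
                   [-3.0, -4.0, -5.0]])    # target net output for the next states (made up)

actions = np.array([0, 2])                 # sample 0 took action 0, sample 1 took action 2
rewards = np.array([-2.8, 0.7])            # made-up rewards chosen to hit -1 and -2

q_target = q_eval.copy()
batch_index = np.arange(q_eval.shape[0])
q_target[batch_index, actions] = rewards + gamma * np.max(q_next, axis=1)

print(q_target)
# [[-1.  2.  3.]
#  [ 4.  5. -2.]]
# Entries for actions that were not taken stay equal to q_eval, so
# (q_target - q_eval) is zero there and they contribute nothing to the loss.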
@@ -130,98 +222,10 @@ def learn(self):
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

-    def store_transition(self, s, a, r, s_):
-        if not hasattr(self, 'memory_counter'):
-            self.memory_counter = 0
-
-        transition = np.hstack((s, [a, r], s_))
-
-        # replace the old memory with new memory
-        index = self.memory_counter % self.memory_size
-        self.memory.iloc[index, :] = transition
-
-        self.memory_counter += 1
-
    def plot_cost(self):
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.show()

-    def _replace_target_params(self):
-        replace_ops = []
-        for layer, params in enumerate(self._eval_net_params):
-            replace_op = [tf.assign(self._target_net_params[layer][W_b], params[W_b]) for W_b in range(2)]
-            replace_ops.append(replace_op)
-        self.sess.run(replace_ops)

-    def _build_net(self):
-        # create eval and target net weights and biases separately
-        self._eval_net_params = []
-        self._target_net_params = []

-        # build evaluate_net
-        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
-        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')
-        with tf.variable_scope('eval_net'):
-            self.q_eval = self._build_layers(self.s, self.n_actions, trainable=True)
-        with tf.name_scope('loss'):
-            self.loss = tf.reduce_sum(tf.square(self.q_target - self.q_eval))
-        with tf.name_scope('train'):
-            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
-
-        # build target_net
-        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')
-        with tf.variable_scope('target_net'):
-            self.q_next = self._build_layers(self.s_, self.n_actions, trainable=False)
-
-    def _build_layers(self, inputs, action_size, trainable):
-        layers_output = [inputs]
-        for i, n_unit in enumerate(self.hidden_layers):
-            with tf.variable_scope('layer%i' % i):
-                output = self._add_layer(
-                    layers_output[i],
-                    in_size=layers_output[i].get_shape()[1].value,
-                    out_size=n_unit,
-                    activation_function=tf.nn.relu,
-                    trainable=trainable,
-                )
-            layers_output.append(output)
-        with tf.variable_scope('output_layer'):
-            output = self._add_layer(
-                layers_output[-1],
-                in_size=layers_output[-1].get_shape()[1].value,
-                out_size=action_size,
-                activation_function=None,
-                trainable=trainable
-            )
-        return output
-
-    def _add_layer(self, inputs, in_size, out_size, activation_function=None, trainable=True):
-        # create weights and biases
-        Weights = tf.get_variable(
-            name='weights',
-            shape=[in_size, out_size],
-            trainable=trainable,
-            initializer=tf.truncated_normal_initializer(mean=0., stddev=0.3)
-        )
-        biases = tf.get_variable(
-            name='biases',
-            shape=[out_size],
-            initializer=tf.constant_initializer(0.1),
-            trainable=trainable
-        )
-
-        # record parameters
-        if trainable is True:
-            self._eval_net_params.append([Weights, biases])
-        else:
-            self._target_net_params.append([Weights, biases])
-
-        Wx_plus_b = tf.matmul(inputs, Weights) + biases
-
-        # activation function
-        if activation_function is None:
-            outputs = Wx_plus_b
-        else:
-            outputs = activation_function(Wx_plus_b)
-        return outputs
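Because the two networks are built under the variable scopes 'eval_net' and 'target_net', a common TensorFlow 1.x alternative to the hand-maintained _eval_net_params/_target_net_params lists is to collect the variables by scope and zip them together. This is a design-choice sketch, not what this commit does.

import tensorflow as tf

with tf.variable_scope('eval_net'):
    tf.get_variable('weights', shape=[2, 3])
    tf.get_variable('biases', shape=[3])
with tf.variable_scope('target_net'):
    tf.get_variable('weights', shape=[2, 3], trainable=False)
    tf.get_variable('biases', shape=[3], trainable=False)

e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')

# One list of assign ops that syncs every target variable with its eval counterpart;
# this relies on both scopes declaring their variables in the same order.
replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(replace_target_op)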