np.random.seed(1)
tf.set_random_seed(1)

+ MAX_EPISODES = 2000
+ LR_A = 0.0002    # learning rate for actor
+ LR_C = 0.0002    # learning rate for critic
+ GAMMA = 0.9999   # reward discount
+ REPLACE_ITER_A = 1700
+ REPLACE_ITER_C = 1500
+ MEMORY_CAPACITY = 200000
+ BATCH_SIZE = 32
+ DISPLAY_THRESHOLD = 100    # display until the running reward > 100
+ DATA_PATH = './data'
+ LOAD_MODEL = False
+ SAVE_MODEL_ITER = 50000
+ RENDER = False
+ OUTPUT_GRAPH = False
+ ENV_NAME = 'BipedalWalker-v2'
+
+ GLOBAL_STEP = tf.Variable(0, trainable=False)
+ INCREASE_GS = GLOBAL_STEP.assign(tf.add(GLOBAL_STEP, 1))
+ LR_A = tf.train.exponential_decay(LR_A, GLOBAL_STEP, 10000, .97, staircase=True)
+ LR_C = tf.train.exponential_decay(LR_C, GLOBAL_STEP, 10000, .97, staircase=True)
+ END_POINT = (200 - 10) * (14 / 30)    # from game
+
+ env = gym.make(ENV_NAME)
+ env.seed(1)
+
+ STATE_DIM = env.observation_space.shape[0]    # 24
+ ACTION_DIM = env.action_space.shape[0]        # 4
+ ACTION_BOUND = env.action_space.high          # [1, 1, 1, 1]
+
+ # all placeholder for tf
+ with tf.name_scope('S'):
+     S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
+ with tf.name_scope('A'):
+     A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a')
+ with tf.name_scope('R'):
+     R = tf.placeholder(tf.float32, [None, 1], name='r')
+ with tf.name_scope('S_'):
+     S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')
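
The two exponential_decay ops added above shrink both learning rates by a factor of 0.97 every 10000 global steps (staircase mode uses floor division). A minimal plain-Python sketch of the same arithmetic, for reference only:

def decayed_lr(base_lr, global_step, decay_steps=10000, decay_rate=0.97):
    # equivalent arithmetic of tf.train.exponential_decay(..., staircase=True)
    return base_lr * decay_rate ** (global_step // decay_steps)

print(decayed_lr(0.0002, 25000))   # two full decay periods have passed -> 0.0002 * 0.97 ** 2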

###############################  Actor  ####################################

@@ -31,14 +69,13 @@ def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter

    def _build_net(self, s, scope, trainable):
        with tf.variable_scope(scope):
-            init_w = tf.random_normal_initializer(0., 0.1)
-            init_b = tf.constant_initializer(0.1)
-            net = tf.layers.dense(s, 400, activation=tf.nn.relu,
-                                  kernel_initializer=init_w, bias_initializer=init_b, name='l1',
-                                  trainable=trainable)
-            net = tf.layers.dense(net, 20, activation=tf.nn.relu,
-                                  kernel_initializer=init_w, bias_initializer=init_b, name='l2',
-                                  trainable=trainable)
+            init_w = tf.random_normal_initializer(0., 0.01)
+            init_b = tf.constant_initializer(0.01)
+            net = tf.layers.dense(s, 500, activation=tf.nn.relu,
+                                  kernel_initializer=init_w, bias_initializer=init_b, name='l1', trainable=trainable)
+            net = tf.layers.dense(net, 200, activation=tf.nn.relu,
+                                  kernel_initializer=init_w, bias_initializer=init_b, name='l2', trainable=trainable)
+
            with tf.variable_scope('a'):
                actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
                                          bias_initializer=init_b, name='a', trainable=trainable)

@@ -107,19 +144,19 @@ def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_

    def _build_net(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
-            init_w = tf.random_normal_initializer(0., 0.1)
-            init_b = tf.constant_initializer(0.1)
+            init_w = tf.random_normal_initializer(0., 0.01)
+            init_b = tf.constant_initializer(0.01)

            with tf.variable_scope('l1'):
-                n_l1 = 400
+                n_l1 = 700
+                # combine the action and states together in this way
                w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
                w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
                net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            with tf.variable_scope('l2'):
                net = tf.layers.dense(net, 20, activation=tf.nn.relu, kernel_initializer=init_w,
                                      bias_initializer=init_b, name='l2', trainable=trainable)
-
            with tf.variable_scope('q'):
                q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable)   # Q(s,a)
            return q
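
The critic's first layer merges state and action by giving each input its own weight matrix and summing the projections, which is mathematically the same as one dense layer applied to the concatenation [s, a]. A small NumPy sketch of that equivalence (toy random tensors, not values from the network):

import numpy as np

s = np.random.randn(32, 24)     # batch of states  (STATE_DIM = 24)
a = np.random.randn(32, 4)      # batch of actions (ACTION_DIM = 4)
w1_s = np.random.randn(24, 700)
w1_a = np.random.randn(4, 700)

split = s @ w1_s + a @ w1_a                           # what l1 computes before the bias and ReLU
joint = np.hstack((s, a)) @ np.vstack((w1_s, w1_a))   # a single dense layer on [s, a]
print(np.allclose(split, joint))                      # True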

@@ -217,7 +254,7 @@ class Memory(object):  # stored as (s, a, r, s_) in SumTree
    epsilon = 0.001  # small amount to avoid zero priority
    alpha = 0.6  # [0~1] convert the importance of TD error to priority
    beta = 0.4  # importance-sampling, from initial value increasing to 1
-    beta_increment_per_sampling = 1e-4  # annealing the bias
+    beta_increment_per_sampling = 1e-5  # annealing the bias
    abs_err_upper = 1  # for stability refer to paper

    def __init__(self, capacity):
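
These constants follow the prioritized-experience-replay scheme: each transition gets priority (|TD error| + epsilon) ** alpha, clipped at abs_err_upper, and the sampling bias is corrected with importance weights (N * P(i)) ** -beta, where beta is annealed toward 1 by beta_increment_per_sampling. A minimal NumPy sketch of those two formulas, assuming toy TD errors in place of the SumTree machinery:

import numpy as np

epsilon, alpha, beta, abs_err_upper = 0.001, 0.6, 0.4, 1.
td_errors = np.array([0.5, 0.05, 2.0])

p = np.power(np.minimum(np.abs(td_errors) + epsilon, abs_err_upper), alpha)  # clipped priority, as in _get_priority
probs = p / p.sum()                            # sampling probability P(i)
is_weights = np.power(len(p) * probs, -beta)   # importance-sampling correction
is_weights /= is_weights.max()                 # normalize so the largest weight is 1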

@@ -268,48 +305,11 @@ def _get_priority(self, error):
        return np.power(clipped_error, self.alpha)

- MAX_EPISODES = 2000
- LR_A = 0.0001  # learning rate for actor
- LR_C = 0.0001  # learning rate for critic
- GAMMA = 0.999  # reward discount
- REPLACE_ITER_A = 1700
- REPLACE_ITER_C = 1500
- MEMORY_CAPACITY = 200000
- BATCH_SIZE = 32
- DISPLAY_THRESHOLD = 60
- DATA_PATH = './data'
- LOAD_MODEL = True
- SAVE_MODEL_ITER = 50000
- RENDER = False
- OUTPUT_GRAPH = False
- ENV_NAME = 'BipedalWalker-v2'
-
- GLOBAL_STEP = tf.Variable(0, trainable=False)
- INCREASE_GS = GLOBAL_STEP.assign(tf.add(GLOBAL_STEP, 1))
- END_POINT = (200 - 10) * (14 / 30)    # from game
-
- env = gym.make(ENV_NAME)
- env.seed(1)
-
- state_dim = env.observation_space.shape[0]    # 24
- action_dim = env.action_space.shape[0]        # 4
- action_bound = env.action_space.high          # [1, 1, 1, 1]
-
- # all placeholder for tf
- with tf.name_scope('S'):
-     S = tf.placeholder(tf.float32, shape=[None, state_dim], name='s')
- with tf.name_scope('A'):
-     A = tf.placeholder(tf.float32, shape=[None, action_dim], name='a')
- with tf.name_scope('R'):
-     R = tf.placeholder(tf.float32, [None, 1], name='r')
- with tf.name_scope('S_'):
-     S_ = tf.placeholder(tf.float32, shape=[None, state_dim], name='s_')
-
sess = tf.Session()

# Create actor and critic.
- actor = Actor(sess, action_dim, action_bound, LR_A, REPLACE_ITER_A)
- critic = Critic(sess, state_dim, action_dim, LR_C, GAMMA, REPLACE_ITER_C, actor.a_)
+ actor = Actor(sess, ACTION_DIM, ACTION_BOUND, LR_A, REPLACE_ITER_A)
+ critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a_)
actor.add_grad_to_graph(critic.a_grads)

M = Memory(MEMORY_CAPACITY)
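
actor.add_grad_to_graph(critic.a_grads) wires up the deterministic policy gradient: the critic hands over dQ/da and the actor chains it through da/dtheta via the grad_ys argument of tf.gradients. A self-contained toy sketch of that chaining on made-up tensors (none of these names or shapes come from the classes above):

import numpy as np
import tensorflow as tf

s_toy = tf.placeholder(tf.float32, [None, 3])
theta = tf.Variable(tf.random_normal([3, 1]))           # stand-in 'actor' parameters
a_toy = tf.matmul(s_toy, theta)                          # deterministic policy mu(s)
q_toy = -tf.reduce_sum(tf.square(a_toy - 2.))            # stand-in critic Q(s, a)
dq_da = tf.gradients(q_toy, a_toy)[0]                    # what the critic supplies (a_grads)
chained = tf.gradients(a_toy, [theta], grad_ys=dq_da)    # chain rule: dQ/dtheta = dQ/da * da/dtheta
direct = tf.gradients(q_toy, [theta])                    # same gradient computed directly

with tf.Session() as toy_sess:
    toy_sess.run(tf.global_variables_initializer())
    g1, g2 = toy_sess.run([chained, direct], {s_toy: np.random.randn(5, 3)})
    print(np.allclose(g1[0], g2[0]))                     # True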

@@ -328,7 +328,7 @@ def _get_priority(self, error):
    tf.summary.FileWriter('logs', graph=sess.graph)

var = 3  # control exploration
- var_min = 0.008
+ var_min = 0.001

for i_episode in range(MAX_EPISODES):
    # s = (hull angle speed, angular velocity, horizontal speed, vertical speed, position of joints and joints angular speed, legs contact with ground, and 10 lidar rangefinder measurements.)

@@ -342,17 +342,19 @@ def _get_priority(self, error):
        s_, r, done, _ = env.step(a)    # r = total 300+ points up to the far end. If the robot falls, it gets -100.

        if r == -100: r = -2
+        ep_r += r
+
        transition = np.hstack((s, a, [r], s_))
        max_p = np.max(M.tree.tree[-M.tree.capacity:])
        M.store(max_p, transition)

        if GLOBAL_STEP.eval(sess) > MEMORY_CAPACITY / 20:
-            var = max([var * 0.99995, var_min])    # decay the action randomness
+            var = max([var * 0.9999, var_min])    # decay the action randomness
            tree_idx, b_M, ISWeights = M.prio_sample(BATCH_SIZE)    # for critic update
-            b_s = b_M[:, :state_dim]
-            b_a = b_M[:, state_dim: state_dim + action_dim]
-            b_r = b_M[:, -state_dim - 1: -state_dim]
-            b_s_ = b_M[:, -state_dim:]
+            b_s = b_M[:, :STATE_DIM]
+            b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]
+            b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM]
+            b_s_ = b_M[:, -STATE_DIM:]

            abs_td = critic.learn(b_s, b_a, b_r, b_s_, ISWeights)
            actor.learn(b_s, b_a)
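
Each stored transition is the flat vector np.hstack((s, a, [r], s_)) with 24 + 4 + 1 + 24 = 53 entries, and the b_s / b_a / b_r / b_s_ slices above simply cut a sampled batch back into those pieces. A small round-trip sketch with toy values (not real environment data):

import numpy as np

STATE_DIM, ACTION_DIM = 24, 4
s, a, r, s_ = np.ones(STATE_DIM), 2 * np.ones(ACTION_DIM), 0.5, 3 * np.ones(STATE_DIM)
transition = np.hstack((s, a, [r], s_))            # shape (53,)

b_M = transition[None, :]                          # pretend this is a sampled batch of size 1
b_s = b_M[:, :STATE_DIM]                           # columns 0-23   -> s
b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]    # columns 24-27  -> a
b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM]           # column 28      -> r, kept 2-D for the critic
b_s_ = b_M[:, -STATE_DIM:]                         # columns 29-52  -> s_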

@@ -379,9 +381,10 @@ def _get_priority(self, error):
                  '| Epi_r: %.2f' % ep_r,
                  '| Exploration: %.3f' % var,
                  '| Pos: %.i' % int(env.unwrapped.hull.position[0]),
+                  '| LR_A: %.6f' % sess.run(LR_A),
+                  '| LR_C: %.6f' % sess.run(LR_C),
                  )
            break

        s = s_
-        ep_r += r
        sess.run(INCREASE_GS)