@@ -1,5 +1,5 @@
 """
-Asynchronous Advantage Actor Critic (A3C), Reinforcement Learning.
+Asynchronous Advantage Actor Critic (A3C) with continuous action space, Reinforcement Learning.
 
 The Pendulum example. Version 1: convergence promised
 
@@ -22,19 +22,19 @@
 tf.set_random_seed(2)  # reproducible
 
 GAME = 'Pendulum-v0'
-OUTPUT_GRAPH = False
+OUTPUT_GRAPH = True
 LOG_DIR = './log'
 N_WORKERS = multiprocessing.cpu_count()
 MAX_EP_STEP = 300
-MAX_GLOBAL_EP = 800
+MAX_GLOBAL_EP = 1000
 GLOBAL_NET_SCOPE = 'Global_Net'
 UPDATE_GLOBAL_ITER = 5
 GAMMA = 0.9
+ENTROPY_BETA = 0.01
 LR_A = 0.001  # learning rate for actor
-LR_C = 0.002  # learning rate for critic
+LR_C = 0.001  # learning rate for critic
 
 env = gym.make(GAME)
-env.seed(1)  # reproducible
 
 N_S = env.observation_space.shape[0]
 N_A = env.action_space.shape[0]
@@ -57,23 +57,29 @@ def __init__(self, scope, n_s, n_a,
         with tf.variable_scope(scope):
             self.s = tf.placeholder(tf.float32, [None, n_s], 'S')
             self.a_his = tf.placeholder(tf.float32, [None, n_a], 'A')
-            self.v_target = tf.placeholder(tf.float32, [None, 1], 'R')
+            self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
 
             mu, sigma, self.v = self._build_net(n_a)
 
             td = tf.subtract(self.v_target, self.v, name='TD_error')
             with tf.name_scope('c_loss'):
-                self.c_loss = tf.reduce_mean(tf.square(td))
-            self.mu, self.sigma = tf.squeeze(mu * a_bound[1]), tf.squeeze(sigma + 1e-2)
-            self.normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma)
+                self.c_loss = tf.reduce_sum(tf.square(td))
+
+            with tf.name_scope('wrap_a_out'):
+                mu, sigma = mu * a_bound[1], sigma * 2 + 1e-2
+                self.test = sigma[0]
+
+            normal_dist = tf.contrib.distributions.Normal(mu, sigma)
+
             with tf.name_scope('a_loss'):
-                log_prob = self.normal_dist.log_prob(self.a_his)
-                self.exp_v = tf.reduce_mean(log_prob * td)
-                self.exp_v += 0.01 * self.normal_dist.entropy()  # encourage exploration
+                log_prob = normal_dist.log_prob(self.a_his)
+                exp_v = log_prob * td
+                entropy = normal_dist.entropy()  # encourage exploration
+                self.exp_v = tf.reduce_sum(ENTROPY_BETA * entropy + exp_v)
                 self.a_loss = -self.exp_v
 
             with tf.name_scope('choose_a'):  # use local params to choose action
-                self.A = tf.clip_by_value(self.normal_dist.sample([1]), a_bound[0], a_bound[1])
+                self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), a_bound[0], a_bound[1])
             with tf.name_scope('local_grad'):
                 self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
                 self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
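For intuition, here is a minimal NumPy sketch of the per-sample objective assembled above: the log-probability of the taken action under a Gaussian policy, weighted by the TD error, plus an entropy bonus. It is illustrative only; the closed-form log-prob/entropy helpers and the sample numbers are assumptions, not taken from the file.

import numpy as np

def gaussian_log_prob(a, mu, sigma):
    # log N(a; mu, sigma^2)
    return -0.5 * np.log(2 * np.pi * sigma ** 2) - (a - mu) ** 2 / (2 * sigma ** 2)

def gaussian_entropy(sigma):
    # differential entropy of N(mu, sigma^2)
    return 0.5 * np.log(2 * np.pi * np.e * sigma ** 2)

def actor_objective(a, mu, sigma, td, beta=0.01):
    # exp_v = log_prob * td; the loss minimized is -(beta * entropy + exp_v)
    return beta * gaussian_entropy(sigma) + gaussian_log_prob(a, mu, sigma) * td

print(actor_objective(a=0.5, mu=0.2, sigma=0.8, td=1.3))  # ~ -0.98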
@@ -91,11 +97,11 @@ def __init__(self, scope, n_s, n_a,
     def _build_net(self, n_a):
         w_init = tf.random_normal_initializer(0., .1)
         with tf.variable_scope('actor'):
-            l_a = tf.layers.dense(self.s, 100, tf.nn.relu, kernel_initializer=w_init, name='la')
+            l_a = tf.layers.dense(self.s, 50, tf.nn.relu, kernel_initializer=w_init, name='la')
             mu = tf.layers.dense(l_a, n_a, tf.nn.tanh, kernel_initializer=w_init, name='mu')
-            sigma = tf.layers.dense(l_a, n_a, tf.nn.softplus, kernel_initializer=w_init, name='sigma')
+            sigma = tf.layers.dense(l_a, n_a, tf.nn.sigmoid, kernel_initializer=w_init, name='sigma')
         with tf.variable_scope('critic'):
-            l_c = tf.layers.dense(self.s, 60, tf.nn.relu, kernel_initializer=w_init, name='lc')
+            l_c = tf.layers.dense(self.s, 50, tf.nn.relu, kernel_initializer=w_init, name='lc')
             v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v')  # state value
         return mu, sigma, v
 
@@ -107,7 +113,7 @@ def pull_global(self):  # run by a local
 
     def choose_action(self, s):  # run by a local
         s = s[np.newaxis, :]
-        return self.sess.run(self.A, {self.s: s})
+        return self.sess.run(self.A, {self.s: s})[0]
 
 
 class Worker(object):
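A small illustrative check (not part of the file) of why choose_action now takes [0]: sampling produces a (1, n_a) batch, and indexing the first row yields the flat action vector that Gym's env.step() expects.

import numpy as np

batch_action = np.array([[0.37]])  # shape (1, 1): one sampled action for one state
action = batch_action[0]           # shape (1,): suitable for env.step(action)
print(action)                      # [0.37]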
@@ -119,7 +125,7 @@ def __init__(self, env, name, n_s, n_a, a_bound, sess, opt_a, opt_c, g_a_params,
 
     def work(self, update_iter, max_ep_step, gamma, coord):
        total_step = 1
-        buffer_s, buffer_a, buffer_r, buffer_s_ = [], [], [], []
+        buffer_s, buffer_a, buffer_r = [], [], []
         while not coord.should_stop() and GLOBAL_EP.eval(self.sess) < MAX_GLOBAL_EP:
             s = self.env.reset()
             ep_r = 0
@@ -128,27 +134,31 @@ def work(self, update_iter, max_ep_step, gamma, coord):
                 self.env.render()
                 a = self.AC.choose_action(s)
                 s_, r, done, info = self.env.step(a)
-                r /= 10
+                r /= 10  # normalize reward
                 ep_r += r
                 buffer_s.append(s)
                 buffer_a.append(a)
                 buffer_r.append(r)
-                buffer_s_.append(s_)
 
                 if total_step % update_iter == 0 or done:  # update global and assign to local net
-                    buffer_s, buffer_a, buffer_r, buffer_s_ = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_r), np.vstack(buffer_s_)
-
-                    v_next = self.sess.run(self.AC.v, {self.AC.s: buffer_s_})
-                    if done: v_next[-1, 0] = 0
-                    v_target = buffer_r + gamma * v_next
-
+                    if done:
+                        v_s_ = 0  # terminal
+                    else:
+                        v_s_ = self.sess.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
+                    buffer_v_target = []
+                    for r in buffer_r[::-1]:  # reverse buffer r
+                        v_s_ = r + gamma * v_s_
+                        buffer_v_target.append(v_s_)
+                    buffer_v_target.reverse()
+
+                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
                     feed_dict = {
                         self.AC.s: buffer_s,
                         self.AC.a_his: buffer_a,
-                        self.AC.v_target: v_target,
+                        self.AC.v_target: buffer_v_target,
                     }
                     self.AC.update_global(feed_dict)
-                    buffer_s, buffer_a, buffer_r, buffer_s_ = [], [], [], []
+                    buffer_s, buffer_a, buffer_r = [], [], []
                     self.AC.pull_global()
 
                 s = s_
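For reference, a standalone sketch (illustrative, not part of the commit) of the reversed discounted-return computation the worker now performs before each global update; gamma and the sample rewards are made up.

def discounted_v_targets(rewards, bootstrap_v, gamma=0.9):
    # bootstrap_v is V(s_) from the critic, or 0 at a terminal state
    v_s_ = bootstrap_v
    targets = []
    for r in reversed(rewards):   # walk the reward buffer backwards
        v_s_ = r + gamma * v_s_
        targets.append(v_s_)
    targets.reverse()
    return targets

print(discounted_v_targets([1.0, 0.0, 2.0], bootstrap_v=0.5))
# -> [2.9845, 2.205, 2.45]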
@@ -168,8 +178,8 @@ def work(self, update_iter, max_ep_step, gamma, coord):
 with tf.device("/cpu:0"):
     GLOBAL_EP = tf.Variable(0, dtype=tf.int32, name='global_ep', trainable=False)
     COUNT_GLOBAL_EP = tf.assign(GLOBAL_EP, tf.add(GLOBAL_EP, tf.constant(1), name='step_ep'))
-    OPT_A = tf.train.RMSPropOptimizer(LR_A)
-    OPT_C = tf.train.RMSPropOptimizer(LR_C)
+    OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
+    OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
     globalAC = ACNet(GLOBAL_NET_SCOPE, N_S, N_A)  # we only need its params
     workers = []
     # Create worker
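The rest of the file is not shown in this diff; the following is only a hedged sketch of the usual launch pattern for these workers (threading plus tf.train.Coordinator). The work() signature matches the hunks above, but the surrounding wiring (the workers list, session setup) is assumed.

import threading

COORD = tf.train.Coordinator()
worker_threads = []
for worker in workers:            # each Worker wraps its own env and local ACNet
    t = threading.Thread(target=worker.work,
                         args=(UPDATE_GLOBAL_ITER, MAX_EP_STEP, GAMMA, COORD))
    t.start()
    worker_threads.append(t)
COORD.join(worker_threads)        # block until all workers have stopped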