@@ -1,7 +1,7 @@
 """
 Actor-Critic with continuous action using TD-error as the Advantage, Reinforcement Learning.
 
-The cart pole example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/CliffWalk%20Actor%20Critic%20Solution.ipynb)
+The cart pole example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb)
 
 Cannot converge!!!
 
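For reference, the TD-error-as-advantage update that both versions of the script implement, written out for a single transition in plain Python (the numbers are illustrative only; gamma matches the script's GAMMA = 0.9):

    gamma = 0.9
    r, v_s, v_s_next = -1.0, 0.5, 0.4        # reward, critic value of s, critic value of s_
    td_error = r + gamma * v_s_next - v_s    # advantage estimate handed to the actor

    log_prob = -0.9                          # log pi(a|s) under the Gaussian policy
    actor_objective = log_prob * td_error    # the actor ascends this ("exp_v" below)
    critic_loss = td_error ** 2              # the critic descends this
    print(td_error, actor_objective, critic_loss)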
@@ -21,18 +21,17 @@
 
 
 class Actor(object):
-    def __init__(self, sess, n_features, action_range, lr=0.0001):
+    def __init__(self, sess, n_features, action_bound, lr=0.0001):
         self.sess = sess
-        with tf.name_scope('inputs'):
-            self.state = tf.placeholder(tf.float32, [n_features, ], "state")
-            state = tf.expand_dims(self.state, axis=0)
-            self.act = tf.placeholder(tf.float32, name="act")
-            self.advantage = tf.placeholder(tf.float32, name="adv")  # TD_error
+
+        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
+        self.a = tf.placeholder(tf.float32, None, name="act")
+        self.td_error = tf.placeholder(tf.float32, None, name="td_error")  # TD_error
 
         l1 = tf.layers.dense(
-            inputs=state,
+            inputs=self.s,
             units=30,  # number of hidden units
-            activation=None,
+            activation=tf.nn.relu,
             kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
             bias_initializer=tf.constant_initializer(0.1),  # biases
             name='l1'
@@ -50,78 +49,83 @@ def __init__(self, sess, n_features, action_range, lr=0.0001):
         sigma = tf.layers.dense(
             inputs=l1,
             units=1,  # output units
-            activation=tf.nn.relu,  # get action probabilities
+            activation=tf.nn.sigmoid,  # get action probabilities
             kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
             bias_initializer=tf.constant_initializer(1.),  # biases
             name='sigma'
         )
-
-        self.mu, self.sigma = tf.squeeze(mu*2), tf.squeeze(sigma+1e-2)
+        global_step = tf.Variable(0, trainable=False)
+        # self.e = epsilon = tf.train.exponential_decay(2., global_step, 1000, 0.9)
+        self.mu, self.sigma = tf.squeeze(mu*2), tf.squeeze(sigma+0.1)
         self.normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma)
 
-        self.action = tf.clip_by_value(self.normal_dist.sample(1), action_range[0], action_range[1])
+        self.action = tf.clip_by_value(self.normal_dist.sample(1), action_bound[0], action_bound[1])
 
-        with tf.name_scope('loss'):
-            neg_log_prob = -self.normal_dist.log_prob(self.act)  # loss without advantage
-            self.loss = neg_log_prob * self.advantage  # advantage (TD_error) guided loss
+        with tf.name_scope('exp_v'):
+            log_prob = self.normal_dist.log_prob(self.a)  # loss without advantage
+            self.exp_v = log_prob * self.td_error  # advantage (TD_error) guided loss
             # Add cross entropy cost to encourage exploration
-            self.loss -= 1e-1 * self.normal_dist.entropy()
+            self.exp_v += self.normal_dist.entropy()
 
         with tf.name_scope('train'):
-            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
+            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step)  # min(v) = max(-v)
 
-    def update(self, s, a, adv):
-        feed_dict = {self.state: s, self.act: a, self.advantage: adv}
-        _, loss = self.sess.run([self.train_op, self.loss], feed_dict)
-        return loss
+    def learn(self, s, a, td):
+        s = s[np.newaxis, :]
+        feed_dict = {self.s: s, self.a: a, self.td_error: td}
+        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
+        return exp_v
 
     def choose_action(self, s):
-        return self.sess.run([self.action, self.mu, self.sigma], {self.state: s})  # get probabilities for all actions
+        s = s[np.newaxis, :]
+        return self.sess.run(self.action, {self.s: s})  # get probabilities for all actions
 
 
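For intuition, what the actor's sampling head above computes, reproduced with NumPy outside the TensorFlow graph; the raw layer outputs and the +/-2 clipping range here are illustrative assumptions, not values from the commit:

    import numpy as np

    rng = np.random.default_rng(1)
    mu_raw, sigma_raw = 0.4, 0.7                   # assumed outputs of the 'mu' and 'sigma' layers
    mu, sigma = mu_raw * 2, sigma_raw + 0.1        # scale the mean, keep sigma strictly positive
    a = np.clip(rng.normal(mu, sigma), -2.0, 2.0)  # sample from N(mu, sigma), clip to the action bound
    print(a)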
 class Critic(object):
     def __init__(self, sess, n_features, lr=0.01):
         self.sess = sess
         with tf.name_scope('inputs'):
-            self.state = tf.placeholder(tf.float32, [n_features, ], "state")
-            state = tf.expand_dims(self.state, axis=0)
-            self.target = tf.placeholder(dtype=tf.float32, name="target")  # TD target=r+gamma*V_next
+            self.s = tf.placeholder(tf.float32, [1, n_features], "state")
+            self.v_ = tf.placeholder(tf.float32, [1, 1], name="v_next")
+            self.r = tf.placeholder(tf.float32, name='r')
 
         with tf.variable_scope('Critic'):
             l1 = tf.layers.dense(
-                inputs=state,
+                inputs=self.s,
                 units=30,  # number of hidden units
-                activation=None,
+                activation=tf.nn.relu,
                 kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                 bias_initializer=tf.constant_initializer(0.1),  # biases
                 name='l1'
             )
 
-            self.eval = tf.layers.dense(
+            self.v = tf.layers.dense(
                 inputs=l1,
                 units=1,  # output units
                 activation=None,
                 kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                 bias_initializer=tf.constant_initializer(0.1),  # biases
-                name='l2'
+                name='V'
             )
 
-        with tf.name_scope('loss'):
-            self.loss = tf.reduce_mean(tf.squared_difference(self.target, self.eval))  # TD_error = (r+gamma*V_next) - V_eval
-        with tf.name_scope('train'):
-            self.train_op = tf.train.RMSPropOptimizer(lr).minimize(self.loss)
+        with tf.variable_scope('squared_TD_error'):
+            self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_ - self.v)
+            self.loss = tf.square(self.td_error)  # TD_error = (r+gamma*V_next) - V_eval
+        with tf.variable_scope('train'):
+            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
 
-    def update(self, s, target):
-        _, loss = self.sess.run([self.train_op, self.loss], {self.state: s, self.target: target})
-        return loss
+    def learn(self, s, r, s_):
+        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
 
-    def evaluate(self, s):
-        return self.sess.run(self.eval, {self.state: s})[0, 0]  # return a float
+        v_ = self.sess.run(self.v, {self.s: s_})
+        td_error, _ = self.sess.run([self.td_error, self.train_op],
+                                    {self.s: s, self.v_: v_, self.r: r})
+        return td_error
 
 
 OUTPUT_GRAPH = False
 MAX_EPISODE = 3000
-EPISODE_TIME_THRESHOLD = 300
+MAX_EP_STEPS = 300
 DISPLAY_REWARD_THRESHOLD = -550  # renders environment if total episode reward is greater then this threshold
 RENDER = False  # rendering wastes time
 GAMMA = 0.9
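The new Critic.learn() makes two session calls so that v(s_) is computed first and then fed back as a fixed bootstrap target. The same semi-gradient TD(0) step, sketched with a tiny hand-rolled linear value function (all names and numbers here are illustrative, not part of the commit):

    import numpy as np

    gamma, lr = 0.9, 0.01
    w = np.zeros(3)                           # linear critic over 3 state features
    s, s_, r = np.array([1., 0., 0.]), np.array([0., 1., 0.]), -0.5

    v_next = w @ s_                           # first pass: bootstrap value, held fixed
    td_error = r + gamma * v_next - w @ s     # TD error, also returned to the actor
    w += lr * td_error * s                    # descent step on 0.5 * td_error**2 w.r.t. w
    print(td_error, w)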
@@ -131,10 +135,13 @@ def evaluate(self, s):
 env = gym.make('Pendulum-v0')
 env.seed(1)  # reproducible
 
+N_S = env.observation_space.shape[0]
+A_BOUND = env.action_space.high
+
 sess = tf.Session()
 
-actor = Actor(sess, n_features=env.observation_space.shape[0], action_range=[env.action_space.low[0], env.action_space.high[0]], lr=LR_A)
-critic = Critic(sess, n_features=env.observation_space.shape[0], lr=LR_C)
+actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND])
+critic = Critic(sess, n_features=N_S, lr=LR_C)
 
 sess.run(tf.global_variables_initializer())
 
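For Pendulum-v0 these constants come out to a 3-dimensional state and an action bound of 2 (which is also why the actor scales its mean by mu*2). A quick check, assuming gym's classic-control environments are installed:

    import gym

    env = gym.make('Pendulum-v0')
    print(env.observation_space.shape[0])  # 3 state features: cos(theta), sin(theta), theta_dot
    print(env.action_space.high)           # [2.] -> actions are clipped to [-2, 2]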
@@ -148,22 +155,18 @@ def evaluate(self, s):
     while True:
         # if RENDER:
         env.render()
-        a, mu, sigma = actor.choose_action(s)
+        a = actor.choose_action(s)
 
         s_, r, done, info = env.step(a)
         r /= 10
-        TD_target = r + GAMMA * critic.evaluate(s_)  # r + gamma * V_next
-        TD_eval = critic.evaluate(s)  # V_now
-        TD_error = TD_target - TD_eval
 
-        actor.update(s=s, a=a, adv=TD_error)
-        critic.update(s=s, target=TD_target)
+        td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
+        actor.learn(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]
 
         s = s_
         t += 1
-        # print(reward)
         ep_rs.append(r)
-        if t > EPISODE_TIME_THRESHOLD:
+        if t > MAX_EP_STEPS:
             ep_rs_sum = sum(ep_rs)
             if 'running_reward' not in globals():
                 running_reward = ep_rs_sum