"""
Actor-Critic using TD-error as the Advantage, Reinforcement Learning.

- The cart pole example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/CliffWalk%20Actor%20Critic%20Solution.ipynb)
+ The cart pole example. The policy oscillates.

View more on [莫烦Python]: https://morvanzhou.github.io/tutorials/



class Actor(object):
-     def __init__(self, n_features, n_actions, lr=0.001):
-         with tf.name_scope('inputs'):
-             self.state = tf.placeholder(tf.float32, [n_features, ], "state")
-             state = tf.expand_dims(self.state, axis=0)
-             self.act_index = tf.placeholder(tf.int32, name="act")
-             self.advantage = tf.placeholder(tf.float32, name="adv")  # TD_error
-
-         with tf.variable_scope('Actor'):
-             l1 = tf.layers.dense(
-                 inputs=state,
-                 units=20,    # number of hidden units
-                 activation=tf.nn.tanh,
-                 kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
-                 bias_initializer=tf.constant_initializer(0.1),  # biases
-                 name='l1'
-             )
-
-             self.acts_prob = tf.layers.dense(
-                 inputs=l1,
-                 units=n_actions,    # output units
-                 activation=tf.nn.softmax,    # get action probabilities
-                 kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
-                 bias_initializer=tf.constant_initializer(0.1),  # biases
-                 name='l2'
-             )
-
-         with tf.name_scope('loss'):
-             neg_log_prob = -tf.log(self.acts_prob[0, self.act_index])  # loss without advantage
-             self.loss = tf.reduce_mean(neg_log_prob * self.advantage)  # advantage (TD_error) guided loss
-
-         with tf.name_scope('train'):
-             self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
-
-     def update(self, s, a, adv):
-         feed_dict = {self.state: s, self.act_index: a, self.advantage: adv}
-         _, loss = self.sess.run([self.train_op, self.loss], feed_dict)
-         return loss
+     def __init__(self, sess, n_features, n_actions, lr=0.001):
+         self.sess = sess
+
+         self.state = tf.placeholder(tf.float32, [1, n_features], "state")
+         self.act_index = tf.placeholder(tf.int32, name="act")
+         self.td_error = tf.placeholder(tf.float32, name="td_error")  # TD_error
+
+         l1 = tf.layers.dense(
+             inputs=self.state,
+             units=20,    # number of hidden units
+             activation=tf.nn.relu,
+             kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
+             bias_initializer=tf.constant_initializer(0.1),  # biases
+             name='l1'
+         )
+
+         self.acts_prob = tf.layers.dense(
+             inputs=l1,
+             units=n_actions,    # output units
+             activation=tf.nn.softmax,    # get action probabilities
+             kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
+             bias_initializer=tf.constant_initializer(0.1),  # biases
+             name='acts_prob'
+         )
+
+         with tf.variable_scope('exp_v'):
+             log_prob = tf.log(self.acts_prob[0, self.act_index])
+             self.exp_r = tf.reduce_mean(log_prob * self.td_error)  # advantage (TD_error) guided loss
+
+         with tf.variable_scope('train'):
+             self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_r)  # minimize(-exp_v) = maximize(exp_v)
+
+     def update(self, s, a, td):
+         s = s[np.newaxis, :]
+         feed_dict = {self.state: s, self.act_index: a, self.td_error: td}
+         _, exp_v = self.sess.run([self.train_op, self.exp_r], feed_dict)
+         return exp_v

    def choose_action(self, s):
+         s = s[np.newaxis, :]
        probs = self.sess.run(self.acts_prob, {self.state: s})   # get probabilities for all actions
        return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())   # return an int


class Critic(object):
-     def __init__(self, n_features, lr=0.01):
-         with tf.name_scope('inputs'):
-             self.state = tf.placeholder(tf.float32, [n_features, ], "state")
-             state = tf.expand_dims(self.state, axis=0)
-             self.target = tf.placeholder(dtype=tf.float32, name="target")  # TD target = r + gamma * V_next
-
-         with tf.variable_scope('Critic'):
-             l1 = tf.layers.dense(
-                 inputs=state,
-                 units=20,  # number of hidden units
-                 activation=tf.nn.relu,
-                 kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
-                 bias_initializer=tf.constant_initializer(0.1),  # biases
-                 name='l1'
-             )
-
-             self.eval = tf.layers.dense(
-                 inputs=l1,
-                 units=1,  # output units
-                 activation=None,
-                 kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
-                 bias_initializer=tf.constant_initializer(0.1),  # biases
-                 name='l2'
-             )
-
-         with tf.name_scope('loss'):
-             self.loss = tf.reduce_mean(tf.squared_difference(self.target, self.eval))  # TD_error = (r+gamma*V_next) - V_eval
-         with tf.name_scope('train'):
+     def __init__(self, sess, n_features, lr=0.01):
+         self.sess = sess
+
+         self.state = tf.placeholder(tf.float32, [1, n_features], "state")
+         self.v_next = tf.placeholder(tf.float32, [1, 1], name="v_next")
+         self.r = tf.placeholder(tf.float32, name='r')
+
+         l1 = tf.layers.dense(
+             inputs=self.state,
+             units=20,  # number of hidden units
+             activation=tf.nn.relu,
+             kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
+             bias_initializer=tf.constant_initializer(0.1),  # biases
+             name='l1'
+         )
+
+         self.v = tf.layers.dense(
+             inputs=l1,
+             units=1,  # output units
+             activation=None,
+             kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
+             bias_initializer=tf.constant_initializer(0.1),  # biases
+             name='V'
+         )
+
+         with tf.variable_scope('squared_TD_error'):
+             self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_next - self.v)
+             self.loss = tf.square(self.td_error)  # TD_error = (r + gamma * V_next) - V_eval
+         with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

-     def update(self, s, target):
-         _, loss = self.sess.run([self.train_op, self.loss], {self.state: s, self.target: target})
-         return loss
+     def update(self, s, r, s_):
+         s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

-     def evaluate(self, s):
-         return self.sess.run(self.eval, {self.state: s})[0, 0]  # return a float
+         v_next = self.sess.run(self.v, {self.state: s_})
+         td_error, loss, _ = self.sess.run([self.td_error, self.loss, self.train_op],
+                                           {self.state: s, self.v_next: v_next, self.r: r})
+         return td_error, loss


OUTPUT_GRAPH = False
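The rewritten Actor and Critic above implement one-step actor-critic with the TD error standing in for the advantage. As a quick reference, here is a minimal, framework-free sketch of the same arithmetic; it is illustrative only, and `gamma`, `r`, `v_s`, `v_s_next`, and `action_probs` are assumed example names rather than objects from this file:

import numpy as np

gamma = 0.9  # assumed discount factor for the example

def td_error(r, v_s, v_s_next):
    # delta = r + gamma * V(s') - V(s); the Critic minimizes delta ** 2
    return r + gamma * v_s_next - v_s

def actor_objective(action_probs, a, delta):
    # the Actor maximizes log pi(a|s) * delta, so the graph minimizes its negative
    return np.log(action_probs[a]) * delta

delta = td_error(r=1.0, v_s=0.5, v_s_next=0.8)                   # 1.22
critic_loss = delta ** 2                                          # squared TD error
exp_v = actor_objective(np.array([0.4, 0.6]), a=1, delta=delta)   # log(0.6) * 1.22

With the larger critic learning rate used below (0.01 versus 0.001 for the actor), the value estimate settles faster than the policy it guides, which is the point of the "good teacher" comment in the diff.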
@@ -110,53 +113,47 @@ def evaluate(self, s):
env = gym.make('CartPole-v0')
env.seed(1)  # reproducible

- actor = Actor(n_features=env.observation_space.shape[0], n_actions=env.action_space.n, lr=0.001)
- critic = Critic(n_features=env.observation_space.shape[0], lr=0.01)  # we need a good teacher, so the teacher should learn faster than the actor
+ sess = tf.Session()

- with tf.Session() as sess:
-     if OUTPUT_GRAPH:
-         tf.summary.FileWriter("logs/", sess.graph)
+ with tf.variable_scope('Actor'):
+     actor = Actor(sess, n_features=env.observation_space.shape[0], n_actions=env.action_space.n, lr=0.001)
+ with tf.variable_scope('Critic'):
+     critic = Critic(sess, n_features=env.observation_space.shape[0], lr=0.01)  # we need a good teacher, so the teacher should learn faster than the actor

-     actor.sess, critic.sess = sess, sess  # define the tf session
-     tf.global_variables_initializer().run()
+ sess.run(tf.global_variables_initializer())

-     for i_episode in range(3000):
-         observation = env.reset()
-         t = 0
-         track_r = []
-         while True:
-             if RENDER: env.render()
+ if OUTPUT_GRAPH:
+     tf.summary.FileWriter("logs/", sess.graph)

-             action = actor.choose_action(observation)
+ for i_episode in range(3000):
+     s = env.reset()
+     t = 0
+     track_r = []
+     while True:
+         if RENDER: env.render()

-             observation_, reward, done, info = env.step(action)
+         a = actor.choose_action(s)

-             x, x_dot, theta, theta_dot = observation_
-             # the smaller theta and closer to center, the better
-             r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.5
-             r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
-             reward = r1 + r2
+         s_, r, done, info = env.step(a)

-             track_r.append(reward)
+         if done: r = -20

-             TD_target = reward + GAMMA * critic.evaluate(observation_)  # r + gamma * V_next
-             TD_eval = critic.evaluate(observation)  # V_now
-             TD_error = TD_target - TD_eval
+         track_r.append(r)

-             actor.update(s=observation, a=action, adv=TD_error)
-             critic.update(s=observation, target=TD_target)
+         td_error, loss = critic.update(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
+         actor.update(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]

-             observation = observation_
-             t += 1
+         s = s_
+         t += 1

-             if done or t >= EPISODE_TIME_THRESHOLD:
-                 ep_rs_sum = sum(track_r)
+         if done or t >= EPISODE_TIME_THRESHOLD:
+             ep_rs_sum = sum(track_r)

-                 if 'running_reward' not in globals():
-                     running_reward = ep_rs_sum
-                 else:
-                     running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
-                 if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
-                 print("episode:", i_episode, " reward:", int(running_reward))
-                 break
+             if 'running_reward' not in globals():
+                 running_reward = ep_rs_sum
+             else:
+                 running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
+             if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
+             print("episode:", i_episode, " reward:", int(running_reward))
+             break

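A side note on the loop above: `critic.update(s, r, s_)` evaluates and bootstraps from V(s_) on every step, including the terminal one, and the fixed `r = -20` on `done` is what encodes failure. Some implementations instead mask the bootstrap at terminal states; a one-line sketch under that assumption (`done`, `v`, `v_next` are illustrative names, not variables from this file):

delta = r + GAMMA * v_next * (1.0 - float(done)) - v  # no bootstrapping past a terminal state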