Commit 600e477 ("edit")

1 parent da50a43, commit 600e477

5 files changed (+146 -173 lines)

Reinforcement_learning_TUT/8_Actor_Critic_Advantage/AC_CartPole.py (+66 -63)
@@ -22,88 +22,90 @@ class Actor(object):
     def __init__(self, sess, n_features, n_actions, lr=0.001):
         self.sess = sess

-        self.state = tf.placeholder(tf.float32, [1, n_features], "state")
-        self.act_index = tf.placeholder(tf.int32, name="act")
-        self.td_error = tf.placeholder(tf.float32, name="td_error")  # TD_error
-
-        l1 = tf.layers.dense(
-            inputs=self.state,
-            units=20,  # number of hidden units
-            activation=tf.nn.relu,
-            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
-            bias_initializer=tf.constant_initializer(0.1),  # biases
-            name='l1'
-        )
-
-        self.acts_prob = tf.layers.dense(
-            inputs=l1,
-            units=n_actions,  # output units
-            activation=tf.nn.softmax,  # get action probabilities
-            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
-            bias_initializer=tf.constant_initializer(0.1),  # biases
-            name='acts_prob'
-        )
+        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
+        self.a = tf.placeholder(tf.int32, None, "act")
+        self.td_error = tf.placeholder(tf.float32, None, "td_error")  # TD_error
+
+        with tf.variable_scope('Actor'):
+            l1 = tf.layers.dense(
+                inputs=self.s,
+                units=20,  # number of hidden units
+                activation=tf.nn.relu,
+                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
+                bias_initializer=tf.constant_initializer(0.1),  # biases
+                name='l1'
+            )
+
+            self.acts_prob = tf.layers.dense(
+                inputs=l1,
+                units=n_actions,  # output units
+                activation=tf.nn.softmax,  # get action probabilities
+                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
+                bias_initializer=tf.constant_initializer(0.1),  # biases
+                name='acts_prob'
+            )

         with tf.variable_scope('exp_v'):
-            log_prob = tf.log(self.acts_prob[0, self.act_index])
-            self.exp_r = tf.reduce_mean(log_prob * self.td_error)  # advantage (TD_error) guided loss
+            log_prob = tf.log(self.acts_prob[0, self.a])
+            self.exp_v = tf.reduce_mean(log_prob * self.td_error)  # advantage (TD_error) guided loss

         with tf.variable_scope('train'):
-            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_r)  # minimize(-exp_v) = maximize(exp_v)
+            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v)  # minimize(-exp_v) = maximize(exp_v)

-    def update(self, s, a, td):
+    def learn(self, s, a, td):
         s = s[np.newaxis, :]
-        feed_dict = {self.state: s, self.act_index: a, self.td_error: td}
-        _, exp_v = self.sess.run([self.train_op, self.exp_r], feed_dict)
+        feed_dict = {self.s: s, self.a: a, self.td_error: td}
+        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
         return exp_v

     def choose_action(self, s):
         s = s[np.newaxis, :]
-        probs = self.sess.run(self.acts_prob, {self.state: s})  # get probabilities for all actions
+        probs = self.sess.run(self.acts_prob, {self.s: s})  # get probabilities for all actions
         return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())  # return a int


 class Critic(object):
     def __init__(self, sess, n_features, lr=0.01):
         self.sess = sess

-        self.state = tf.placeholder(tf.float32, [1, n_features], "state")
-        self.v_next = tf.placeholder(tf.float32, [1, 1], name="v_next")
-        self.r = tf.placeholder(tf.float32, name='r')
-
-        l1 = tf.layers.dense(
-            inputs=self.state,
-            units=20,  # number of hidden units
-            activation=tf.nn.relu,  # None
-            # have to be linear to make sure the convergence of actor.
-            # But linear approximator seems hardly learns the correct Q.
-            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
-            bias_initializer=tf.constant_initializer(0.1),  # biases
-            name='l1'
-        )
-
-        self.v = tf.layers.dense(
-            inputs=l1,
-            units=1,  # output units
-            activation=None,
-            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
-            bias_initializer=tf.constant_initializer(0.1),  # biases
-            name='V'
-        )
+        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
+        self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
+        self.r = tf.placeholder(tf.float32, None, 'r')
+
+        with tf.variable_scope('Critic'):
+            l1 = tf.layers.dense(
+                inputs=self.s,
+                units=20,  # number of hidden units
+                activation=tf.nn.relu,  # None
+                # have to be linear to make sure the convergence of actor.
+                # But linear approximator seems hardly learns the correct Q.
+                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
+                bias_initializer=tf.constant_initializer(0.1),  # biases
+                name='l1'
+            )
+
+            self.v = tf.layers.dense(
+                inputs=l1,
+                units=1,  # output units
+                activation=None,
+                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
+                bias_initializer=tf.constant_initializer(0.1),  # biases
+                name='V'
+            )

         with tf.variable_scope('squared_TD_error'):
-            self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_next - self.v)
+            self.td_error = self.r + GAMMA * self.v_ - self.v
             self.loss = tf.square(self.td_error)  # TD_error = (r+gamma*V_next) - V_eval
         with tf.variable_scope('train'):
             self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

-    def update(self, s, r, s_):
+    def learn(self, s, r, s_):
         s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

-        v_next = self.sess.run(self.v, {self.state: s_})
-        td_error, loss, _ = self.sess.run([self.td_error, self.loss, self.train_op],
-                                          {self.state: s, self.v_next: v_next, self.r: r})
-        return td_error, loss
+        v_ = self.sess.run(self.v, {self.s: s_})
+        td_error, _ = self.sess.run([self.td_error, self.train_op],
+                                    {self.s: s, self.v_: v_, self.r: r})
+        return td_error

@@ -121,12 +123,13 @@ def update(self, s, r, s_):
 env = gym.make('CartPole-v0')
 env.seed(1)  # reproducible

+N_F = env.observation_space.shape[0]
+N_A = env.action_space.n
+
 sess = tf.Session()

-with tf.variable_scope('Actor'):
-    actor = Actor(sess, n_features=env.observation_space.shape[0], n_actions=env.action_space.n, lr=LR_A)
-with tf.variable_scope('Critic'):
-    critic = Critic(sess, n_features=env.observation_space.shape[0], lr=LR_C)  # we need a good teacher, so the teacher should learn faster than the actor
+actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
+critic = Critic(sess, n_features=N_F, lr=LR_C)  # we need a good teacher, so the teacher should learn faster than the actor

 sess.run(tf.global_variables_initializer())
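One effect of this commit is that the tf.variable_scope('Actor') / tf.variable_scope('Critic') wrappers move from the call site into the class constructors, so layer variables still get their prefixed names without the caller doing anything. A minimal TensorFlow 1.x sketch with a hypothetical 4-feature input, just to illustrate the naming:

import tensorflow as tf  # TensorFlow 1.x, as used in the repo

s = tf.placeholder(tf.float32, [1, 4], "state")      # hypothetical state placeholder
with tf.variable_scope('Actor'):                      # scope now lives inside __init__
    l1 = tf.layers.dense(s, 20, tf.nn.relu, name='l1')
print([v.name for v in tf.trainable_variables()])     # ['Actor/l1/kernel:0', 'Actor/l1/bias:0']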

@@ -148,8 +151,8 @@ def update(self, s, r, s_):

         track_r.append(r)

-        td_error, loss = critic.update(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
-        actor.update(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]
+        td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
+        actor.learn(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]

         s = s_
         t += 1
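A rough numeric sketch of what one critic.learn / actor.learn step above computes; the transition values and action probability are made up for illustration, not taken from the repo:

import numpy as np

GAMMA = 0.9
v_s, v_s_next, r = 1.0, 1.2, 1.0        # hypothetical critic outputs V(s), V(s_) and reward
td_error = r + GAMMA * v_s_next - v_s   # advantage estimate returned by Critic.learn
critic_loss = td_error ** 2             # squared TD error the critic minimizes
log_prob = np.log(0.6)                  # hypothetical pi(a|s) of the chosen action
actor_objective = log_prob * td_error   # exp_v the actor maximizes (its train op minimizes -exp_v)
print(td_error, critic_loss, actor_objective)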

Reinforcement_learning_TUT/8_Actor_Critic_Advantage/AC_continue_Pendulum.py (+53 -50)
@@ -1,7 +1,7 @@
 """
 Actor-Critic with continuous action using TD-error as the Advantage, Reinforcement Learning.

-The cart pole example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/CliffWalk%20Actor%20Critic%20Solution.ipynb)
+The cart pole example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb)

 Cannot converge!!!

@@ -21,18 +21,17 @@


 class Actor(object):
-    def __init__(self, sess, n_features, action_range, lr=0.0001):
+    def __init__(self, sess, n_features, action_bound, lr=0.0001):
         self.sess = sess
-        with tf.name_scope('inputs'):
-            self.state = tf.placeholder(tf.float32, [n_features, ], "state")
-            state = tf.expand_dims(self.state, axis=0)
-            self.act = tf.placeholder(tf.float32, name="act")
-            self.advantage = tf.placeholder(tf.float32, name="adv")  # TD_error
+
+        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
+        self.a = tf.placeholder(tf.float32, None, name="act")
+        self.td_error = tf.placeholder(tf.float32, None, name="td_error")  # TD_error

         l1 = tf.layers.dense(
-            inputs=state,
+            inputs=self.s,
             units=30,  # number of hidden units
-            activation=None,
+            activation=tf.nn.relu,
             kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
             bias_initializer=tf.constant_initializer(0.1),  # biases
             name='l1'
@@ -50,78 +49,83 @@ def __init__(self, sess, n_features, action_range, lr=0.0001):
         sigma = tf.layers.dense(
             inputs=l1,
             units=1,  # output units
-            activation=tf.nn.relu,  # get action probabilities
+            activation=tf.nn.sigmoid,  # get action probabilities
             kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
             bias_initializer=tf.constant_initializer(1.),  # biases
             name='sigma'
         )
-
-        self.mu, self.sigma = tf.squeeze(mu*2), tf.squeeze(sigma+1e-2)
+        global_step = tf.Variable(0, trainable=False)
+        # self.e = epsilon = tf.train.exponential_decay(2., global_step, 1000, 0.9)
+        self.mu, self.sigma = tf.squeeze(mu*2), tf.squeeze(sigma+0.1)
         self.normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma)

-        self.action = tf.clip_by_value(self.normal_dist.sample(1), action_range[0], action_range[1])
+        self.action = tf.clip_by_value(self.normal_dist.sample(1), action_bound[0], action_bound[1])

-        with tf.name_scope('loss'):
-            neg_log_prob = -self.normal_dist.log_prob(self.act)  # loss without advantage
-            self.loss = neg_log_prob * self.advantage  # advantage (TD_error) guided loss
+        with tf.name_scope('exp_v'):
+            log_prob = self.normal_dist.log_prob(self.a)  # loss without advantage
+            self.exp_v = log_prob * self.td_error  # advantage (TD_error) guided loss
             # Add cross entropy cost to encourage exploration
-            self.loss -= 1e-1 * self.normal_dist.entropy()
+            self.exp_v += self.normal_dist.entropy()

         with tf.name_scope('train'):
-            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
+            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step)  # min(v) = max(-v)

-    def update(self, s, a, adv):
-        feed_dict = {self.state: s, self.act: a, self.advantage: adv}
-        _, loss = self.sess.run([self.train_op, self.loss], feed_dict)
-        return loss
+    def learn(self, s, a, td):
+        s = s[np.newaxis, :]
+        feed_dict = {self.s: s, self.a: a, self.td_error: td}
+        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
+        return exp_v

     def choose_action(self, s):
-        return self.sess.run([self.action, self.mu, self.sigma], {self.state: s})  # get probabilities for all actions
+        s = s[np.newaxis, :]
+        return self.sess.run(self.action, {self.s: s})  # get probabilities for all actions


 class Critic(object):
     def __init__(self, sess, n_features, lr=0.01):
         self.sess = sess
         with tf.name_scope('inputs'):
-            self.state = tf.placeholder(tf.float32, [n_features, ], "state")
-            state = tf.expand_dims(self.state, axis=0)
-            self.target = tf.placeholder(dtype=tf.float32, name="target")  # TD target=r+gamma*V_next
+            self.s = tf.placeholder(tf.float32, [1, n_features], "state")
+            self.v_ = tf.placeholder(tf.float32, [1, 1], name="v_next")
+            self.r = tf.placeholder(tf.float32, name='r')

         with tf.variable_scope('Critic'):
             l1 = tf.layers.dense(
-                inputs=state,
+                inputs=self.s,
                 units=30,  # number of hidden units
-                activation=None,
+                activation=tf.nn.relu,
                 kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                 bias_initializer=tf.constant_initializer(0.1),  # biases
                 name='l1'
             )

-            self.eval = tf.layers.dense(
+            self.v = tf.layers.dense(
                 inputs=l1,
                 units=1,  # output units
                 activation=None,
                 kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                 bias_initializer=tf.constant_initializer(0.1),  # biases
-                name='l2'
+                name='V'
             )

-        with tf.name_scope('loss'):
-            self.loss = tf.reduce_mean(tf.squared_difference(self.target, self.eval))  # TD_error = (r+gamma*V_next) - V_eval
-        with tf.name_scope('train'):
-            self.train_op = tf.train.RMSPropOptimizer(lr).minimize(self.loss)
+        with tf.variable_scope('squared_TD_error'):
+            self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_ - self.v)
+            self.loss = tf.square(self.td_error)  # TD_error = (r+gamma*V_next) - V_eval
+        with tf.variable_scope('train'):
+            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

-    def update(self, s, target):
-        _, loss = self.sess.run([self.train_op, self.loss], {self.state: s, self.target: target})
-        return loss
+    def learn(self, s, r, s_):
+        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

-    def evaluate(self, s):
-        return self.sess.run(self.eval, {self.state: s})[0, 0]  # return a float
+        v_ = self.sess.run(self.v, {self.s: s_})
+        td_error, _ = self.sess.run([self.td_error, self.train_op],
+                                    {self.s: s, self.v_: v_, self.r: r})
+        return td_error


 OUTPUT_GRAPH = False
 MAX_EPISODE = 3000
-EPISODE_TIME_THRESHOLD = 300
+MAX_EP_STEPS = 300
 DISPLAY_REWARD_THRESHOLD = -550  # renders environment if total episode reward is greater then this threshold
 RENDER = False  # rendering wastes time
 GAMMA = 0.9
@@ -131,10 +135,13 @@ def evaluate(self, s):
 env = gym.make('Pendulum-v0')
 env.seed(1)  # reproducible

+N_S = env.observation_space.shape[0]
+A_BOUND = env.action_space.high
+
 sess = tf.Session()

-actor = Actor(sess, n_features=env.observation_space.shape[0], action_range=[env.action_space.low[0], env.action_space.high[0]], lr=LR_A)
-critic = Critic(sess, n_features=env.observation_space.shape[0], lr=LR_C)
+actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND])
+critic = Critic(sess, n_features=N_S, lr=LR_C)

 sess.run(tf.global_variables_initializer())

@@ -148,22 +155,18 @@ def evaluate(self, s):
     while True:
         # if RENDER:
         env.render()
-        a, mu, sigma = actor.choose_action(s)
+        a = actor.choose_action(s)

         s_, r, done, info = env.step(a)
         r /= 10
-        TD_target = r + GAMMA * critic.evaluate(s_)  # r + gamma * V_next
-        TD_eval = critic.evaluate(s)  # V_now
-        TD_error = TD_target - TD_eval

-        actor.update(s=s, a=a, adv=TD_error)
-        critic.update(s=s, target=TD_target)
+        td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
+        actor.learn(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]

         s = s_
         t += 1
-        # print(reward)
         ep_rs.append(r)
-        if t > EPISODE_TIME_THRESHOLD:
+        if t > MAX_EP_STEPS:
            ep_rs_sum = sum(ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
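For the continuous-action case, the refactored Actor builds a Normal(mu, sigma) policy, samples and clips the action to the bounds, and maximizes log_prob * td_error plus an entropy bonus. A rough NumPy sketch of that computation with made-up numbers (not taken from the repo):

import numpy as np

mu, sigma = 0.4, 0.3                                   # hypothetical squeezed outputs (sigma + 0.1 keeps it positive)
a_bound = 2.0                                          # Pendulum-v0 action bound
a = np.clip(np.random.normal(mu, sigma), -a_bound, a_bound)  # sample, then clip as tf.clip_by_value does
log_prob = -0.5 * np.log(2 * np.pi * sigma ** 2) - (a - mu) ** 2 / (2 * sigma ** 2)  # Gaussian log-density
entropy = 0.5 * np.log(2 * np.pi * np.e * sigma ** 2)  # Gaussian entropy, the exploration bonus
td_error = 0.7                                         # hypothetical advantage from the critic
exp_v = log_prob * td_error + entropy                  # objective the actor maximizes (train op minimizes -exp_v)
print(a, exp_v)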

Reinforcement_learning_TUT/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG.py (+2 -2)
@@ -68,11 +68,11 @@ def set_grad_from_critic(self, a_grads):
             # xs = policy's parameters;
             # self.a_grads = the gradients of the policy to get more Q
             # tf.gradients will calculate dys/dxs with a initial gradients for ys, so this is dq/da * da/dparams
-            self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)
+            self.policy_grads_and_vars = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)

         with tf.variable_scope('A_train'):
             opt = tf.train.AdamOptimizer(-self.lr)  # (- learning rate) for ascent policy
-            self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))
+            self.train_op = opt.apply_gradients(zip(self.policy_grads_and_vars, self.e_params))


############################### Critic ####################################
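The rename above sits on the deterministic policy gradient trick: tf.gradients seeded with grad_ys chains the critic's dQ/da through the actor network, and the negative learning rate turns the minimizer into gradient ascent. A toy TensorFlow 1.x sketch (the names theta, a, dq_da are illustrative, not from DDPG.py):

import tensorflow as tf  # TensorFlow 1.x, as used in the repo

theta = tf.Variable([1.0, 2.0], name='policy_params')  # stand-in for e_params
a = 3.0 * tf.reduce_sum(theta)                         # toy "action" a(theta)
dq_da = tf.constant(0.5)                               # pretend critic gradient dQ/da
# grad_ys seeds d(a)/d(theta) with dQ/da, so the result is dQ/da * da/dtheta
policy_grads = tf.gradients(ys=a, xs=[theta], grad_ys=dq_da)
train_op = tf.train.AdamOptimizer(-0.001).apply_gradients(zip(policy_grads, [theta]))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(policy_grads))  # [array([1.5, 1.5], dtype=float32)]
    sess.run(train_op)             # ascends Q because of the negative learning rate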
