
Commit 780dcd9

function approx with eligibility trace hardly works

committed · 1 parent 3c0027e · commit 780dcd9

File tree

7 files changed, +569 -344 lines changed

@@ -1,7 +1,7 @@
 """
 Actor-Critic using TD-error as the Advantage, Reinforcement Learning.
 
-The cart pole example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/CliffWalk%20Actor%20Critic%20Solution.ipynb)
+The cart pole example. Policy is oscillated.
 
 View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
 
@@ -19,86 +19,89 @@
 
 
 class Actor(object):
-    def __init__(self, n_features, n_actions, lr=0.001):
-        with tf.name_scope('inputs'):
-            self.state = tf.placeholder(tf.float32, [n_features, ], "state")
-            state = tf.expand_dims(self.state, axis=0)
-            self.act_index = tf.placeholder(tf.int32, name="act")
-            self.advantage = tf.placeholder(tf.float32, name="adv")  # TD_error
-
-        with tf.variable_scope('Actor'):
-            l1 = tf.layers.dense(
-                inputs=state,
-                units=20,  # number of hidden units
-                activation=tf.nn.tanh,
-                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
-                bias_initializer=tf.constant_initializer(0.1),  # biases
-                name='l1'
-            )
-
-            self.acts_prob = tf.layers.dense(
-                inputs=l1,
-                units=n_actions,  # output units
-                activation=tf.nn.softmax,  # get action probabilities
-                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
-                bias_initializer=tf.constant_initializer(0.1),  # biases
-                name='l2'
-            )
-
-        with tf.name_scope('loss'):
-            neg_log_prob = -tf.log(self.acts_prob[0, self.act_index])  # loss without advantage
-            self.loss = tf.reduce_mean(neg_log_prob * self.advantage)  # advantage (TD_error) guided loss
-
-        with tf.name_scope('train'):
-            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
-
-    def update(self, s, a, adv):
-        feed_dict = {self.state: s, self.act_index: a, self.advantage: adv}
-        _, loss = self.sess.run([self.train_op, self.loss], feed_dict)
-        return loss
+    def __init__(self, sess, n_features, n_actions, lr=0.001):
+        self.sess = sess
+
+        self.state = tf.placeholder(tf.float32, [1, n_features], "state")
+        self.act_index = tf.placeholder(tf.int32, name="act")
+        self.td_error = tf.placeholder(tf.float32, name="td_error")  # TD_error
+
+        l1 = tf.layers.dense(
+            inputs=self.state,
+            units=20,  # number of hidden units
+            activation=tf.nn.relu,
+            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
+            bias_initializer=tf.constant_initializer(0.1),  # biases
+            name='l1'
+        )
+
+        self.acts_prob = tf.layers.dense(
+            inputs=l1,
+            units=n_actions,  # output units
+            activation=tf.nn.softmax,  # get action probabilities
+            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
+            bias_initializer=tf.constant_initializer(0.1),  # biases
+            name='acts_prob'
+        )
+
+        with tf.variable_scope('exp_v'):
+            log_prob = tf.log(self.acts_prob[0, self.act_index])
+            self.exp_r = tf.reduce_mean(log_prob * self.td_error)  # advantage (TD_error) guided loss
+
+        with tf.variable_scope('train'):
+            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_r)  # minimize(-exp_v) = maximize(exp_v)
+
+    def update(self, s, a, td):
+        s = s[np.newaxis, :]
+        feed_dict = {self.state: s, self.act_index: a, self.td_error: td}
+        _, exp_v = self.sess.run([self.train_op, self.exp_r], feed_dict)
+        return exp_v
 
     def choose_action(self, s):
+        s = s[np.newaxis, :]
         probs = self.sess.run(self.acts_prob, {self.state: s})  # get probabilities for all actions
        return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())  # return a int
 
 
 class Critic(object):
-    def __init__(self, n_features, lr=0.01):
-        with tf.name_scope('inputs'):
-            self.state = tf.placeholder(tf.float32, [n_features, ], "state")
-            state = tf.expand_dims(self.state, axis=0)
-            self.target = tf.placeholder(dtype=tf.float32, name="target")  # TD target=r+gamma*V_next
-
-        with tf.variable_scope('Critic'):
-            l1 = tf.layers.dense(
-                inputs=state,
-                units=20,  # number of hidden units
-                activation=tf.nn.relu,
-                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
-                bias_initializer=tf.constant_initializer(0.1),  # biases
-                name='l1'
-            )
-
-            self.eval = tf.layers.dense(
-                inputs=l1,
-                units=1,  # output units
-                activation=None,
-                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
-                bias_initializer=tf.constant_initializer(0.1),  # biases
-                name='l2'
-            )
-
-        with tf.name_scope('loss'):
-            self.loss = tf.reduce_mean(tf.squared_difference(self.target, self.eval))  # TD_error = (r+gamma*V_next) - V_eval
-        with tf.name_scope('train'):
+    def __init__(self, sess, n_features, lr=0.01):
+        self.sess = sess
+
+        self.state = tf.placeholder(tf.float32, [1, n_features], "state")
+        self.v_next = tf.placeholder(tf.float32, [1, 1], name="v_next")
+        self.r = tf.placeholder(tf.float32, name='r')
+
+        l1 = tf.layers.dense(
+            inputs=self.state,
+            units=20,  # number of hidden units
+            activation=tf.nn.relu,
+            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
+            bias_initializer=tf.constant_initializer(0.1),  # biases
+            name='l1'
+        )
+
+        self.v = tf.layers.dense(
+            inputs=l1,
+            units=1,  # output units
+            activation=None,
+            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
+            bias_initializer=tf.constant_initializer(0.1),  # biases
+            name='V'
+        )
+
+        with tf.variable_scope('squared_TD_error'):
+            self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_next - self.v)
+            self.loss = tf.square(self.td_error)  # TD_error = (r+gamma*V_next) - V_eval
+        with tf.variable_scope('train'):
             self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
 
-    def update(self, s, target):
-        _, loss = self.sess.run([self.train_op, self.loss], {self.state: s, self.target: target})
-        return loss
+    def update(self, s, r, s_):
+        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
 
-    def evaluate(self, s):
-        return self.sess.run(self.eval, {self.state: s})[0, 0]  # return a float
+        v_next = self.sess.run(self.v, {self.state: s_})
+        td_error, loss, _ = self.sess.run([self.td_error, self.loss, self.train_op],
+                                          {self.state: s, self.v_next: v_next, self.r: r})
+        return td_error, loss
 
 
 OUTPUT_GRAPH = False
@@ -110,53 +113,47 @@ def evaluate(self, s):
 env = gym.make('CartPole-v0')
 env.seed(1)  # reproducible
 
-actor = Actor(n_features=env.observation_space.shape[0], n_actions=env.action_space.n, lr=0.001)
-critic = Critic(n_features=env.observation_space.shape[0], lr=0.01)  # we need a good teacher, so the teacher should learn faster than the actor
+sess = tf.Session()
 
-with tf.Session() as sess:
-    if OUTPUT_GRAPH:
-        tf.summary.FileWriter("logs/", sess.graph)
+with tf.variable_scope('Actor'):
+    actor = Actor(sess, n_features=env.observation_space.shape[0], n_actions=env.action_space.n, lr=0.001)
+with tf.variable_scope('Critic'):
+    critic = Critic(sess, n_features=env.observation_space.shape[0], lr=0.01)  # we need a good teacher, so the teacher should learn faster than the actor
 
-    actor.sess, critic.sess = sess, sess  # define the tf session
-    tf.global_variables_initializer().run()
+sess.run(tf.global_variables_initializer())
 
-    for i_episode in range(3000):
-        observation = env.reset()
-        t = 0
-        track_r = []
-        while True:
-            if RENDER: env.render()
+if OUTPUT_GRAPH:
+    tf.summary.FileWriter("logs/", sess.graph)
 
-            action = actor.choose_action(observation)
+for i_episode in range(3000):
+    s = env.reset()
+    t = 0
+    track_r = []
+    while True:
+        if RENDER: env.render()
 
-            observation_, reward, done, info = env.step(action)
+        a = actor.choose_action(s)
 
-            x, x_dot, theta, theta_dot = observation_
-            # the smaller theta and closer to center, the better
-            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.5
-            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
-            reward = r1 + r2
+        s_, r, done, info = env.step(a)
 
-            track_r.append(reward)
+        if done: r = -20
 
-            TD_target = reward + GAMMA * critic.evaluate(observation_)  # r + gamma * V_next
-            TD_eval = critic.evaluate(observation)  # V_now
-            TD_error = TD_target - TD_eval
+        track_r.append(r)
 
-            actor.update(s=observation, a=action, adv=TD_error)
-            critic.update(s=observation, target=TD_target)
+        td_error, loss = critic.update(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
+        actor.update(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]
 
-            observation = observation_
-            t += 1
+        s = s_
+        t += 1
 
-            if done or t >= EPISODE_TIME_THRESHOLD:
-                ep_rs_sum = sum(track_r)
+        if done or t >= EPISODE_TIME_THRESHOLD:
+            ep_rs_sum = sum(track_r)
 
-                if 'running_reward' not in globals():
-                    running_reward = ep_rs_sum
-                else:
-                    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
-                if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
-                print("episode:", i_episode, " reward:", int(running_reward))
-                break
+            if 'running_reward' not in globals():
+                running_reward = ep_rs_sum
+            else:
+                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
+            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
+            print("episode:", i_episode, " reward:", int(running_reward))
+            break

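For readers following the inline comments (gradient = grad[r + gamma * V(s_) - V(s)] for the critic, grad[logPi(s,a) * td_error] for the actor), here is a minimal NumPy sketch of the same one-step actor-critic update with the TD error used as the advantage. The linear value function, softmax policy, and the names theta, w, LR_ACTOR and LR_CRITIC are illustrative assumptions standing in for the tf.layers networks above; they are not code from this repository.

# Minimal one-step actor-critic with the TD error as the advantage.
# Linear value function + softmax policy are assumptions for illustration;
# they stand in for the two tf.layers networks in the diff above.
import numpy as np

GAMMA = 0.9
N_FEATURES, N_ACTIONS = 4, 2            # CartPole-v0 sizes
LR_ACTOR, LR_CRITIC = 0.001, 0.01       # critic learns faster, as in the script

rng = np.random.default_rng(0)
theta = np.zeros((N_FEATURES, N_ACTIONS))   # actor (policy) weights
w = np.zeros(N_FEATURES)                    # critic (state-value) weights

def softmax(x):
    z = np.exp(x - x.max())
    return z / z.sum()

def choose_action(s):
    return rng.choice(N_ACTIONS, p=softmax(s @ theta))

def learn(s, a, r, s_, done):
    global theta, w
    v, v_ = s @ w, (0.0 if done else s_ @ w)
    td_error = r + GAMMA * v_ - v               # delta = r + gamma * V(s_) - V(s)
    w += LR_CRITIC * td_error * s               # critic: semi-gradient TD(0) step
    probs = softmax(s @ theta)
    grad_log_pi = -np.outer(s, probs)           # d log pi(a|s) / d theta for a softmax policy
    grad_log_pi[:, a] += s
    theta += LR_ACTOR * td_error * grad_log_pi  # actor: grad[log pi(s,a)] * td_error
    return td_error

Plugging these two functions into the same s, a, r, s_ loop as above reproduces the update order the script uses: critic first, then the actor with the fresh TD error. An eligibility-trace variant, which the commit message refers to, would keep a decaying trace vector (for example z = GAMMA * LAMBDA * z + grad) for each parameter set and scale that trace, rather than the instantaneous gradient, by the TD error.
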
Reinforcement_learning_TUT/8_Actor_Critic_Advantage/AC_continue_Pendulum.py (+26 -31)
@@ -3,6 +3,8 @@
 
 The cart pole example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/CliffWalk%20Actor%20Critic%20Solution.ipynb)
 
+Cannot converge!!!
+
 View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
 
 Using:
@@ -26,50 +28,43 @@ def __init__(self, n_features, action_range, lr=0.0001):
             self.act = tf.placeholder(tf.float32, name="act")
             self.advantage = tf.placeholder(tf.float32, name="adv")  # TD_error
 
-            mu_ = tf.layers.dense(
+            l1 = tf.layers.dense(
                 inputs=state,
-                units=40,  # number of hidden units
-                activation=tf.nn.relu,
+                units=30,  # number of hidden units
+                activation=None,
                 kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                 bias_initializer=tf.constant_initializer(0.1),  # biases
-                name='mu_'
+                name='l1'
             )
 
             mu = tf.layers.dense(
-                inputs=mu_,
+                inputs=l1,
                 units=1,  # number of hidden units
-                activation=None,
+                activation=tf.nn.tanh,
                 kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                 bias_initializer=tf.constant_initializer(0.1),  # biases
                 name='mu'
             )
 
-            sigma_ = tf.layers.dense(
-                inputs=state,
-                units=40,  # output units
-                activation=None,  # get action probabilities
-                kernel_initializer=tf.random_normal_initializer(0., .3),  # weights
-                bias_initializer=tf.constant_initializer(0.3),  # biases
-                name='sigma_'
-            )
             sigma = tf.layers.dense(
-                inputs=sigma_,
+                inputs=l1,
                 units=1,  # output units
-                activation=tf.nn.softplus,  # get action probabilities
-                kernel_initializer=tf.random_normal_initializer(0., .3),  # weights
-                bias_initializer=tf.constant_initializer(.5),  # biases
+                activation=tf.nn.relu,  # get action probabilities
+                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
+                bias_initializer=tf.constant_initializer(1.),  # biases
                 name='sigma'
             )
 
-        self.mu, self.sigma = tf.squeeze(mu), tf.squeeze(sigma+1e-1)
+        self.mu, self.sigma = tf.squeeze(mu*2), tf.squeeze(sigma+1e-2)
         self.normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma)
 
         self.action = tf.clip_by_value(self.normal_dist.sample(1), action_range[0], action_range[1])
 
         with tf.name_scope('loss'):
             neg_log_prob = -self.normal_dist.log_prob(self.act)  # loss without advantage
-            self.loss = tf.reduce_mean(neg_log_prob * self.advantage)  # advantage (TD_error) guided loss
-            self.loss -= 3e-1 * self.normal_dist.entropy()
+            self.loss = neg_log_prob * self.advantage  # advantage (TD_error) guided loss
+            # Add cross entropy cost to encourage exploration
+            self.loss -= 1e-1 * self.normal_dist.entropy()
 
         with tf.name_scope('train'):
             self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
@@ -93,8 +88,8 @@ def __init__(self, n_features, lr=0.01):
         with tf.variable_scope('Critic'):
             l1 = tf.layers.dense(
                 inputs=state,
-                units=40,  # number of hidden units
-                activation=tf.nn.relu,  # open end
+                units=30,  # number of hidden units
+                activation=None,
                 kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                 bias_initializer=tf.constant_initializer(0.1),  # biases
                 name='l1'
@@ -112,7 +107,7 @@ def __init__(self, n_features, lr=0.01):
         with tf.name_scope('loss'):
             self.loss = tf.reduce_mean(tf.squared_difference(self.target, self.eval))  # TD_error = (r+gamma*V_next) - V_eval
         with tf.name_scope('train'):
-            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
+            self.train_op = tf.train.RMSPropOptimizer(lr).minimize(self.loss)
 
     def update(self, s, target):
         _, loss = self.sess.run([self.train_op, self.loss], {self.state: s, self.target: target})
@@ -123,16 +118,16 @@ def evaluate(self, s):
 
 
 OUTPUT_GRAPH = False
-EPISODE_TIME_THRESHOLD = 100
+EPISODE_TIME_THRESHOLD = 300
 DISPLAY_REWARD_THRESHOLD = -550  # renders environment if total episode reward is greater then this threshold
 RENDER = False  # rendering wastes time
 GAMMA = 0.9
 
 env = gym.make('Pendulum-v0')
 # env.seed(1)  # reproducible
 
-actor = Actor(n_features=env.observation_space.shape[0], action_range=[env.action_space.low[0], env.action_space.high[0]], lr=0.0001)
-critic = Critic(n_features=env.observation_space.shape[0], lr=0.0001)
+actor = Actor(n_features=env.observation_space.shape[0], action_range=[env.action_space.low[0], env.action_space.high[0]], lr=0.001)
+critic = Critic(n_features=env.observation_space.shape[0], lr=0.002)
 
 with tf.Session() as sess:
     if OUTPUT_GRAPH:
@@ -146,12 +141,12 @@ def evaluate(self, s):
         t = 0
         ep_rs = []
         while True:
-            if RENDER: env.render()
+            # if RENDER:
+            env.render()
             action, mu, sigma = actor.choose_action(observation)
 
             observation_, reward, done, info = env.step(action)
-            # if reward > -2: reward = (reward+5)*2
-
+            reward /= 10
             TD_target = reward + GAMMA * critic.evaluate(observation_)  # r + gamma * V_next
             TD_eval = critic.evaluate(observation)  # V_now
             TD_error = TD_target - TD_eval
@@ -168,7 +163,7 @@ def evaluate(self, s):
                 if 'running_reward' not in globals():
                     running_reward = ep_rs_sum
                 else:
-                    running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
+                    running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
                 if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
                 print("episode:", i_episode, " reward:", int(running_reward))
                 break

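As a sanity check on the Gaussian-policy loss assembled above (neg_log_prob * advantage minus an entropy bonus), here is a small NumPy stand-in for the tf.contrib.distributions.Normal terms it relies on. The concrete mu, sigma, advantage and seed values below are made up purely for illustration.

# Plain NumPy stand-in for the Normal-distribution terms in the actor loss:
# loss = -log N(action | mu, sigma) * advantage - 0.1 * entropy(N(mu, sigma)).
import numpy as np

def gaussian_log_prob(a, mu, sigma):
    # log density of N(mu, sigma**2) evaluated at a
    return -0.5 * np.log(2.0 * np.pi * sigma ** 2) - (a - mu) ** 2 / (2.0 * sigma ** 2)

def gaussian_entropy(sigma):
    # differential entropy of N(mu, sigma**2); independent of mu
    return 0.5 * np.log(2.0 * np.pi * np.e * sigma ** 2)

rng = np.random.default_rng(1)
mu, sigma = 0.4, 1.0 + 1e-2        # e.g. a tanh mean scaled by 2, a relu sigma shifted by 1e-2
advantage = -0.3                   # TD error supplied by the critic (made up here)

action = np.clip(rng.normal(mu, sigma), -2.0, 2.0)   # Pendulum-v0 torque range
neg_log_prob = -gaussian_log_prob(action, mu, sigma)
loss = neg_log_prob * advantage - 1e-1 * gaussian_entropy(sigma)
print("action:", action, "loss:", loss)

Lowering the entropy weight from 3e-1 to 1e-1 reduces the exploration pressure; as the "Cannot converge!!!" note added to the docstring indicates, this variant still struggles on Pendulum.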