
Commit ae199e7

1 parent 1b9cb6f commit ae199e7

4 files changed, +17 -17 lines changed


Reinforcement_learning_TUT/7_Policy_gradient_softmax/RL_brain.py

+9 -9

@@ -52,25 +52,25 @@ def _build_net(self):
         layer = tf.layers.dense(
             inputs=self.tf_obs,
             units=10,
-            activation=,
-            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.2),
-            bias_initializer=tf.constant_initializer(0.01),
+            activation=tf.nn.tanh,  # tanh activation
+            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
+            bias_initializer=tf.constant_initializer(0.1),
             name='fc1'
         )
         # fc2
-        self.all_act = tf.layers.dense(
+        all_act = tf.layers.dense(
             inputs=layer,
             units=self.n_actions,
-            activation=tf.nn.softmax,
-            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.2),
-            bias_initializer=tf.constant_initializer(0.01),
+            activation=None,
+            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
+            bias_initializer=tf.constant_initializer(0.1),
             name='fc2'
         )

-        self.all_act_prob = tf.nn.softmax(self.all_act)  # convert to probability
+        self.all_act_prob = tf.nn.softmax(all_act, name='act_prob')  # use softmax to convert to probability

         with tf.name_scope('loss'):
-            log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.all_act, labels=self.tf_acts)  # this is negative log
+            log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=all_act, labels=self.tf_acts)  # this is negative log
             loss = tf.reduce_mean(log_prob * self.tf_vt)  # reward guided loss

         with tf.name_scope('train'):

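What the RL_brain.py hunk amounts to: fc1 now uses a tanh activation with a wider initializer, fc2 emits raw logits instead of applying softmax itself, the softmax is applied separately to produce self.all_act_prob for action sampling, and the loss feeds the logits into tf.nn.sparse_softmax_cross_entropy_with_logits, which returns -log pi(a_t | s_t) for the chosen action, weighted by the discounted return self.tf_vt. A minimal standalone sketch of that graph under the TF 1.x API (the placeholder names obs, acts, vt and the sizes 4 and 2 are illustrative, not taken from the file):

import tensorflow as tf

n_features, n_actions = 4, 2                                     # illustrative sizes
obs = tf.placeholder(tf.float32, [None, n_features], name='observations')
acts = tf.placeholder(tf.int32, [None], name='actions_taken')
vt = tf.placeholder(tf.float32, [None], name='discounted_returns')

hidden = tf.layers.dense(obs, 10, activation=tf.nn.tanh,         # fc1: tanh, as in the new code
                         kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
                         bias_initializer=tf.constant_initializer(0.1))
all_act = tf.layers.dense(hidden, n_actions, activation=None,    # fc2: raw logits, no softmax here
                          kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
                          bias_initializer=tf.constant_initializer(0.1))
all_act_prob = tf.nn.softmax(all_act, name='act_prob')           # probabilities used for sampling

# sparse_softmax_cross_entropy_with_logits expects raw logits, which is why fc2
# dropped its softmax activation; the result is -log pi(a_t | s_t).
neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=all_act, labels=acts)
loss = tf.reduce_mean(neg_log_prob * vt)                         # reward-guided loss
train_op = tf.train.AdamOptimizer(0.02).minimize(loss)           # Adam here for illustration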
Reinforcement_learning_TUT/7_Policy_gradient_softmax/run_CartPole.py

+3 -3

@@ -14,7 +14,7 @@
 RENDER = False  # rendering wastes time

 env = gym.make('CartPole-v0')
-# env.seed(2)     # reproducible, general Policy gradient has high variance
+env.seed(1)     # reproducible, general Policy gradient has high variance

 print(env.action_space)
 print(env.observation_space)
@@ -24,9 +24,9 @@
 RL = PolicyGradient(
     n_actions=env.action_space.n,
     n_features=len(env.observation_space.high),
-    learning_rate=0.01,
+    learning_rate=0.02,
     reward_decay=0.99,
-    output_graph=True,
+    # output_graph=True,
 )

 for i_episode in range(3000):

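The run_CartPole.py change fixes the environment seed, bumps the learning rate to 0.02 and comments out the TensorBoard graph dump. For context, a sketch of how the script drives the class, assuming the choose_action / store_transition / learn methods the tutorial's PolicyGradient exposes (episode bookkeeping trimmed):

import gym
from RL_brain import PolicyGradient

env = gym.make('CartPole-v0')
env.seed(1)                       # vanilla policy gradient is high-variance; a fixed seed keeps runs comparable

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=len(env.observation_space.high),
    learning_rate=0.02,           # value set in this commit
    reward_decay=0.99,
)

for i_episode in range(3000):
    observation = env.reset()
    while True:
        action = RL.choose_action(observation)              # sample from all_act_prob
        observation_, reward, done, info = env.step(action)
        RL.store_transition(observation, action, reward)    # buffer the whole episode
        if done:
            RL.learn()            # one Monte-Carlo policy-gradient update per episode
            break
        observation = observation_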
Reinforcement_learning_TUT/7_Policy_gradient_softmax/run_MountainCar.py

+4 -4

@@ -10,7 +10,7 @@
 from RL_brain import PolicyGradient
 import matplotlib.pyplot as plt

-DISPLAY_REWARD_THRESHOLD = -10000  # renders environment if total episode reward is greater then this threshold
+DISPLAY_REWARD_THRESHOLD = -2000  # renders environment if total episode reward is greater then this threshold
 RENDER = False  # rendering wastes time

 env = gym.make('MountainCar-v0')
@@ -24,12 +24,12 @@
 RL = PolicyGradient(
     n_actions=env.action_space.n,
     n_features=len(env.observation_space.high),
-    learning_rate=0.02,
-    reward_decay=0.99,
+    learning_rate=0.01,
+    reward_decay=0.995,
     # output_graph=True,
 )

-for i_episode in range(3000):
+for i_episode in range(500):

     observation = env.reset()

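For MountainCar the commit lowers the render threshold to -2000, softens the learning rate, raises reward_decay to 0.995 and trains for 500 episodes instead of 3000. Episodes here run for hundreds of steps at -1 reward apiece, so the higher discount lets credit reach actions far from the goal. The vt weights used in the loss above are the per-step discounted returns, normalized to cut variance; a NumPy sketch of that computation (the function name and the epsilon are illustrative, the tutorial's class computes the equivalent internally):

import numpy as np

def discount_and_norm_rewards(episode_rewards, gamma=0.995):
    # G_t = r_t + gamma * G_{t+1}, accumulated backwards over one episode
    discounted = np.zeros(len(episode_rewards), dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(episode_rewards))):
        running_add = running_add * gamma + episode_rewards[t]
        discounted[t] = running_add
    # zero-mean / unit-variance normalization reduces gradient variance
    discounted -= discounted.mean()
    discounted /= discounted.std() + 1e-8
    return discounted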
tensorflowTUT/tensorflow8_feeds.py

+1 -1

@@ -11,7 +11,7 @@

 input1 = tf.placeholder(tf.float32)
 input2 = tf.placeholder(tf.float32)
-ouput = tf.mul(input1, input2)
+ouput = tf.multiply(input1, input2)

 with tf.Session() as sess:
     print(sess.run(ouput, feed_dict={input1: [7.], input2: [2.]}))

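The tensorflow8_feeds.py hunk tracks a TensorFlow API rename: tf.mul was dropped in TensorFlow 1.0 in favour of tf.multiply, so the feeds example now builds the same graph under the newer name and feeds both placeholders through feed_dict exactly as before.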