
Commit ae199e7

1 parent 1b9cb6f commit ae199e7

4 files changed, +17 -17 lines changed


Reinforcement_learning_TUT/7_Policy_gradient_softmax/RL_brain.py

+9 -9

@@ -52,25 +52,25 @@ def _build_net(self):
         layer = tf.layers.dense(
             inputs=self.tf_obs,
             units=10,
-            activation=,
-            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.2),
-            bias_initializer=tf.constant_initializer(0.01),
+            activation=tf.nn.tanh,  # tanh activation
+            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
+            bias_initializer=tf.constant_initializer(0.1),
             name='fc1'
         )
         # fc2
-        self.all_act = tf.layers.dense(
+        all_act = tf.layers.dense(
             inputs=layer,
             units=self.n_actions,
-            activation=tf.nn.softmax,
-            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.2),
-            bias_initializer=tf.constant_initializer(0.01),
+            activation=None,
+            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
+            bias_initializer=tf.constant_initializer(0.1),
             name='fc2'
         )

-        self.all_act_prob = tf.nn.softmax(self.all_act)  # convert to probability
+        self.all_act_prob = tf.nn.softmax(all_act, name='act_prob')  # use softmax to convert to probability

         with tf.name_scope('loss'):
-            log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.all_act, labels=self.tf_acts)  # this is negative log
+            log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=all_act, labels=self.tf_acts)  # this is negative log
             loss = tf.reduce_mean(log_prob * self.tf_vt)  # reward guided loss

         with tf.name_scope('train'):

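What the RL_brain.py hunk amounts to: fc1 now uses a tanh activation with a wider initializer, fc2 emits raw logits instead of applying softmax itself, the softmax is applied separately to produce self.all_act_prob for action sampling, and the loss feeds the logits into tf.nn.sparse_softmax_cross_entropy_with_logits, which returns -log pi(a_t | s_t) for the chosen action, weighted by the discounted return self.tf_vt. A minimal standalone sketch of that graph under the TF 1.x API (the placeholder names obs, acts, vt and the sizes 4 and 2 are illustrative, not taken from the file):

import tensorflow as tf

n_features, n_actions = 4, 2                                     # illustrative sizes
obs = tf.placeholder(tf.float32, [None, n_features], name='observations')
acts = tf.placeholder(tf.int32, [None], name='actions_taken')
vt = tf.placeholder(tf.float32, [None], name='discounted_returns')

hidden = tf.layers.dense(obs, 10, activation=tf.nn.tanh,         # fc1: tanh, as in the new code
                         kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
                         bias_initializer=tf.constant_initializer(0.1))
all_act = tf.layers.dense(hidden, n_actions, activation=None,    # fc2: raw logits, no softmax here
                          kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
                          bias_initializer=tf.constant_initializer(0.1))
all_act_prob = tf.nn.softmax(all_act, name='act_prob')           # probabilities used for sampling

# sparse_softmax_cross_entropy_with_logits expects raw logits, which is why fc2
# dropped its softmax activation; the result is -log pi(a_t | s_t).
neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=all_act, labels=acts)
loss = tf.reduce_mean(neg_log_prob * vt)                         # reward-guided loss
train_op = tf.train.AdamOptimizer(0.02).minimize(loss)           # Adam here for illustration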
Reinforcement_learning_TUT/7_Policy_gradient_softmax/run_CartPole.py

+3 -3

@@ -14,7 +14,7 @@
 RENDER = False  # rendering wastes time

 env = gym.make('CartPole-v0')
-# env.seed(2)     # reproducible, general Policy gradient has high variance
+env.seed(1)     # reproducible, general Policy gradient has high variance

 print(env.action_space)
 print(env.observation_space)
@@ -24,9 +24,9 @@
 RL = PolicyGradient(
     n_actions=env.action_space.n,
     n_features=len(env.observation_space.high),
-    learning_rate=0.01,
+    learning_rate=0.02,
     reward_decay=0.99,
-    output_graph=True,
+    # output_graph=True,
 )

 for i_episode in range(3000):

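The run_CartPole.py change fixes the environment seed, bumps the learning rate to 0.02 and comments out the TensorBoard graph dump. For context, a sketch of how the script drives the class, assuming the choose_action / store_transition / learn methods the tutorial's PolicyGradient exposes (episode bookkeeping trimmed):

import gym
from RL_brain import PolicyGradient

env = gym.make('CartPole-v0')
env.seed(1)                       # vanilla policy gradient is high-variance; a fixed seed keeps runs comparable

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=len(env.observation_space.high),
    learning_rate=0.02,           # value set in this commit
    reward_decay=0.99,
)

for i_episode in range(3000):
    observation = env.reset()
    while True:
        action = RL.choose_action(observation)              # sample from all_act_prob
        observation_, reward, done, info = env.step(action)
        RL.store_transition(observation, action, reward)    # buffer the whole episode
        if done:
            RL.learn()            # one Monte-Carlo policy-gradient update per episode
            break
        observation = observation_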
Reinforcement_learning_TUT/7_Policy_gradient_softmax/run_MountainCar.py

+4 -4

@@ -10,7 +10,7 @@
 from RL_brain import PolicyGradient
 import matplotlib.pyplot as plt

-DISPLAY_REWARD_THRESHOLD = -10000  # renders environment if total episode reward is greater then this threshold
+DISPLAY_REWARD_THRESHOLD = -2000  # renders environment if total episode reward is greater then this threshold
 RENDER = False  # rendering wastes time

 env = gym.make('MountainCar-v0')
@@ -24,12 +24,12 @@
 RL = PolicyGradient(
     n_actions=env.action_space.n,
     n_features=len(env.observation_space.high),
-    learning_rate=0.02,
-    reward_decay=0.99,
+    learning_rate=0.01,
+    reward_decay=0.995,
     # output_graph=True,
 )

-for i_episode in range(3000):
+for i_episode in range(500):

     observation = env.reset()

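For MountainCar the commit lowers the render threshold to -2000, softens the learning rate, raises reward_decay to 0.995 and trains for 500 episodes instead of 3000. Episodes here run for hundreds of steps at -1 reward apiece, so the higher discount lets credit reach actions far from the goal. The vt weights used in the loss above are the per-step discounted returns, normalized to cut variance; a NumPy sketch of that computation (the function name and the epsilon are illustrative, the tutorial's class computes the equivalent internally):

import numpy as np

def discount_and_norm_rewards(episode_rewards, gamma=0.995):
    # G_t = r_t + gamma * G_{t+1}, accumulated backwards over one episode
    discounted = np.zeros(len(episode_rewards), dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(episode_rewards))):
        running_add = running_add * gamma + episode_rewards[t]
        discounted[t] = running_add
    # zero-mean / unit-variance normalization reduces gradient variance
    discounted -= discounted.mean()
    discounted /= discounted.std() + 1e-8
    return discounted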
tensorflowTUT/tensorflow8_feeds.py

+1 -1

@@ -11,7 +11,7 @@

 input1 = tf.placeholder(tf.float32)
 input2 = tf.placeholder(tf.float32)
-ouput = tf.mul(input1, input2)
+ouput = tf.multiply(input1, input2)

 with tf.Session() as sess:
     print(sess.run(ouput, feed_dict={input1: [7.], input2: [2.]}))

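The tensorflow8_feeds.py hunk tracks a TensorFlow API rename: tf.mul was dropped in TensorFlow 1.0 in favour of tf.multiply, so the feeds example now builds the same graph under the newer name and feeds both placeholders through feed_dict exactly as before.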