Commit 5b59c20
update code
1 parent 447885e  commit 5b59c20
10 files changed: +343 -641 lines
Reinforcement_learning_TUT/5_Deep_Q_Network/RL_brain.py (+4 -2)

@@ -1,6 +1,7 @@
 """
 This part of code is the Q learning brain, which is a brain of the agent.
 All decisions are made in here.
+Using Tensorflow to build the neural network.

 View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
 """
@@ -197,8 +198,8 @@ def learn(self):
         Then change q_target with the real q_target value w.r.t the q_eval's action.
         For example in:
-        sample 0, I took action 0, and the q_target value is -1;
-        sample 1, I took action 2, and the q_target value is -2:
+        sample 0, I took action 0, and the max q_target value is -1;
+        sample 1, I took action 2, and the max q_target value is -2:
         q_target =
         [[-1, 2, 3],
          [4, 5, -2]]
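
Note: the comment above describes the indexing trick used in learn(): q_target starts as a copy of q_eval, and only the entry for the action actually taken in each sample is overwritten with reward + gamma * max(q_next), so every other entry contributes zero error. A minimal standalone NumPy sketch of that step, reproducing the two-sample example from the comment (all values here are illustrative, not taken from the repo):

import numpy as np

# toy batch: 2 samples, 3 actions, matching the [[-1, 2, 3], [4, 5, -2]] example
q_eval = np.array([[1., 2., 3.], [4., 5., 6.]])
q_next = np.array([[0., -1., -2.], [-3., 0., -4.]])   # target-net output for s_
actions = np.array([0, 2])                            # action taken in each sample
rewards = np.array([-1., -2.])                        # reward received in each sample
gamma = 0.9

q_target = q_eval.copy()                              # copy q_eval so unchosen actions give zero error
batch_index = np.arange(len(actions))
q_target[batch_index, actions] = rewards + gamma * q_next.max(axis=1)
print(q_target)                                       # [[-1, 2, 3], [4, 5, -2]], as in the comment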
@@ -221,6 +222,7 @@ def learn(self):
         self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
         self.learn_step_counter += 1

+
     def plot_cost(self):
         import matplotlib.pyplot as plt
         plt.plot(np.arange(len(self.cost_his)), self.cost_his)

Reinforcement_learning_TUT/6_OpenAI_gym/RL_brain.py (+94 -90)
@@ -1,6 +1,7 @@
 """
 This part of code is the Q learning brain, which is a brain of the agent.
 All decisions are made in here.
+Using Tensorflow to build the neural network.

 View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
 """
@@ -60,6 +61,90 @@ def __init__(
         self.sess.run(tf.global_variables_initializer())
         self.cost_his = []

+    def _build_net(self):
+        # create eval and target net weights and biases separately
+        self._eval_net_params = []
+        self._target_net_params = []
+
+        # build evaluate_net
+        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
+        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')
+        with tf.variable_scope('eval_net'):
+            self.q_eval = self._build_layers(self.s, self.n_actions, trainable=True)
+        with tf.name_scope('loss'):
+            self.loss = tf.reduce_sum(tf.square(self.q_target - self.q_eval))
+        with tf.name_scope('train'):
+            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
+
+        # build target_net
+        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')
+        with tf.variable_scope('target_net'):
+            self.q_next = self._build_layers(self.s_, self.n_actions, trainable=False)
+
+    def _build_layers(self, inputs, action_size, trainable):
+        layers_output = [inputs]
+        for i, n_unit in enumerate(self.hidden_layers):
+            with tf.variable_scope('layer%i' % i):
+                output = self._add_layer(
+                    layers_output[i],
+                    in_size=layers_output[i].get_shape()[1].value,
+                    out_size=n_unit,
+                    activation_function=tf.nn.relu,
+                    trainable=trainable,
+                )
+            layers_output.append(output)
+        with tf.variable_scope('output_layer'):
+            output = self._add_layer(
+                layers_output[-1],
+                in_size=layers_output[-1].get_shape()[1].value,
+                out_size=action_size,
+                activation_function=None,
+                trainable=trainable
+            )
+        return output
+
+    def _add_layer(self, inputs, in_size, out_size, activation_function=None, trainable=True):
+        # create weights and biases
+        Weights = tf.get_variable(
+            name='weights',
+            shape=[in_size, out_size],
+            trainable=trainable,
+            initializer=tf.truncated_normal_initializer(mean=0., stddev=0.3)
+        )
+        biases = tf.get_variable(
+            name='biases',
+            shape=[out_size],
+            initializer=tf.constant_initializer(0.1),
+            trainable=trainable
+        )
+
+        # record parameters
+        if trainable is True:
+            self._eval_net_params.append([Weights, biases])
+        else:
+            self._target_net_params.append([Weights, biases])
+
+        Wx_plus_b = tf.matmul(inputs, Weights) + biases
+
+        # activation function
+        if activation_function is None:
+            outputs = Wx_plus_b
+        else:
+            outputs = activation_function(Wx_plus_b)
+        return outputs
+
+    def store_transition(self, s, a, r, s_):
+        if not hasattr(self, 'memory_counter'):
+            self.memory_counter = 0
+
+        transition = np.hstack((s, [a, r], s_))
+
+        # replace the old memory with new memory
+        index = self.memory_counter % self.memory_size
+        self.memory.iloc[index, :] = transition
+
+        self.memory_counter += 1
+
     def choose_action(self, observation):
         # to have batch dimension when feed into tf placeholder
         observation = observation[np.newaxis, :]
@@ -72,6 +157,13 @@ def choose_action(self, observation):
             action = np.random.randint(0, self.n_actions)
         return action

+    def _replace_target_params(self):
+        replace_ops = []
+        for layer, params in enumerate(self._eval_net_params):
+            replace_op = [tf.assign(self._target_net_params[layer][W_b], params[W_b]) for W_b in range(2)]
+            replace_ops.append(replace_op)
+        self.sess.run(replace_ops)
+
     def learn(self):
         # check to replace target parameters
         if self.learn_step_counter % self.replace_target_iter == 0:
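
Note: _replace_target_params above is the hard target-network update: every replace_target_iter learning steps the eval-net weights and biases are copied wholesale into the frozen target net. Not part of this commit, but an equivalent way to write the same method body using the 'eval_net'/'target_net' variable scopes created in _build_net (a sketch assuming a TF1-style graph and that both scopes create their variables in the same order, which the shared _build_layers guarantees here):

t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')
# copy each eval-net variable into the matching target-net variable
self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])

Either way, building tf.assign ops inside the method adds new nodes to the graph on every call; constructing the assign ops once (for example in _build_net) and rerunning the cached ops avoids graph growth over long training runs.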
@@ -106,8 +198,8 @@ def learn(self):
         Then change q_target with the real q_target value w.r.t the q_eval's action.
         For example in:
-        sample 0, I took action 0, and the max q_target value is -1;
-        sample 1, I took action 2, and the max q_target value is -2:
+        sample 0, I took action 0, and the q_target value is -1;
+        sample 1, I took action 2, and the q_target value is -2:
         q_target =
         [[-1, 2, 3],
          [4, 5, -2]]
@@ -130,98 +222,10 @@ def learn(self):
         self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
         self.learn_step_counter += 1

-    def store_transition(self, s, a, r, s_):
-        if not hasattr(self, 'memory_counter'):
-            self.memory_counter = 0
-
-        transition = np.hstack((s, [a, r], s_))
-
-        # replace the old memory with new memory
-        index = self.memory_counter % self.memory_size
-        self.memory.iloc[index, :] = transition
-
-        self.memory_counter += 1
-
     def plot_cost(self):
         import matplotlib.pyplot as plt
         plt.plot(np.arange(len(self.cost_his)), self.cost_his)
         plt.show()

-    def _replace_target_params(self):
-        replace_ops = []
-        for layer, params in enumerate(self._eval_net_params):
-            replace_op = [tf.assign(self._target_net_params[layer][W_b], params[W_b]) for W_b in range(2)]
-            replace_ops.append(replace_op)
-        self.sess.run(replace_ops)

-    def _build_net(self):
-        # create eval and target net weights and biases separately
-        self._eval_net_params = []
-        self._target_net_params = []

-        # build evaluate_net
-        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
-        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')
-        with tf.variable_scope('eval_net'):
-            self.q_eval = self._build_layers(self.s, self.n_actions, trainable=True)
-        with tf.name_scope('loss'):
-            self.loss = tf.reduce_sum(tf.square(self.q_target - self.q_eval))
-        with tf.name_scope('train'):
-            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
-
-        # build target_net
-        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')
-        with tf.variable_scope('target_net'):
-            self.q_next = self._build_layers(self.s_, self.n_actions, trainable=False)
-
-    def _build_layers(self, inputs, action_size, trainable):
-        layers_output = [inputs]
-        for i, n_unit in enumerate(self.hidden_layers):
-            with tf.variable_scope('layer%i' % i):
-                output = self._add_layer(
-                    layers_output[i],
-                    in_size=layers_output[i].get_shape()[1].value,
-                    out_size=n_unit,
-                    activation_function=tf.nn.relu,
-                    trainable=trainable,
-                )
-            layers_output.append(output)
-        with tf.variable_scope('output_layer'):
-            output = self._add_layer(
-                layers_output[-1],
-                in_size=layers_output[-1].get_shape()[1].value,
-                out_size=action_size,
-                activation_function=None,
-                trainable=trainable
-            )
-        return output
-
-    def _add_layer(self, inputs, in_size, out_size, activation_function=None, trainable=True):
-        # create weights and biases
-        Weights = tf.get_variable(
-            name='weights',
-            shape=[in_size, out_size],
-            trainable=trainable,
-            initializer=tf.truncated_normal_initializer(mean=0., stddev=0.3)
-        )
-        biases = tf.get_variable(
-            name='biases',
-            shape=[out_size],
-            initializer=tf.constant_initializer(0.1),
-            trainable=trainable
-        )
-
-        # record parameters
-        if trainable is True:
-            self._eval_net_params.append([Weights, biases])
-        else:
-            self._target_net_params.append([Weights, biases])
-
-        Wx_plus_b = tf.matmul(inputs, Weights) + biases
-
-        # activation function
-        if activation_function is None:
-            outputs = Wx_plus_b
-        else:
-            outputs = activation_function(Wx_plus_b)
-        return outputs

Reinforcement_learning_TUT/6_OpenAI_gym/run_CartPole.py (+3 -4)
@@ -7,7 +7,6 @@

 import gym
 from RL_brain import DeepQNetwork
-import time

 env = gym.make('CartPole-v0')
 print(env.action_space)
@@ -17,7 +16,7 @@

 RL = DeepQNetwork(n_actions=env.action_space.n,
                   n_features=len(env.observation_space.high),
-                  learning_rate=0.01, e_greedy=0.99,
+                  learning_rate=0.01, e_greedy=0.9,
                   replace_target_iter=100, memory_size=2000,
                   e_greedy_increment=0.001,
                   hidden_layers=[20, 20],)
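
Note on the e_greedy change: in RL_brain.py epsilon grows by e_greedy_increment = 0.001 on every learn() call and is capped at e_greedy, and here epsilon is (by the repo's convention) the probability of picking the greedy action. Assuming epsilon starts at 0, the new cap of 0.9 is reached after roughly 0.9 / 0.001 = 900 learning steps, after which the agent still takes a random action about 10% of the time, rather than the 1% left by the old 0.99 cap.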
@@ -40,8 +39,8 @@

         # the smaller theta and closer to center the better

-        r1 = (env.x_threshold - abs(x))/env.x_threshold - 2
-        r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians
+        r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.8
+        r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians - 0.5
         reward = r1 + r2

         RL.store_transition(observation, action, reward, observation_)
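
Note on the reward shaping change: both terms are normalized to [0, 1] before the constants are subtracted, so with the cart centred and the pole upright the shaped reward is (1 - 0.8) + (1 - 0.5) = 0.7, and at either failure threshold it falls to -0.8 + (-0.5) = -1.3, instead of the old range that was shifted down by the flat -2 on r1. A standalone sketch of the arithmetic, using values assumed to match gym's CartPole-v0 defaults (the real script reads them from env.x_threshold and env.theta_threshold_radians):

# assumed CartPole-v0 thresholds
x_threshold = 2.4
theta_threshold_radians = 12 * 2 * 3.141592653589793 / 360   # ~0.21 rad

def shaped_reward(x, theta):
    # closer to the centre and more upright -> larger reward
    r1 = (x_threshold - abs(x)) / x_threshold - 0.8
    r2 = (theta_threshold_radians - abs(theta)) / theta_threshold_radians - 0.5
    return r1 + r2

print(shaped_reward(0.0, 0.0))                              # about 0.7, best case
print(shaped_reward(x_threshold, theta_threshold_radians))  # about -1.3, at the failure boundary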
