Thesis algorithm
namidairo777 committed Dec 18, 2017
1 parent a09aaa1 commit 45ba629
Showing 4 changed files with 59 additions and 20 deletions.
26 changes: 24 additions & 2 deletions actorcriticv2.py
@@ -140,10 +140,10 @@ def __init__(self,sess,num_agents,state_dim,action_dim,lr,tau,gamma):
self.tau = tau
self.num_agents = num_agents
self.gamma = gamma
self.mainModel,self.state,self.actions = self._build_hard2_model()
self.mainModel,self.state,self.actions = self._build_hard3_model()
self.mainModel._make_predict_function()
self.mainModel._make_train_function()
self.targetModel,_,_ = self._build_hard2_model()
self.targetModel,_,_ = self._build_hard3_model()
self.targetModel._make_predict_function()
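# Gradient of the critic output w.r.t. the action input; in DDPG-style training this gradient is typically fed back to the actor update.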
self.action_grads = tf.gradients(self.mainModel.output,self.actions)
self.sess.run(tf.global_variables_initializer())
@@ -209,6 +209,28 @@ def _build_hard2_model(self):
model.compile(optimizer='Adam',loss='mean_squared_error')
return model,input_obs,input_actions

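# "hard3" critic: observations and actions each pass through their own Dense(400) + ReLU branch, the branches are merged with Add(), followed by Dense(300) + ReLU and a single linear Q-value output.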
def _build_hard3_model(self):
input_obs = Input(shape=(self.state_dim,))
input_actions = Input(shape=(self.action_dim,))
temp_obs = Dense(400)(input_obs)
obs = Activation('relu')(temp_obs)
temp_actions = Dense(400)(input_actions)
actions = Activation('relu')(temp_actions)

#h = BatchNormalization()(h)
# action_abs = Dense(300)(input_actions)
# temp1 = Dense(300)(h)
#action_abs = Activation('relu')(action_abs)
#action_abs = BatchNormalization()(action_abs)
h = Add()([obs,actions])
h = Dense(300)(h)
h = Activation('relu')(h)
#h = BatchNormalization()(h)
pred = Dense(1,kernel_initializer='random_uniform')(h)
model = Model(inputs=[input_obs,input_actions],outputs=pred)
model.compile(optimizer='Adam',loss='mean_squared_error')
return model,input_obs,input_actions

# Simple Network model
def _build_simple_model(self):
input_obs = Input(shape=(self.state_dim,))
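For readers skimming the diff, here is a standalone sketch of the same two-input critic pattern that _build_hard3_model builds, showing the Keras call pattern for one TD-style update. It is not code from this repository; the dimensions and training data are placeholders.

import numpy as np
from keras.layers import Input, Dense, Activation, Add
from keras.models import Model

state_dim, action_dim = 16, 5                     # placeholder dimensions
obs_in, act_in = Input(shape=(state_dim,)), Input(shape=(action_dim,))
obs = Activation('relu')(Dense(400)(obs_in))      # observation branch
act = Activation('relu')(Dense(400)(act_in))      # action branch
h = Activation('relu')(Dense(300)(Add()([obs, act])))
q = Dense(1, kernel_initializer='random_uniform')(h)
critic = Model(inputs=[obs_in, act_in], outputs=q)
critic.compile(optimizer='Adam', loss='mean_squared_error')

batch = 64
states = np.random.randn(batch, state_dim)
actions = np.random.randn(batch, action_dim)
y = np.random.randn(batch, 1)                     # stands in for r + gamma * target Q
critic.train_on_batch([states, actions], y)
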
12 changes: 6 additions & 6 deletions main2.py
@@ -2,12 +2,12 @@
from gym import wrappers
import make_env
import numpy as np
#import random
#from ReplayMemory import ReplayMemory
import random
from ReplayMemory import ReplayMemory
from ExplorationNoise import OrnsteinUhlenbeckActionNoise as OUNoise
from actorcriticv2 import ActorNetwork,CriticNetwork
#from actorcriticv1 import Brain, Worker
# from Train import train
from Train import train
# from Distributed_Train import *
import argparse
from keras.models import load_model
@@ -482,7 +482,7 @@ def main(args):
print("Episode: {:d} | Reward: {:f}".format(ep, reward))

else:
if False:
if True:
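# True: run the train() loop imported from Train; the else branch below sets up the distributed brain/worker variant instead.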
train(sess,env,args,actors,critics,exploration_noise, ave_n)
else:
global graph, global_queue, update_event, rolling_event, global_step_max, global_step, coord, brain
@@ -551,8 +551,8 @@ def run():
parser.add_argument('--render-env', help='render the gym env', action='store_true')
parser.add_argument('--use-gym-monitor', help='record gym results', action='store_true')
parser.add_argument('--monitor-dir', help='directory for storing gym results', default='./results/videos/video1')
parser.add_argument('--summary-dir', help='directory for storing tensorboard info', default='./results/2vs1_distributed/tfdata/')
parser.add_argument('--modelFolder', help='the folder which saved model data', default="./results/good_weights/actor")
parser.add_argument('--summary-dir', help='directory for storing tensorboard info', default='./results/3vs1_hard3/tfdata/')
parser.add_argument('--modelFolder', help='the folder which saved model data', default="./results/3vs1_hard3/weights/")
parser.add_argument('--runTest', help='use saved model to run', default=True)

parser.set_defaults(render_env=False)
2 changes: 1 addition & 1 deletion multiagent-envs/multiagent/scenarios/simple_tag.py
@@ -9,7 +9,7 @@ def make_world(self):
# set any world properties first
world.dim_c = 2
num_good_agents = 1
num_adversaries = 2
num_adversaries = 3
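# 3 predators (adversaries) vs. 1 prey, matching the 3vs1 result directories configured in main2.py.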
num_agents = num_adversaries + num_good_agents
num_landmarks = 1
num_borders = 80 # (20 * 4)
39 changes: 28 additions & 11 deletions tests/thesis.tex
@@ -168,17 +168,19 @@ \section{Proposed Methods}[still need more time on section 3]
In traditional DRL methods such as DQN or DDPG, a replay memory is used to break the strong correlation between consecutive transitions. During training, each transition is added to the replay memory, and when the network is updated, a batch of fixed size (64 or 128) is sampled from it. However, as the network keeps updating, old experiences become less useful for learning than batches collected under the recent policy.
In our distributed method, we use several (2, ..., n) workers that are responsible for collecting batch data. Each worker runs on a different thread with its own copy of the environment (and its own random exploration noise), collects (state, actions, rewards, done, next state) tuples, and pushes them into a fixed-size memory (64 or 128). When the memory is full, the brain starts updating the networks with this batch. Because the workers act in parallel copies of the environment, the correlation problem between batch samples is avoided.

\subsection{Distributed Centralized Critic and decentralized Actor}
\subsection{Centralized Critic and Decentralized Actor}
[MADDPG]
Recently, OpenAI released a method that extends the traditional DDPG method to the multi-agent domain \cite{4}. As we know, single-agent algorithms fail because, while one agent is updating its policy, the environment becomes non-stationary from the other agents' perspective, which leads to a failure of convergence. Multi-Agent DDPG found a centralized way to take the other agents' actions into account in the critic.
Recently, OpenAI released a method that extends the traditional DDPG method to the multi-agent domain \cite{4}. As we know, single-agent algorithms fail because, while one agent is updating its policy, the environment becomes non-stationary from the other agents' perspective, which leads to a failure of convergence. \textbf{Multi-Agent DDPG} found a centralized way to take the other agents' actions into account in the critic.
\begin{equation}
L(\theta_i) = E_{s,a,r,s'}\big[(Q_i^*(s, a_1, a_2, \ldots, a_n) - y)^2\big],
\end{equation}
where $y = r_i + \gamma Q_i^*(s', a_1', a_2', \ldots, a_n')\,\big|_{a_j' = \mu_j'(o_j)}$.
This is a great idea: learn with a centralized critic and act with decentralized actors.
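As a purely illustrative instance of this target (the numbers are chosen for the example, not taken from any experiment): with $r_i = 1$, $\gamma = 0.95$, and $Q_i^*(s', a_1', \ldots, a_n') = 1.6$ under the target policies, the target becomes $y = 1 + 0.95 \times 1.6 = 2.52$.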
[Explain on why env becomes stationary?]
However, MADDPG adopts the same network architecture as DDPG; because of the number of agents, learning becomes slow due to the heavy computation. At the same time, it is difficult to choose a good learning rate.

Based on the centralized-critic idea from MADDPG, we introduce our two approaches to the multi-agent task in the Predator-Prey environment.


\subsection{Distributed Method based on action-value function as appro}
As we mentioned in 2.6, we drew inspiration from MADDPG, which is a multi-agent extension of DDPG. We can extend this centralized idea to a critic based on the advantage (TD error).
\begin{equation}
A(s_t, a_t; \theta, \theta_v) = \sum_{i=0}^{k-1}\gamma^i r_{t+i} + \gamma^k V(s_{t+k};\theta_v) - V(s_t; \theta_v).
@@ -202,15 +204,30 @@ \subsection{Distributed Multi-Agent Cooperation Architecture}
Our architecture consists of a brain and several workers. The workers run in parallel copies of the environment to collect batch data with low correlation. In each worker, we use the brain's policy to choose an action, add exploration noise, apply the action to the environment, and obtain the transition (state, actions, rewards, done, next state). This data is stored in a shared exchange memory that can be accessed by all workers (each running on its own thread). When the stored batch reaches a predefined size, the workers stop collecting data from their environments; meanwhile, the brain starts updating the networks with the batch data.

\begin{algorithm*}
\caption{Distributed Multi-Agent Cooperation Algorithm}
\caption{Distributed Worker Algorithm (one worker per thread)}
\begin{algorithmic}
\State {Initialize a random process $\mathbb{N}$ for action exploration}
\State {Receive initial state $s$}
\For {$t = 1$ to max-episode-length}
\State {for each agent $i$, select action $a_i = \pi_i(o_i) + \mathbb{N}_t$ w.r.t.\ the current policy and exploration}
\State {Execute actions $a = (a_1 ... a_n)$ and observe reward $r = (r_1 ... r_n)$ for each agent and new state $s'$}
\If {Queue size is not equal to max-batch-size}
\State {Store $(s, a, r, s')$ in the Queue, which can be accessed by other threads}
\EndIf
\EndFor
\end{algorithmic}
\end{algorithm*}

\begin{algorithm*}
\caption{Distributed Multi-Agent Cooperation Algorithm Based on the Action-Value Function}
\begin{algorithmic}
\State {Initiate brain and n workers}
\State {}
\State $New Graph (abstraction level+1)$
\State $Define a queue$
\State {$Initiate\ queue\ with\ head\ node\ pointer\ of\ graph$}
\State {Initialize the brain and $n$ workers for $n$ agents}
\State $$
\For {$episode = 1$ to $M$}
\State {$Do something$}
\State {$$}
\For {$t = 1$ to max-episode-length}
\State {Get batch data $(s, a, r, s')$ from Queue}
\EndFor
\EndFor
\While {$queue \neq empty$}
\State $node1 \gets queue.pop()$
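To make Algorithm 1 (the per-thread worker) concrete, here is a minimal Python sketch of the collection loop it describes. All names here (env, brain, noise) are hypothetical stand-ins, not code from this commit.

import queue

BATCH_SIZE = 64                          # the fixed memory size mentioned in the text (64 or 128)
shared_queue = queue.Queue(maxsize=BATCH_SIZE)

def worker(env, brain, noise, max_episode_length=100):
    # One episode of experience collection, following Algorithm 1.
    s = env.reset()
    for t in range(max_episode_length):
        # Each agent i selects a_i = pi_i(o_i) + N_t using the brain's current policy plus exploration noise.
        a = [brain.policy(i, o) + noise.sample() for i, o in enumerate(s)]
        s2, r, done, _ = env.step(a)
        if not shared_queue.full():          # queue size below max-batch-size
            shared_queue.put((s, a, r, s2))  # shared with the brain's thread
        s = s2
        if all(done):
            break

Once the queue is full, the workers pause and the brain consumes the batch to update the networks, as described in the architecture subsection above.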
