From 45ba6299180dd57dbf61c229c1a3c35a2a4e3891 Mon Sep 17 00:00:00 2001
From: namidairo777
Date: Mon, 18 Dec 2017 11:47:33 +0900
Subject: [PATCH] Thesis algorithm

---
 actorcriticv2.py                       | 26 ++++++++++++-
 main2.py                               | 12 +++---
 .../multiagent/scenarios/simple_tag.py |  2 +-
 tests/thesis.tex                       | 39 +++++++++++++------
 4 files changed, 59 insertions(+), 20 deletions(-)

diff --git a/actorcriticv2.py b/actorcriticv2.py
index e5fe5d9..eedb6ee 100644
--- a/actorcriticv2.py
+++ b/actorcriticv2.py
@@ -140,10 +140,10 @@ def __init__(self,sess,num_agents,state_dim,action_dim,lr,tau,gamma):
         self.tau = tau
         self.num_agents = num_agents
         self.gamma = gamma
-        self.mainModel,self.state,self.actions = self._build_hard2_model()
+        self.mainModel,self.state,self.actions = self._build_hard3_model()
         self.mainModel._make_predict_function()
         self.mainModel._make_train_function()
-        self.targetModel,_,_ = self._build_hard2_model()
+        self.targetModel,_,_ = self._build_hard3_model()
         self.targetModel._make_predict_function()
         self.action_grads = tf.gradients(self.mainModel.output,self.actions)
         self.sess.run(tf.global_variables_initializer())
@@ -209,6 +209,28 @@ def _build_hard2_model(self):
         model.compile(optimizer='Adam',loss='mean_squared_error')
         return model,input_obs,input_actions
 
+    def _build_hard3_model(self):
+        input_obs = Input(shape=(self.state_dim,))
+        input_actions = Input(shape=(self.action_dim,))
+        temp_obs = Dense(400)(input_obs)
+        obs = Activation('relu')(temp_obs)
+        temp_actions = Dense(400)(input_actions)
+        actions = Activation('relu')(temp_actions)
+
+        #h = BatchNormalization()(h)
+        # action_abs = Dense(300)(input_actions)
+        # temp1 = Dense(300)(h)
+        #action_abs = Activation('relu')(action_abs)
+        #action_abs = BatchNormalization()(action_abs)
+        h = Add()([obs,actions])
+        h = Dense(300)(h)
+        h = Activation('relu')(h)
+        #h = BatchNormalization()(h)
+        pred = Dense(1,kernel_initializer='random_uniform')(h)
+        model = Model(inputs=[input_obs,input_actions],outputs=pred)
+        model.compile(optimizer='Adam',loss='mean_squared_error')
+        return model,input_obs,input_actions
+
     # Simple Network model
     def _build_simple_model(self):
         input_obs = Input(shape=(self.state_dim,))
diff --git a/main2.py b/main2.py
index 84d1ce2..6f53ed4 100644
--- a/main2.py
+++ b/main2.py
@@ -2,12 +2,12 @@
 from gym import wrappers
 import make_env
 import numpy as np
-#import random
-#from ReplayMemory import ReplayMemory
+import random
+from ReplayMemory import ReplayMemory
 from ExplorationNoise import OrnsteinUhlenbeckActionNoise as OUNoise
 from actorcriticv2 import ActorNetwork,CriticNetwork
 #from actorcriticv1 import Brain, Worker
-# from Train import train
+from Train import train
 # from Distributed_Train import *
 import argparse
 from keras.models import load_model
@@ -482,7 +482,7 @@
             print("Episode: {:d} | Reward: {:f}".format(ep, reward))
 
     else:
-        if False:
+        if True:
            train(sess,env,args,actors,critics,exploration_noise, ave_n)
         else:
            global graph, global_queue, update_event, rolling_event, global_step_max, global_step, coord, brain
@@ -551,8 +551,8 @@
     parser.add_argument('--render-env', help='render the gym env', action='store_true')
     parser.add_argument('--use-gym-monitor', help='record gym results', action='store_true')
     parser.add_argument('--monitor-dir', help='directory for storing gym results', default='./results/videos/video1')
-    parser.add_argument('--summary-dir', help='directory for storing tensorboard info', default='./results/2vs1_distributed/tfdata/')
-    parser.add_argument('--modelFolder', help='the folder which saved model data', default="./results/good_weights/actor")
+    parser.add_argument('--summary-dir', help='directory for storing tensorboard info', default='./results/3vs1_hard3/tfdata/')
+    parser.add_argument('--modelFolder', help='the folder which saved model data', default="./results/3vs1_hard3/weights/")
     parser.add_argument('--runTest', help='use saved model to run', default=True)
 
     parser.set_defaults(render_env=False)
diff --git a/multiagent-envs/multiagent/scenarios/simple_tag.py b/multiagent-envs/multiagent/scenarios/simple_tag.py
index c5ece2d..4c27dcc 100644
--- a/multiagent-envs/multiagent/scenarios/simple_tag.py
+++ b/multiagent-envs/multiagent/scenarios/simple_tag.py
@@ -9,7 +9,7 @@ def make_world(self):
         # set any world properties first
         world.dim_c = 2
         num_good_agents = 1
-        num_adversaries = 2
+        num_adversaries = 3
         num_agents = num_adversaries + num_good_agents
         num_landmarks = 1
         num_borders = 80 # (20 * 4)
diff --git a/tests/thesis.tex b/tests/thesis.tex
index 20fb692..a94aea0 100644
--- a/tests/thesis.tex
+++ b/tests/thesis.tex
@@ -168,17 +168,19 @@ \section{Proposed Methods}[still need more time on section 3]
 In traditional DRL methods such as DQN or DDPG, a replay memory is used to break the close correlations between consecutive transitions. During training, each transition is added to the replay memory; when the network is updated, batches of a fixed size (64 or 128) are sampled from it. However, as the network keeps updating, old experiences become less useful for learning than batches collected under the recent policy.
 
 In our distributed method, we use several (2 to $n$) workers that are in charge of collecting batch data. Each worker runs on its own thread with its own copy of the environment (and its own random exploration noise), collects transitions (state, actions, rewards, done, next state), and pushes them into a fixed-size memory (64 or 128). When the memory is full, the brain starts updating with this batch. Because the workers act in parallel copies of the environment, the correlation problem between batch samples is resolved.
 
-\subsection{Distributed Centralized Critic and decentralized Actor}
+\subsection{Centralized Critic and Decentralized Actor}
 [MADDPG]
-Recently, OpenAI rleased a method which extends traditional DDPG method to multi-agent domain \cite{4}. As we know, single-agent algorithm failed because while agent is updating policy, the environment becomes non-stationary which turns out to failure of convergence. Multi-Agent DDPG found a centralized way to put other agents's actions into consideration in critic.
+Recently, OpenAI released a method that extends the traditional DDPG method to the multi-agent domain \cite{4}. Single-agent algorithms fail in this setting because, while one agent is updating its policy, the environment becomes non-stationary from the other agents' perspective, which leads to a failure of convergence. \textbf{Multi-Agent DDPG} introduces a centralized critic that takes the other agents' actions into consideration:
 \begin{equation}
 L(\theta_i) = \mathbb{E}_{s,a,r,s'}\big[(Q_i^*(s, a_1, \ldots, a_n) - y)^2\big],
 \end{equation}
 where $y = r_i + \gamma\,{Q_i}^*(s', a_1', \ldots, a_n')\,\big|_{a_j'=\mu_j'(o_j)}$.
 This is a great idea: learn with a centralized critic and act with decentralized actors.
-[Explain on why env becomes stationary?]
-However, MADDPG adopted the same network from DDPG, due the multiple agent number, the learning becomes slow for heavy computation. In the meantime, it is difficult to choose a good learning rate.
+Based on the centralized critic idea from MADDPG, we introduce our two approaches to the multi-agent task in the Predator-Prey environment.
+
+
+\subsection{Distributed Method based on action-value function as appro}
 As we mentioned in 2.6, we took inspiration from MADDPG, which is a multi-agent extension of DDPG. We can extend this centralized idea to the critic by using the advantage (TD error):
 \begin{equation}
 A(s_t, a_t; \theta, \theta_v) = \sum_{i=0}^{k-1}\gamma^i r_{t+i} + \gamma^k V(s_{t+k};\theta_v) - V(s_t; \theta_v).
@@ -202,15 +204,30 @@ \subsection{Distributed Multi-Agent Cooperation Architecture}
 There is a brain and several workers in our architecture.
 We use several workers acting in parallel copies of the environment to collect batch data with low correlation. In each worker, we use the brain's policy to choose an action, apply this action (with exploration noise) to the worker's environment, and obtain the transition (state, actions, rewards, done, next state). This data is stored in a shared exchange memory that can be accessed by all workers (each running in its own thread). When the stored batch reaches a predefined size, the workers stop collecting data from their environments; meanwhile, the brain starts updating the networks with the batch data.
 \begin{algorithm*}
-\caption{Distributed Multi-Agent Cooperation Algorithm}
+\caption{Distributed Worker on Each Thread}
+\begin{algorithmic}
+\State {Initialize a random process $\mathbb{N}$ for action exploration}
+\State {Receive initial state $s$}
+\For {$t = 1$ to max-episode-length}
+    \State {For each agent $i$, select action $a_i = \pi_i(o_i) + \mathbb{N}_t$ w.r.t.\ the current policy and exploration noise}
+    \State {Execute actions $a = (a_1, \ldots, a_n)$, observe rewards $r = (r_1, \ldots, r_n)$ for each agent and the new state $s'$}
+    \If {Queue size is less than max-batch-size}
+        \State {Store $(s, a, r, s')$ in the Queue, which can be accessed by the other threads}
+    \EndIf
+\EndFor
+\end{algorithmic}
+\end{algorithm*}
+
+\begin{algorithm*}
+\caption{Distributed Multi-Agent Cooperation Algorithm based on the action-value function}
 \begin{algorithmic}
-\State {Initiate brain and n workers}
-\State {}
-\State $New Graph (abstraction level+1)$
-\State $Define a queue$
-\State {$Initiate\ queue\ with\ head\ node\ pointer\ of\ graph$}
+\State {Initialize the brain and $n$ workers for $n$ agents}
+\State {}
 \For {$episode = 1$ to $M$}
-    \State {$Do something$}
+    \State {}
+    \For {$t = 1$ to max-episode-length}
+        \State {Get batch data $(s, a, r, s')$ from the Queue}
+    \EndFor
 \EndFor
 \While {$queue \neq empty$}
     \State $node1 \gets queue.pop()$
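
For reference, the centralized critic target in the loss above, y = r_i + gamma * Q_i'(s', a_1', ..., a_n'), can be sketched in a few lines of numpy. This is a minimal illustration only, not the code added in actorcriticv2.py; the helper name centralized_critic_targets and the callables target_actors and target_critic are assumptions made for the sketch.

# Sketch only: MADDPG-style centralized critic target for agent i.
# All names here are illustrative, not taken from this repository.
import numpy as np

def centralized_critic_targets(rewards_i, next_obs, dones,
                               target_actors, target_critic, gamma=0.95):
    """y = r_i + gamma * Q_i'(s', a_1', ..., a_n') for a batch of transitions.

    rewards_i:     (batch,) rewards observed by agent i
    next_obs:      list of n arrays, next_obs[j] of shape (batch, obs_dim_j)
    dones:         (batch,) 1.0 where the episode terminated, else 0.0
    target_actors: list of n callables, target_actors[j](o_j) -> a_j'
    target_critic: callable (s', [a_1', ..., a_n']) -> (batch,) Q-values
    """
    # Each target actor proposes its next action from its own observation.
    next_actions = [mu_j(o_j) for mu_j, o_j in zip(target_actors, next_obs)]
    # The centralized target critic sees the full next state and every agent's action.
    full_next_state = np.concatenate(next_obs, axis=-1)
    q_next = target_critic(full_next_state, next_actions)
    # Bootstrap only from non-terminal transitions.
    return rewards_i + gamma * (1.0 - dones) * q_next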
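
The k-step advantage used in the second subsection, A(s_t, a_t) = sum_{i=0}^{k-1} gamma^i r_{t+i} + gamma^k V(s_{t+k}) - V(s_t), can likewise be written out directly. Again a sketch under assumed inputs; n_step_advantage is a hypothetical name, not a function in this repository.

# Sketch only: k-step advantage estimate matching the equation in the thesis text.
def n_step_advantage(rewards, v_t, v_tk, gamma=0.95):
    """rewards = [r_t, ..., r_{t+k-1}], v_t = V(s_t), v_tk = V(s_{t+k})."""
    k = len(rewards)
    n_step_return = sum(gamma ** i * r for i, r in enumerate(rewards))
    return n_step_return + gamma ** k * v_tk - v_t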
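
Algorithms 1 and 2 together describe a producer-consumer loop: worker threads fill a bounded queue with transitions while the brain blocks until a full batch is available. The sketch below shows that structure with Python's threading and queue modules; it is not the Train.py implementation, and MAX_BATCH, worker, and brain_update_loop are names assumed for the illustration.

# Sketch only: producer-consumer structure behind Algorithms 1 and 2.
import queue
import threading

MAX_BATCH = 64                               # "max-batch-size" in Algorithm 1
batch_queue = queue.Queue(maxsize=MAX_BATCH)

def worker(env, policies, noise, max_episode_length=25):
    """One thread per worker: act in its own env copy and push transitions."""
    s = env.reset()
    for t in range(max_episode_length):
        # Each agent i selects a_i = pi_i(o_i) + exploration noise.
        a = [pi(o) + noise() for pi, o in zip(policies, s)]
        s_next, r, done = env.step(a)
        if not batch_queue.full():
            batch_queue.put((s, a, r, s_next))   # shared with the brain thread
        s = s_next if not done else env.reset()

def brain_update_loop(update_fn):
    """Brain thread: wait until a full batch has been collected, then update."""
    while True:
        batch = [batch_queue.get() for _ in range(MAX_BATCH)]
        update_fn(batch)   # e.g. centralized critic update, then actor updates

# Usage (illustrative): start n worker threads plus one brain thread, e.g.
#   threading.Thread(target=worker, args=(make_env(), policies, noise)).start()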