Thesis algorithm
namidairo777 committed Dec 18, 2017
1 parent a09aaa1 commit 45ba629
Showing 4 changed files with 59 additions and 20 deletions.
26 changes: 24 additions & 2 deletions actorcriticv2.py
@@ -140,10 +140,10 @@ def __init__(self,sess,num_agents,state_dim,action_dim,lr,tau,gamma):
self.tau = tau
self.num_agents = num_agents
self.gamma = gamma
self.mainModel,self.state,self.actions = self._build_hard2_model()
self.mainModel,self.state,self.actions = self._build_hard3_model()
self.mainModel._make_predict_function()
self.mainModel._make_train_function()
self.targetModel,_,_ = self._build_hard2_model()
self.targetModel,_,_ = self._build_hard3_model()
self.targetModel._make_predict_function()
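# Gradient of the critic output w.r.t. the action input; in DDPG-style training this gradient is typically fed back to the actor update.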
self.action_grads = tf.gradients(self.mainModel.output,self.actions)
self.sess.run(tf.global_variables_initializer())
@@ -209,6 +209,28 @@ def _build_hard2_model(self):
model.compile(optimizer='Adam',loss='mean_squared_error')
return model,input_obs,input_actions

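# "hard3" critic: observations and actions each pass through their own Dense(400) + ReLU branch, the branches are merged with Add(), followed by Dense(300) + ReLU and a single linear Q-value output.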
def _build_hard3_model(self):
input_obs = Input(shape=(self.state_dim,))
input_actions = Input(shape=(self.action_dim,))
temp_obs = Dense(400)(input_obs)
obs = Activation('relu')(temp_obs)
temp_actions = Dense(400)(input_actions)
actions = Activation('relu')(temp_actions)

#h = BatchNormalization()(h)
# action_abs = Dense(300)(input_actions)
# temp1 = Dense(300)(h)
#action_abs = Activation('relu')(action_abs)
#action_abs = BatchNormalization()(action_abs)
h = Add()([obs,actions])
h = Dense(300)(h)
h = Activation('relu')(h)
#h = BatchNormalization()(h)
pred = Dense(1,kernel_initializer='random_uniform')(h)
model = Model(inputs=[input_obs,input_actions],outputs=pred)
model.compile(optimizer='Adam',loss='mean_squared_error')
return model,input_obs,input_actions

# Simple Network model
def _build_simple_model(self):
input_obs = Input(shape=(self.state_dim,))
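For readers skimming the diff, here is a standalone sketch of the same two-input critic pattern that _build_hard3_model builds, showing the Keras call pattern for one TD-style update. It is not code from this repository; the dimensions and training data are placeholders.

import numpy as np
from keras.layers import Input, Dense, Activation, Add
from keras.models import Model

state_dim, action_dim = 16, 5                     # placeholder dimensions
obs_in, act_in = Input(shape=(state_dim,)), Input(shape=(action_dim,))
obs = Activation('relu')(Dense(400)(obs_in))      # observation branch
act = Activation('relu')(Dense(400)(act_in))      # action branch
h = Activation('relu')(Dense(300)(Add()([obs, act])))
q = Dense(1, kernel_initializer='random_uniform')(h)
critic = Model(inputs=[obs_in, act_in], outputs=q)
critic.compile(optimizer='Adam', loss='mean_squared_error')

batch = 64
states = np.random.randn(batch, state_dim)
actions = np.random.randn(batch, action_dim)
y = np.random.randn(batch, 1)                     # stands in for r + gamma * target Q
critic.train_on_batch([states, actions], y)
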
12 changes: 6 additions & 6 deletions main2.py
@@ -2,12 +2,12 @@
from gym import wrappers
import make_env
import numpy as np
#import random
#from ReplayMemory import ReplayMemory
import random
from ReplayMemory import ReplayMemory
from ExplorationNoise import OrnsteinUhlenbeckActionNoise as OUNoise
from actorcriticv2 import ActorNetwork,CriticNetwork
#from actorcriticv1 import Brain, Worker
# from Train import train
from Train import train
# from Distributed_Train import *
import argparse
from keras.models import load_model
@@ -482,7 +482,7 @@ def main(args):
print("Episode: {:d} | Reward: {:f}".format(ep, reward))

else:
if False:
if True:
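# True: run the train() loop imported from Train; the else branch below sets up the distributed brain/worker variant instead.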
train(sess,env,args,actors,critics,exploration_noise, ave_n)
else:
global graph, global_queue, update_event, rolling_event, global_step_max, global_step, coord, brain
@@ -551,8 +551,8 @@ def run():
parser.add_argument('--render-env', help='render the gym env', action='store_true')
parser.add_argument('--use-gym-monitor', help='record gym results', action='store_true')
parser.add_argument('--monitor-dir', help='directory for storing gym results', default='./results/videos/video1')
parser.add_argument('--summary-dir', help='directory for storing tensorboard info', default='./results/2vs1_distributed/tfdata/')
parser.add_argument('--modelFolder', help='the folder which saved model data', default="./results/good_weights/actor")
parser.add_argument('--summary-dir', help='directory for storing tensorboard info', default='./results/3vs1_hard3/tfdata/')
parser.add_argument('--modelFolder', help='the folder which saved model data', default="./results/3vs1_hard3/weights/")
parser.add_argument('--runTest', help='use saved model to run', default=True)

parser.set_defaults(render_env=False)
2 changes: 1 addition & 1 deletion multiagent-envs/multiagent/scenarios/simple_tag.py
@@ -9,7 +9,7 @@ def make_world(self):
# set any world properties first
world.dim_c = 2
num_good_agents = 1
num_adversaries = 2
num_adversaries = 3
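# 3 predators (adversaries) vs. 1 prey, matching the 3vs1 result directories configured in main2.py.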
num_agents = num_adversaries + num_good_agents
num_landmarks = 1
num_borders = 80 # (20 * 4)
39 changes: 28 additions & 11 deletions tests/thesis.tex
@@ -168,17 +168,19 @@ \section{Proposed Methods}[still need more time on section 3]
In traditional DRL methods such as DQN or DDPG, a replay memory is used to break the strong correlation between consecutive transitions. During training, each transition is added to the replay memory, and when the network is updated, a batch of fixed size (64 or 128) is sampled from it. However, as the network keeps updating, old experiences become less useful for learning than batches collected under the recent policy.
In our distributed method, we use several (2, ..., n) workers that are responsible for collecting batch data. Each worker runs on a different thread with its own copy of the environment (and its own random exploration noise), collects (state, actions, rewards, done, next state) tuples, and pushes them into a fixed-size memory (64 or 128). When the memory is full, the brain starts updating the networks with this batch. Because the workers act in parallel copies of the environment, the correlation problem between batch samples is avoided.

\subsection{Distributed Centralized Critic and decentralized Actor}
\subsection{Centralized Critic and Decentralized Actor}
[MADDPG]
Recently, OpenAI released a method that extends the traditional DDPG method to the multi-agent domain \cite{4}. As we know, single-agent algorithms fail because, while one agent is updating its policy, the environment becomes non-stationary from the other agents' perspective, which leads to a failure of convergence. Multi-Agent DDPG found a centralized way to take the other agents' actions into account in the critic.
Recently, OpenAI released a method that extends the traditional DDPG method to the multi-agent domain \cite{4}. As we know, single-agent algorithms fail because, while one agent is updating its policy, the environment becomes non-stationary from the other agents' perspective, which leads to a failure of convergence. \textbf{Multi-Agent DDPG} found a centralized way to take the other agents' actions into account in the critic.
\begin{equation}
L(\theta_i) = E_{s,a,r,s'}\big[(Q_i^*(s, a_1, a_2, \ldots, a_n) - y)^2\big],
\end{equation}
where $y = r_i + \gamma Q_i^*(s', a_1', a_2', \ldots, a_n')\,\big|_{a_j' = \mu_j'(o_j)}$.
This is a great idea: learn with a centralized critic and act with decentralized actors.
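As a purely illustrative instance of this target (the numbers are chosen for the example, not taken from any experiment): with $r_i = 1$, $\gamma = 0.95$, and $Q_i^*(s', a_1', \ldots, a_n') = 1.6$ under the target policies, the target becomes $y = 1 + 0.95 \times 1.6 = 2.52$.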
[Explain on why env becomes stationary?]
However, MADDPG adopts the same network architecture as DDPG; because of the number of agents, learning becomes slow due to the heavy computation. At the same time, it is difficult to choose a good learning rate.

Based on the centralized-critic idea from MADDPG, we introduce our two approaches to the multi-agent task in the Predator-Prey environment.


\subsection{Distributed Method based on action-value function as appro}
As we mentioned in 2.6, we drew inspiration from MADDPG, which is a multi-agent extension of DDPG. We can extend this centralized idea to a critic based on the advantage (TD error).
\begin{equation}
A(s_t, a_t; \theta, \theta_v) = \sum_{i=0}^{k-1}\gamma^i r_{t+i} + \gamma^k V(s_{t+k};\theta_v) - V(s_t; \theta_v).
@@ -202,15 +204,30 @@ \subsection{Distributed Multi-Agent Cooperation Architecture}
Our architecture consists of a brain and several workers. The workers run in parallel copies of the environment to collect batch data with low correlation. In each worker, we use the brain's policy to choose an action, add exploration noise, apply the action to the environment, and obtain the transition (state, actions, rewards, done, next state). This data is stored in a shared exchange memory that can be accessed by all workers (each running on its own thread). When the stored batch reaches a predefined size, the workers stop collecting data from their environments; meanwhile, the brain starts updating the networks with the batch data.

\begin{algorithm*}
\caption{Distributed Multi-Agent Cooperation Algorithm}
\caption{Distributed Worker Algorithm (one worker per thread)}
\begin{algorithmic}
\State {Initialize a random process $\mathbb{N}$ for action exploration}
\State {Receive initial state $s$}
\For {$t = 1$ to max-episode-length}
\State {for each agent $i$, select action $a_i = \pi_i(o_i) + \mathbb{N}_t$ w.r.t.\ the current policy and exploration}
\State {Execute actions $a = (a_1 ... a_n)$ and observe reward $r = (r_1 ... r_n)$ for each agent and new state $s'$}
\If {Queue size is not equal to max-batch-size}
\State {Store $(s, a, r, s')$ in the Queue, which can be accessed by other threads}
\EndIf
\EndFor
\end{algorithmic}
\end{algorithm*}

\begin{algorithm*}
\caption{Distributed Multi-Agent Cooperation Algorithm Based on the Action-Value Function}
\begin{algorithmic}
\State {Initiate brain and n workers}
\State {}
\State $New Graph (abstraction level+1)$
\State $Define a queue$
\State {$Initiate\ queue\ with\ head\ node\ pointer\ of\ graph$}
\State {Initialize the brain and $n$ workers for $n$ agents}
\State $$
\For {$episode = 1$ to $M$}
\State {$Do something$}
\State {$$}
\For {$t = 1$ to max-episode-length}
\State {Get batch data $(s, a, r, s')$ from Queue}
\EndFor
\EndFor
\While {$queue \neq empty$}
\State $node1 \gets queue.pop()$
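To make Algorithm 1 (the per-thread worker) concrete, here is a minimal Python sketch of the collection loop it describes. All names here (env, brain, noise) are hypothetical stand-ins, not code from this commit.

import queue

BATCH_SIZE = 64                          # the fixed memory size mentioned in the text (64 or 128)
shared_queue = queue.Queue(maxsize=BATCH_SIZE)

def worker(env, brain, noise, max_episode_length=100):
    # One episode of experience collection, following Algorithm 1.
    s = env.reset()
    for t in range(max_episode_length):
        # Each agent i selects a_i = pi_i(o_i) + N_t using the brain's current policy plus exploration noise.
        a = [brain.policy(i, o) + noise.sample() for i, o in enumerate(s)]
        s2, r, done, _ = env.step(a)
        if not shared_queue.full():          # queue size below max-batch-size
            shared_queue.put((s, a, r, s2))  # shared with the brain's thread
        s = s2
        if all(done):
            break

Once the queue is full, the workers pause and the brain consumes the batch to update the networks, as described in the architecture subsection above.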
