diff --git a/mpirun_test.py b/mpirun_test.py
index c3a2702..0a744ea 100644
--- a/mpirun_test.py
+++ b/mpirun_test.py
@@ -57,12 +57,14 @@ def test(args):
         exploration_noise.append(OUNoise(mu = np.zeros(action_dim[i])))
     for i in range(n):
         actors[i].mainModel.load_weights(args["modelFolder"] + str(i)+'_weights'+'.h5') #"ep200/" +
+    # s = env.reset()
+    # s = env.reset()
     for ep in range(10):
         s = env.reset()
         reward = 0.0
         for step in range(200):
-            time.sleep(0.01)
             env.render()
+            # time.sleep(10)
             actions = []
             for i in range(env.n):
                 state_input = np.reshape(s[i],(-1,env.observation_space[i].shape[0]))
@@ -101,7 +103,7 @@ def test(args):
     parser.add_argument('--use-gym-monitor', help='record gym results', action='store_true')
     parser.add_argument('--monitor-dir', help='directory for storing gym results', default='./results/videos/video1')
     parser.add_argument('--summary-dir', help='directory for storing tensorboard info', default='./results/2vs1_dis_prioritizedBatch/tfdata/')
-    parser.add_argument('--modelFolder', help='the folder which saved model data', default="./results/2vs1_dis_prioritizedBatch/weights_critic_worker/") #2vs1_dis_prioritizedBatch/weights_critic_worker/ 2vs1_maddpg_tanh/weights_prioritized/
+    parser.add_argument('--modelFolder', help='the folder which saved model data', default="./results/2vs1_dis_prioritizedBatch/weights_maddpg/") #2vs1_dis_prioritizedBatch/weights_critic_worker/ 2vs1_maddpg_tanh/weights_prioritized/
     parser.add_argument('--runTest', help='use saved model to run', default=False)
     parser.add_argument('--work-max-step', help='work_max_step', default=50)
     parser.add_argument('--m-size', help='M size', default=128)
diff --git a/multiagent-envs/multiagent/rendering.py b/multiagent-envs/multiagent/rendering.py
index 234d818..dce480e 100644
--- a/multiagent-envs/multiagent/rendering.py
+++ b/multiagent-envs/multiagent/rendering.py
@@ -236,12 +236,14 @@ def render1(self):
             glVertex3f(p[0], p[1],0)  # draw each vertex
         glEnd()

-        color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5)
-        glColor4f(*color)
-        glBegin(GL_LINE_LOOP)
-        for p in self.v:
-            glVertex3f(p[0], p[1],0)  # draw each vertex
-        glEnd()
+        if len(self.v) != 4 :
+            color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5)
+            glColor4f(*color)
+            glBegin(GL_LINE_LOOP)
+            for p in self.v:
+                glVertex3f(p[0], p[1],0)  # draw each vertex
+            glEnd()
+        # print("render1")


 def make_circle(radius=10, res=30, filled=True):
diff --git a/thesis_related/thesis.tex b/thesis_related/thesis.tex
index 3a884c6..dc231fb 100644
--- a/thesis_related/thesis.tex
+++ b/thesis_related/thesis.tex
@@ -41,7 +41,7 @@
 Research on Moving Target Search has been ongoing in recent years. There is some research on the problem of multiple agents pursuing a moving target, but little of it can be applied to the real world. This research focuses on this problem and proposes a speed-up method for real-time grid environments based on the Cover Heuristic method. We use Map Abstraction to build a map hierarchy, which enables faster map search than the previous method. Refinement is used to refine the highly abstracted route back onto the original map. Finally, evaluation experiments are based on benchmark maps, and the results show the high efficiency of our proposed method.
 }
-\keywords{keyword-1, keyword-2, keyword-3, keyword-4, keyword-5, keyword-6}
+\keywords{Multi-Agent, Reinforcement Learning, Deep Learning, Distributed, Replay Memory}

 \begin{document}

@@ -52,7 +52,7 @@ \section{Introduction}
 \begin{figure}[t]
  \begin{center}
-   \includegraphics[width=5cm]{imgs/adversary_chasing.png}
+   \includegraphics[width=6cm]{imgs/maddpg1.PNG}
   \caption{Predator-Prey(CHANGE)}
   \label{fig:adversaryChasing}
  \end{center}
@@ -63,7 +63,6 @@ \section{Introduction}
 One representative multi-agent task is Predator-Prey\cite{maddpg}, shown in Fig.\ref{fig:adversaryChasing}. In this case, there are 3 predators, 1 prey, and 2 landmarks (obstacles) on the map. The predators move at a slower speed to chase the faster-moving prey. For humans, the cooperative strategy of splitting up and surrounding the prey is easy to understand and learn. Unfortunately, it is difficult for agents to learn. Although traditional reinforcement learning methods such as Q-learning\cite{qlearning} and Policy Gradient\cite{pg} perform well, even better than humans, on Atari games\cite{ddpg}, they perform poorly in the multi-agent domain. The reason the RL methods that succeed in single-agent domains cannot achieve the same results in the multi-agent domain is that, as multiple agents learn simultaneously, the environment becomes non-stationary, which prevents learning from converging.
 \par
-(CHANGE)
 We face two problems in multi-agent tasks: first, traditional RL methods cannot solve them because the environment becomes non-stationary during learning; second, randomly sampling batch data from the experience replay may not be efficient enough for learning. In this work, we first introduce several prior works and related research and explain why they fail in the multi-agent domain. Then we explain our proposed method, a Distributed Multi-Agent Cooperation Algorithm based on the MADDPG algorithm\cite{maddpg} using prioritized batch data, for solving the Predator-Prey task. Experiments show that we achieve xx\% faster training and xx times the reward compared to MADDPG and DDPG.\par

 \section{Background}
@@ -253,7 +252,16 @@ \subsection{Distributed Multi-Agent architecture}
 Second, multiple workers running in parallel with different random seeds are likely to explore different situations of the same environment. Moreover, exploration noise in the different parallel worlds contributes to the diversity of exploration.
 \par
 \subsection{Prioritized Batch Data}
-
+\subsection{Actor-Critic Method}
+\begin{figure}[h]
+ \begin{center}
+  \includegraphics[width=8cm]{imgs/max_loss.PNG}
+  \caption{
+   Batch selection with max loss.
+  }
+  \label{fig:max_loss}
+ \end{center}
+\end{figure}
 Experience replay addresses the following issue: it helps to break temporal correlations by sampling from a large fixed-size memory. What makes batch data good or bad is how much it leads to a better single update. The Temporal-Difference error (TD error) used in DDPG is the difference between the target network's Q value and the evaluation network's Q value. The bigger the TD error, the more useful the update. \par
 To select good batch data for the update, we first sample $M$ transitions (a larger size), from which we plan to select $N$ transitions (a smaller size) for the update. We divide the $M$ sampled transitions into $M/N$ parts of size $N$ and calculate each part's loss. We choose the part with the biggest loss for training. We call these good batch data Prioritized Batch Data.
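A minimal Python sketch of the max-loss batch selection described above, for illustration only: it assumes a generic list of sampled transitions and a caller-supplied TD-error function, so the names select_max_loss_batch and td_error and the dummy data below are placeholders rather than the repository's API, and the loss of a part is taken to be its mean absolute TD error.

import numpy as np

def select_max_loss_batch(transitions, td_error, n_size):
    """Split the M sampled transitions into M/N parts of size N and
    return the part whose mean absolute TD error (loss) is largest."""
    m_size = len(transitions)
    assert m_size % n_size == 0, "M must be a multiple of N"
    parts = [transitions[i:i + n_size] for i in range(0, m_size, n_size)]
    # Loss of a part = mean |TD error| over its transitions.
    losses = [np.mean([abs(td_error(t)) for t in part]) for part in parts]
    return parts[int(np.argmax(losses))]

# Dummy usage: sample M = 128 transitions, keep the best part of N = 32.
rng = np.random.default_rng(0)
dummy_transitions = [{"td": rng.normal()} for _ in range(128)]
best_part = select_max_loss_batch(dummy_transitions, lambda t: t["td"], 32)
print(len(best_part))  # 32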
 \par
@@ -264,8 +272,16 @@ \section{Experiments}
 In this section, we introduce the experiment environment we use and the several experiments we carried out to verify the superiority of our proposed method.
 \subsection{Experiment Environment}
+
 To perform our experiments, we adopt the multiagent-particle-envs used in \cite{maddpg}, which consists of $N$ agents and $L$ landmarks inhabiting a two-dimensional world with continuous observation and action spaces. It provides several types of environments. We focus on multi-agent cooperation for chasing a target, so we adopt the Predator-Prey environment.\par
 In this Predator-Prey environment, $N$ slower cooperating agents try to chase a faster target that can flee from the chasers around a randomly generated environment with $L$ large landmarks serving as obstacles that block the way. Each time the agents collide with the target, the agents are rewarded while the target is penalized. \par
+\begin{figure}[t]
+ \begin{center}
+  \includegraphics[width=6cm]{imgs/maddpg1.PNG}
+  \caption{Predator-Prey(CHANGE)}
+  \label{fig:adversaryChasing}
+ \end{center}
+\end{figure}
 Due to limited computational capability and resources, we add some constraints to the Predator-Prey environment.
 \begin{itemize}
 \item $N$ predators, $N \in [2, 4]$, with random initial positions.