ICM Working
BoogaQ committed Aug 5, 2020
1 parent bbfddac commit 0c08217
Showing 12 changed files with 437 additions and 976 deletions.
111 changes: 48 additions & 63 deletions .ipynb_checkpoints/main-checkpoint.ipynb
@@ -2,7 +2,6 @@
"cells": [
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 1,
"metadata": {},
"outputs": [],
@@ -11,16 +10,7 @@
"warnings.simplefilter(action='ignore', category=FutureWarning)\n",
"\n",
"from models import *\n",
"from ppo import PPO, PPO_RND\n",
=======
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from models import *\n",
"from ppo import PPO\n",
>>>>>>> 28e87e7e8b00547797b0dd0409e45f3b4e11af12
"from monitor import Monitor\n",
"from ppo import PPO, PPO_RND, PPO_ICM\n",
"from buffer import RolloutStorage\n",
"from env import *\n",
"\n",
@@ -30,11 +20,9 @@
"import torch.optim\n",
"import numpy as np\n",
"\n",
<<<<<<< HEAD
"import gym\n",
"from stable_baselines3.common.cmd_util import make_vec_env\n",
"\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
"from stable_baselines3.common.vec_env import vec_normalize\n"
]
},
{
@@ -45,65 +33,68 @@
"source": [
"import pybulletgym\n",
"env = make_vec_env(\"CartPole-v0\", 1)\n",
"env = VecNormalize(env)\n",
"model = PPO(env = env, lr = 0.0003, nstep = 256, batch_size = 256)\n",
"model.learn(total_timesteps = 1e+7, log_interval = 10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env = gym.make(\"InvertedDoublePendulumPyBulletEnv-v0\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env.action_space.shape[0]"
=======
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"\n"
>>>>>>> 28e87e7e8b00547797b0dd0409e45f3b4e11af12
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
<<<<<<< HEAD
"mlp = MlpContinuous(env.observation_space.shape[0], env.action_space.shape[0])"
=======
"env = make_env(\"BreakoutNoFrameskip-v4\", 10)"
>>>>>>> 28e87e7e8b00547797b0dd0409e45f3b4e11af12
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
<<<<<<< HEAD
"obs = torch.Tensor(env.reset())\n",
"mlp.act(obs)"
"import pybulletgym\n",
"env = make_vec_env(\"MountainCar-v0\", 4, vec_env_cls = SubprocVecEnv)\n",
"#env = VecNormalize(env)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"---------------------------------------\n",
"| Progress | 0.05% |\n",
"| rollout/ | |\n",
"| ep_len_mean | 200 |\n",
"| ep_rew_mean | -200 |\n",
"| num_episodes | 24 |\n",
"| time/ | |\n",
"| total timesteps | 5120 |\n",
"| total_time | 3.43 |\n",
"| train/ | |\n",
"| entropy_loss | -1.09 |\n",
"| icm_loss | 2.05 |\n",
"| policy_gradient_loss | -9.72e-07 |\n",
"| total_loss | 51.5 |\n",
"| value_loss | 49.5 |\n",
"---------------------------------------\n",
"--------------------------------------\n",
"| Progress | 0.1% |\n",
"| rollout/ | |\n",
"| ep_len_mean | 200 |\n",
"| ep_rew_mean | -200 |\n",
"| num_episodes | 48 |\n",
"| time/ | |\n",
"| total timesteps | 10240 |\n",
"| total_time | 6.97 |\n",
"| train/ | |\n",
"| entropy_loss | -1.09 |\n",
"| icm_loss | 2.07 |\n",
"| policy_gradient_loss | 0.000166 |\n",
"| total_loss | 41.3 |\n",
"| value_loss | 39.2 |\n",
"--------------------------------------\n"
]
}
],
"source": [
"import pybulletgym\n",
"env = make_vec_env(\"MountainCar-v0\", 4, vec_env_cls = SubprocVecEnv)\n",
"model = PPO_RND(env = env, lr = 0.03, nstep = 64, batch_size = 64, max_grad_norm = 0.1, hidden_size = 16)\n",
"model = PPO_ICM(env = env, lr = 0.0003, nstep = 128, batch_size = 128, max_grad_norm = 0.5, hidden_size = 32, icm_hidden_size = 32)\n",
"model.learn(total_timesteps = 1e+7, log_interval = 10)"
]
},
@@ -113,12 +104,6 @@
"metadata": {},
"outputs": [],
"source": []
=======
"\n",
"algorithm = PPO(env = env)\n",
"algorithm.learn(1e+06, log_interval = 10)"
]
>>>>>>> 28e87e7e8b00547797b0dd0409e45f3b4e11af12
}
],
"metadata": {
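Note: the notebook cell above trains PPO_ICM and logs an icm_loss, but the ICM module itself is not part of this diff. As a rough, non-authoritative sketch of what an Intrinsic Curiosity Module (Pathak et al., 2017) typically computes — class name, layer sizes, and method names below are assumptions, not the repo's API:

# Illustrative only: a minimal Intrinsic Curiosity Module.
# The repo's actual models.py is not shown in this diff; all names here are assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ICM(nn.Module):
    def __init__(self, obs_dim, n_actions, hidden_size=32):
        super().__init__()
        # Shared feature encoder phi(s)
        self.encoder = nn.Sequential(nn.Linear(obs_dim, hidden_size), nn.ReLU())
        # Inverse model: predicts the action from (phi(s), phi(s'))
        self.inverse = nn.Linear(2 * hidden_size, n_actions)
        # Forward model: predicts phi(s') from (phi(s), one-hot action)
        self.forward_model = nn.Linear(hidden_size + n_actions, hidden_size)

    def forward(self, obs, next_obs, action):
        phi, phi_next = self.encoder(obs), self.encoder(next_obs)
        action_onehot = F.one_hot(action.long(), self.inverse.out_features).float()
        phi_next_pred = self.forward_model(torch.cat([phi, action_onehot], dim=-1))
        action_logits = self.inverse(torch.cat([phi, phi_next], dim=-1))
        # Intrinsic reward: forward-model prediction error, detached so it is treated as a reward signal
        intrinsic_reward = 0.5 * (phi_next_pred - phi_next.detach()).pow(2).sum(dim=-1)
        forward_loss = intrinsic_reward.mean()
        inverse_loss = F.cross_entropy(action_logits, action.long())
        return intrinsic_reward.detach(), forward_loss + inverse_loss

In this commit, PPO_ICM presumably feeds such a prediction-error bonus into RolloutStorage via the new int_reward argument shown in the buffer.py diff below.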
43 changes: 0 additions & 43 deletions Untitled.ipynb

This file was deleted.

Binary file modified __pycache__/buffer.cpython-37.pyc
Binary file not shown.
Binary file modified __pycache__/env.cpython-37.pyc
Binary file not shown.
Binary file modified __pycache__/models.cpython-37.pyc
Binary file not shown.
Binary file modified __pycache__/ppo.cpython-37.pyc
Binary file not shown.
Binary file modified __pycache__/util.cpython-37.pyc
Binary file not shown.
40 changes: 38 additions & 2 deletions buffer.py
@@ -125,7 +125,7 @@ def __init__(self, buffer_size, n_envs, obs_space, action_space, gae_lam = 0.95,
self.gae_lam = gae_lam
self.gamma = gamma

self.observations, self.actions, self.rewards, self.values = None, None, None, None
self.observations, self.actions, self.rewards, self.values, self.int_rewards = None, None, None, None, None
self.returns, self.action_log_probs, self.masks, self.advantages = None, None, None, None
self.generator_ready = False

@@ -140,6 +140,7 @@ def reset(self):
self.observations = np.zeros((self.buffer_size, self.n_envs, *self.obs_shape), dtype = 'float32')
self.actions = np.zeros((self.buffer_size, self.n_envs))
self.rewards = np.zeros((self.buffer_size, self.n_envs), dtype = 'float32')
self.int_rewards = np.zeros((self.buffer_size, self.n_envs), dtype = 'float32')
self.values = np.zeros((self.buffer_size, self.n_envs), dtype = 'float32')
self.returns = np.zeros((self.buffer_size, self.n_envs), dtype = 'float32')
self.action_log_probs = np.zeros((self.buffer_size, self.n_envs), dtype = 'float32')
@@ -148,7 +149,7 @@ def reset(self):
self.generator_ready = False
super(RolloutStorage, self).reset()

def add(self, obs, action, reward, value, mask, log_prob):
def add(self, obs, action, reward, value, mask, log_prob, int_reward = 0):
"""
:param obs: (np.Tensor) Observation
:param action: (np.Tensor) Action
@@ -161,6 +162,7 @@ def add(self, obs, action, reward, value, mask, log_prob):
self.observations[self.pos] = np.array(obs).copy()
self.actions[self.pos] = np.array(action).copy()
self.rewards[self.pos] = np.array(reward).copy()
self.int_rewards[self.pos] = np.array(int_reward).copy()
self.masks[self.pos] = np.array(mask).copy()
self.values[self.pos] = value.clone().cpu().numpy()
self.action_log_probs[self.pos] = log_prob.clone().cpu().numpy()
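The extended signature above now accepts an intrinsic bonus per step. The rollout loop that calls it is not shown in this diff, so the following is only a hedged illustration of how a caller could pass the bonus through (storage and icm_bonus are placeholder names, not the repo's API):

# Hypothetical collection step -- only the storage.add call reflects the diff above.
int_reward = icm_bonus(obs, next_obs, action)  # placeholder for whatever curiosity signal is used
storage.add(obs, action, reward, value, mask, log_prob, int_reward=int_reward)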
@@ -187,6 +189,40 @@ def compute_returns_and_advantages(self, last_value, dones):
last_value = last_value.clone().cpu().numpy().flatten()
last_gae_lam = 0

self.int_rewards = (self.int_rewards - np.mean(self.int_rewards)) / (np.std(self.int_rewards) + 1e-8)

# Normalizing the rewards:
#self.rewards = (self.rewards - np.mean(self.rewards)) / (np.std(self.rewards) + 1e-5)

for step in reversed(range(self.buffer_size)):
if step == self.buffer_size - 1:
next_non_terminal = 1.0 - dones
next_value = last_value
else:
next_non_terminal = 1.0 - self.masks[step + 1]
next_value = self.values[step + 1]
delta = self.rewards[step] + self.int_rewards[step] + self.gamma * next_value * next_non_terminal - self.values[step]
last_gae_lam = delta + self.gamma * self.gae_lam * next_non_terminal * last_gae_lam
self.advantages[step] = last_gae_lam
self.returns = self.advantages + self.values

def compute_intrinsicreturns_and_advantages(self, last_value, dones, int_rewards):
"""
Post-processing step: compute the returns (sum of discounted rewards)
and GAE advantage.
Adapted from Stable-Baselines PPO2.
Uses Generalized Advantage Estimation (https://arxiv.org/abs/1506.02438)
to compute the advantage. To obtain vanilla advantage (A(s) = R - V(S))
where R is the discounted reward with value bootstrap,
set ``gae_lambda=1.0`` during initialization.
:param last_value: (th.Tensor)
:param dones: (np.ndarray)
"""

last_value = last_value.clone().cpu().numpy().flatten()
last_gae_lam = 0

# Normalizing the rewards:
#self.rewards = (self.rewards - np.mean(self.rewards)) / (np.std(self.rewards) + 1e-5)

