ICM Working
BoogaQ committed Aug 5, 2020
1 parent bbfddac commit 0c08217
Showing 12 changed files with 437 additions and 976 deletions.
111 changes: 48 additions & 63 deletions .ipynb_checkpoints/main-checkpoint.ipynb
@@ -2,7 +2,6 @@
"cells": [
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 1,
"metadata": {},
"outputs": [],
@@ -11,16 +10,7 @@
"warnings.simplefilter(action='ignore', category=FutureWarning)\n",
"\n",
"from models import *\n",
"from ppo import PPO, PPO_RND\n",
=======
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from models import *\n",
"from ppo import PPO\n",
>>>>>>> 28e87e7e8b00547797b0dd0409e45f3b4e11af12
"from monitor import Monitor\n",
"from ppo import PPO, PPO_RND, PPO_ICM\n",
"from buffer import RolloutStorage\n",
"from env import *\n",
"\n",
@@ -30,11 +20,9 @@
"import torch.optim\n",
"import numpy as np\n",
"\n",
<<<<<<< HEAD
"import gym\n",
"from stable_baselines3.common.cmd_util import make_vec_env\n",
"\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
"from stable_baselines3.common.vec_env import vec_normalize\n"
]
},
{
@@ -45,65 +33,68 @@
"source": [
"import pybulletgym\n",
"env = make_vec_env(\"CartPole-v0\", 1)\n",
"env = VecNormalize(env)\n",
"model = PPO(env = env, lr = 0.0003, nstep = 256, batch_size = 256)\n",
"model.learn(total_timesteps = 1e+7, log_interval = 10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env = gym.make(\"InvertedDoublePendulumPyBulletEnv-v0\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env.action_space.shape[0]"
=======
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"\n"
>>>>>>> 28e87e7e8b00547797b0dd0409e45f3b4e11af12
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
<<<<<<< HEAD
"mlp = MlpContinuous(env.observation_space.shape[0], env.action_space.shape[0])"
=======
"env = make_env(\"BreakoutNoFrameskip-v4\", 10)"
>>>>>>> 28e87e7e8b00547797b0dd0409e45f3b4e11af12
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
<<<<<<< HEAD
"obs = torch.Tensor(env.reset())\n",
"mlp.act(obs)"
"import pybulletgym\n",
"env = make_vec_env(\"MountainCar-v0\", 4, vec_env_cls = SubprocVecEnv)\n",
"#env = VecNormalize(env)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"---------------------------------------\n",
"| Progress | 0.05% |\n",
"| rollout/ | |\n",
"| ep_len_mean | 200 |\n",
"| ep_rew_mean | -200 |\n",
"| num_episodes | 24 |\n",
"| time/ | |\n",
"| total timesteps | 5120 |\n",
"| total_time | 3.43 |\n",
"| train/ | |\n",
"| entropy_loss | -1.09 |\n",
"| icm_loss | 2.05 |\n",
"| policy_gradient_loss | -9.72e-07 |\n",
"| total_loss | 51.5 |\n",
"| value_loss | 49.5 |\n",
"---------------------------------------\n",
"--------------------------------------\n",
"| Progress | 0.1% |\n",
"| rollout/ | |\n",
"| ep_len_mean | 200 |\n",
"| ep_rew_mean | -200 |\n",
"| num_episodes | 48 |\n",
"| time/ | |\n",
"| total timesteps | 10240 |\n",
"| total_time | 6.97 |\n",
"| train/ | |\n",
"| entropy_loss | -1.09 |\n",
"| icm_loss | 2.07 |\n",
"| policy_gradient_loss | 0.000166 |\n",
"| total_loss | 41.3 |\n",
"| value_loss | 39.2 |\n",
"--------------------------------------\n"
]
}
],
"source": [
"import pybulletgym\n",
"env = make_vec_env(\"MountainCar-v0\", 4, vec_env_cls = SubprocVecEnv)\n",
"model = PPO_RND(env = env, lr = 0.03, nstep = 64, batch_size = 64, max_grad_norm = 0.1, hidden_size = 16)\n",
"model = PPO_ICM(env = env, lr = 0.0003, nstep = 128, batch_size = 128, max_grad_norm = 0.5, hidden_size = 32, icm_hidden_size = 32)\n",
"model.learn(total_timesteps = 1e+7, log_interval = 10)"
]
},
@@ -113,12 +104,6 @@
"metadata": {},
"outputs": [],
"source": []
=======
"\n",
"algorithm = PPO(env = env)\n",
"algorithm.learn(1e+06, log_interval = 10)"
]
>>>>>>> 28e87e7e8b00547797b0dd0409e45f3b4e11af12
}
],
"metadata": {
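Note: the notebook cell above trains PPO_ICM and logs an icm_loss, but the ICM module itself is not part of this diff. As a rough, non-authoritative sketch of what an Intrinsic Curiosity Module (Pathak et al., 2017) typically computes — class name, layer sizes, and method names below are assumptions, not the repo's API:

# Illustrative only: a minimal Intrinsic Curiosity Module.
# The repo's actual models.py is not shown in this diff; all names here are assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ICM(nn.Module):
    def __init__(self, obs_dim, n_actions, hidden_size=32):
        super().__init__()
        # Shared feature encoder phi(s)
        self.encoder = nn.Sequential(nn.Linear(obs_dim, hidden_size), nn.ReLU())
        # Inverse model: predicts the action from (phi(s), phi(s'))
        self.inverse = nn.Linear(2 * hidden_size, n_actions)
        # Forward model: predicts phi(s') from (phi(s), one-hot action)
        self.forward_model = nn.Linear(hidden_size + n_actions, hidden_size)

    def forward(self, obs, next_obs, action):
        phi, phi_next = self.encoder(obs), self.encoder(next_obs)
        action_onehot = F.one_hot(action.long(), self.inverse.out_features).float()
        phi_next_pred = self.forward_model(torch.cat([phi, action_onehot], dim=-1))
        action_logits = self.inverse(torch.cat([phi, phi_next], dim=-1))
        # Intrinsic reward: forward-model prediction error, detached so it is treated as a reward signal
        intrinsic_reward = 0.5 * (phi_next_pred - phi_next.detach()).pow(2).sum(dim=-1)
        forward_loss = intrinsic_reward.mean()
        inverse_loss = F.cross_entropy(action_logits, action.long())
        return intrinsic_reward.detach(), forward_loss + inverse_loss

In this commit, PPO_ICM presumably feeds such a prediction-error bonus into RolloutStorage via the new int_reward argument shown in the buffer.py diff below.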
43 changes: 0 additions & 43 deletions Untitled.ipynb

This file was deleted.

Binary file modified __pycache__/buffer.cpython-37.pyc
Binary file not shown.
Binary file modified __pycache__/env.cpython-37.pyc
Binary file not shown.
Binary file modified __pycache__/models.cpython-37.pyc
Binary file not shown.
Binary file modified __pycache__/ppo.cpython-37.pyc
Binary file not shown.
Binary file modified __pycache__/util.cpython-37.pyc
Binary file not shown.
40 changes: 38 additions & 2 deletions buffer.py
@@ -125,7 +125,7 @@ def __init__(self, buffer_size, n_envs, obs_space, action_space, gae_lam = 0.95,
self.gae_lam = gae_lam
self.gamma = gamma

self.observations, self.actions, self.rewards, self.values = None, None, None, None
self.observations, self.actions, self.rewards, self.values, self.int_rewards = None, None, None, None, None
self.returns, self.action_log_probs, self.masks, self.advantages = None, None, None, None
self.generator_ready = False

@@ -140,6 +140,7 @@ def reset(self):
self.observations = np.zeros((self.buffer_size, self.n_envs, *self.obs_shape), dtype = 'float32')
self.actions = np.zeros((self.buffer_size, self.n_envs))
self.rewards = np.zeros((self.buffer_size, self.n_envs), dtype = 'float32')
self.int_rewards = np.zeros((self.buffer_size, self.n_envs), dtype = 'float32')
self.values = np.zeros((self.buffer_size, self.n_envs), dtype = 'float32')
self.returns = np.zeros((self.buffer_size, self.n_envs), dtype = 'float32')
self.action_log_probs = np.zeros((self.buffer_size, self.n_envs), dtype = 'float32')
@@ -148,7 +149,7 @@ def reset(self):
self.generator_ready = False
super(RolloutStorage, self).reset()

def add(self, obs, action, reward, value, mask, log_prob):
def add(self, obs, action, reward, value, mask, log_prob, int_reward = 0):
"""
:param obs: (np.Tensor) Observation
:param action: (np.Tensor) Action
@@ -161,6 +162,7 @@ def add(self, obs, action, reward, value, mask, log_prob):
self.observations[self.pos] = np.array(obs).copy()
self.actions[self.pos] = np.array(action).copy()
self.rewards[self.pos] = np.array(reward).copy()
self.int_rewards[self.pos] = np.array(int_reward).copy()
self.masks[self.pos] = np.array(mask).copy()
self.values[self.pos] = value.clone().cpu().numpy()
self.action_log_probs[self.pos] = log_prob.clone().cpu().numpy()
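The extended signature above now accepts an intrinsic bonus per step. The rollout loop that calls it is not shown in this diff, so the following is only a hedged illustration of how a caller could pass the bonus through (storage and icm_bonus are placeholder names, not the repo's API):

# Hypothetical collection step -- only the storage.add call reflects the diff above.
int_reward = icm_bonus(obs, next_obs, action)  # placeholder for whatever curiosity signal is used
storage.add(obs, action, reward, value, mask, log_prob, int_reward=int_reward)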
@@ -187,6 +189,40 @@ def compute_returns_and_advantages(self, last_value, dones):
last_value = last_value.clone().cpu().numpy().flatten()
last_gae_lam = 0

self.int_rewards = (self.int_rewards - np.mean(self.int_rewards)) / (np.std(self.int_rewards) + 1e-8)

# Normalizing the rewards:
#self.rewards = (self.rewards - np.mean(self.rewards)) / (np.std(self.rewards) + 1e-5)

for step in reversed(range(self.buffer_size)):
if step == self.buffer_size - 1:
next_non_terminal = 1.0 - dones
next_value = last_value
else:
next_non_terminal = 1.0 - self.masks[step + 1]
next_value = self.values[step + 1]
delta = self.rewards[step] + self.int_rewards[step] + self.gamma * next_value * next_non_terminal - self.values[step]
last_gae_lam = delta + self.gamma * self.gae_lam * next_non_terminal * last_gae_lam
self.advantages[step] = last_gae_lam
self.returns = self.advantages + self.values

def compute_intrinsicreturns_and_advantages(self, last_value, dones, int_rewards):
"""
Post-processing step: compute the returns (sum of discounted rewards)
and GAE advantage.
Adapted from Stable-Baselines PPO2.
Uses Generalized Advantage Estimation (https://arxiv.org/abs/1506.02438)
to compute the advantage. To obtain vanilla advantage (A(s) = R - V(S))
where R is the discounted reward with value bootstrap,
set ``gae_lambda=1.0`` during initialization.
:param last_value: (th.Tensor)
:param dones: (np.ndarray)
"""

last_value = last_value.clone().cpu().numpy().flatten()
last_gae_lam = 0

# Normalizing the rewards:
#self.rewards = (self.rewards - np.mean(self.rewards)) / (np.std(self.rewards) + 1e-5)

