From c694068fca54daf2a83110560334762968baa2a7 Mon Sep 17 00:00:00 2001 From: saleml Date: Thu, 30 Aug 2018 05:51:47 -0400 Subject: [PATCH] Auxiliary supervised losses (#159) * fix issue with loading demos when making demos * reindentation + small bug fix * - Added head for extra binary information that can be used for an auxiliary supervised loss - PEP8 reformatting * Line too long - changed * Modified how experiences are collected when there are extra binary information to be used from the environment * A few comments that were super helpful to me. I guess this commit doesn't need to be in the PR * Taking into account the extra binary information to evaluate a supervised loss * Small modifications to the model to output extra_logits (no need for sigmoid layer) * - Added the possibility of specifying how many extra binary information to use from the environment (they have to specified in the `info` part of the gym step function). - Logging of the corresponding supervised loss and accuracy * clearer help for an argparse argument * The environment yields, at each step, if the new state is already visited or not * supervised loss coeff can be a float, and not necessarily an int * fix a bug at evaluation time due to extra outputs of the model when there is an auxiliary loss * quick hack to show the supervised loss coefficient in the model name for easier comparison * - Reseeding after initializing the model to make sure to get consistent results. - This commit doesn't need to be in the PR. * typo * Defining the extra info head after the actor and the critic so that the initialization process makes the results consistent between when we're not using extra info and when we're using it with a supervised loss coef of 0. * Log total loss * small bug fix * typo * added logging of prevalence in the supervised auxiliary task for debugging/understanding * default extra binary info to False for retro compatibility * added more binary info * - fixed bug in enjoy - made enjoy and evaluate compatible with the extra binary info setting * - reuse previous deleted normalization of weights - define as many extra_heads as passed to the model through a dictionary - define an extra_predictions dictionary to be returned - always return a dictionary in the forward model to avoid too many conditionals in scripts that use the model * use extra-info as a list argument containing the names of the extra info wanted from the environment * Because acmodel.forward returns a dictionary. This means that at each call, we should change the containing variable * return extra information at each step * - collect the experiments the right way - update the parameters in the presence of extra info for supervised aux tasks - adequate logging - change of model * change-list * Use ModuleDict instead of dict. !! REQUIRES pytorch 0.4.1 !! 
* Add a new aux loss - requires a small change in minigrid * reintroduce the prevalences * stop using numbers to check for presence of objects in observation * factorization * docstring * removed unnecessary argument from ModelAgent * fix bug introduced in bf10a286a89f8e15ee46d7cb7d41f374601dda28 * fix logging issue * - add a conditional in evaluation of grad norm because sometimes we use different model origins - add the option of using a pre-trained model and have the fine-tuned version of it saved elsewhere (otherwise, one cannot use the same pre-trained model to finetune 2 different models in parallel) * allows using extra heads even for pre-trained models * fix small bug with 'continuous' type * fix small bug with 'continuous' type * change comment - again, doesn't have to be merged * use a new class instead of overloading the ppo file * model refactorization * more refactorization * update requirement of pytorch version * add some comments for the classes introduced * bug fix * - Use a wrapper for supervised auxiliary losses - added binary info: does the agent think they did the same action as would the bot - bugfix in rl/utils/supervised_losses * comment on function * move function to wrapper * - Use different wrappers for each auxiliary task - Rename extra info to aux info * rename extra info to aux info * revert file commited by mistake * remove float() when dealing with binary information * rename args --- README.md | 2 +- babyai/algos/imitation.py | 8 +- babyai/levels/supervised_losses.py | 195 +++++++++++++++++++++++++++ babyai/model.py | 85 +++++++++--- babyai/rl/algos/a2c.py | 5 +- babyai/rl/algos/base.py | 31 ++++- babyai/rl/algos/ppo.py | 60 ++++++++- babyai/rl/utils/supervised_losses.py | 177 ++++++++++++++++++++++++ babyai/utils/agent.py | 5 +- environment.yaml | 2 +- scripts/enjoy.py | 1 - scripts/make_agent_demos.py | 4 +- scripts/train_rl.py | 50 ++++++- setup.py | 2 +- 14 files changed, 585 insertions(+), 42 deletions(-) create mode 100644 babyai/levels/supervised_losses.py create mode 100644 babyai/rl/utils/supervised_losses.py diff --git a/README.md b/README.md index 773b32c0..7737dc5b 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Requirements: - OpenAI gym - NumPy - PyQT5 -- PyTorch 0.4+ +- PyTorch 0.4.1+ Start by manually installing PyTorch. See the [PyTorch website](http://pytorch.org/) for installation instructions specific to your platform. 
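Several hunks below follow from the change described in the commit list above: ACModel.forward now returns a dictionary instead of a (dist, value, memory) tuple, so every caller unpacks only the keys it needs. A minimal sketch of the new calling convention (the stub below is illustrative only and is not code from this patch):

import torch
from torch.distributions.categorical import Categorical

def fake_forward(obs, memory):
    # Stand-in with the same return contract as the patched ACModel.forward
    logits = torch.zeros(obs.shape[0], 7)
    return {'dist': Categorical(logits=logits),
            'value': torch.zeros(obs.shape[0]),
            'memory': memory,
            'extra_predictions': {}}  # filled by the aux-info heads when they are enabled

obs, memory = torch.zeros(4, 3), torch.zeros(4, 256)
model_results = fake_forward(obs, memory)
dist, value, memory = model_results['dist'], model_results['value'], model_results['memory']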
diff --git a/babyai/algos/imitation.py b/babyai/algos/imitation.py
index 2230b328..a1ef3fe7 100644
--- a/babyai/algos/imitation.py
+++ b/babyai/algos/imitation.py
@@ -160,7 +160,7 @@ def run_epoch_recurrence_one_batch(self, batch, is_training=False):
             preprocessed_obs = self.obss_preprocessor(obs, device=self.device)
             with torch.no_grad():
                 # taking the memory till the length of time_step_inds, as demos beyond that have already finished
-                _, _, new_memory = self.acmodel(preprocessed_obs, memory[:len(inds), :])
+                new_memory = self.acmodel(preprocessed_obs, memory[:len(inds), :])['memory']
 
             for i in range(len(inds)):
                 # Copying to the memories at the corresponding locations
@@ -189,7 +189,11 @@ def run_epoch_recurrence_one_batch(self, batch, is_training=False):
             preprocessed_obs = self.obss_preprocessor(obs, device=self.device)
             action_step = action_true[indexes]
             mask_step = mask[indexes]
-            dist, value, memory = self.acmodel(preprocessed_obs, memory * mask_step)
+            model_results = self.acmodel(preprocessed_obs, memory * mask_step)
+            dist = model_results['dist']
+            value = model_results['value']
+            memory = model_results['memory']
+
             entropy = dist.entropy().mean()
             policy_loss = -dist.log_prob(action_step).mean()
             loss = policy_loss - self.args.entropy_coef * entropy
diff --git a/babyai/levels/supervised_losses.py b/babyai/levels/supervised_losses.py
new file mode 100644
index 00000000..b7fc19c6
--- /dev/null
+++ b/babyai/levels/supervised_losses.py
@@ -0,0 +1,195 @@
+import gym
+from babyai.agents.bot import Bot
+from gym_minigrid.minigrid import OBJECT_TO_IDX, Grid
+from .verifier import *
+
+
+def wrap_env(env, aux_info):
+    '''
+    Helper function that calls the defined wrappers depending on what information is required
+    '''
+    if 'seen_state' in aux_info:
+        env = SeenStateWrapper(env)
+    if 'visit_proportion' in aux_info:
+        env = VisitProportionWrapper(env)
+    if 'see_door' in aux_info:
+        env = SeeDoorWrapper(env)
+    if 'see_obj' in aux_info:
+        env = SeeObjWrapper(env)
+    if 'in_front_of_what' in aux_info:
+        env = InForntOfWhatWrapper(env)
+    if 'obj_in_instr' in aux_info:
+        env = ObjInInstrWrapper(env)
+    if 'bot_action' in aux_info:
+        env = BotActionWrapper(env)
+    return env
+
+
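# Illustrative usage sketch -- not part of the patch. The wrappers defined in this file
# expose auxiliary targets through the info dict returned by step(); the environment id
# and the registering import below are assumptions made for the example only.
import gym
import babyai.levels  # assumed to register the BabyAI-* environment ids
from babyai.levels.supervised_losses import wrap_env

env = wrap_env(gym.make('BabyAI-GoToRedBall-v0'), ['seen_state', 'bot_action'])
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
print(info['seen_state'], info['bot_action'])  # e.g. False True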
+class SeenStateWrapper(gym.Wrapper):
+    '''
+    Wrapper that adds an entry to the info dict of the step function's output that corresponds to whether
+    the new state has already been visited or not
+    '''
+
+    def reset(self, **kwargs):
+        obs = self.env.reset(**kwargs)
+
+        # Define a set of seen states. A state is represented by a tuple ((x, y), direction)
+        self.seen_states = set()
+
+        # Append the current state to the seen states
+        # The state is defined in the reset function of the MiniGridEnv class
+        self.seen_states.add((tuple(self.env.unwrapped.agent_pos), self.env.unwrapped.agent_dir))
+
+        return obs
+
+    def step(self, action):
+        obs, reward, done, info = self.env.step(action)
+
+        if (tuple(self.env.unwrapped.agent_pos), self.env.unwrapped.agent_dir) in self.seen_states:
+            seen_state = True
+        else:
+            self.seen_states.add((tuple(self.env.unwrapped.agent_pos), self.env.unwrapped.agent_dir))
+            seen_state = False
+
+        info['seen_state'] = seen_state
+
+        return obs, reward, done, info
+
+
+class VisitProportionWrapper(gym.Wrapper):
+    '''
+    Wrapper that adds an entry to the info dict of the step function's output that corresponds to the number of
+    times the new state has been visited before, divided by the total number of steps
+    '''
+
+    def reset(self, **kwargs):
+        obs = self.env.reset(**kwargs)
+
+        # Define a dict of seen states and number of times seen. A state is represented by a tuple ((x, y), direction)
+        self.seen_states_dict = dict()
+
+        # Append the current state to the seen states
+        # The state is defined in the reset function of the MiniGridEnv class
+        self.seen_states_dict[(tuple(self.env.unwrapped.agent_pos), self.env.unwrapped.agent_dir)] = 1
+
+        # Instantiate a counter of total steps
+        self.total_steps = 0
+
+        return obs
+
+    def step(self, action):
+        obs, reward, done, info = self.env.step(action)
+
+        self.total_steps += 1
+        if (tuple(self.env.unwrapped.agent_pos), self.env.unwrapped.agent_dir) in self.seen_states_dict:
+            self.seen_states_dict[(tuple(self.env.unwrapped.agent_pos), self.env.unwrapped.agent_dir)] += 1
+        else:
+            self.seen_states_dict[(tuple(self.env.unwrapped.agent_pos), self.env.unwrapped.agent_dir)] = 1
+
+        info['visit_proportion'] = ((self.seen_states_dict[(tuple(self.env.unwrapped.agent_pos),
+                                                            self.env.unwrapped.agent_dir)]
+                                     - 1) / self.total_steps)
+
+        return obs, reward, done, info
+
+
+class SeeDoorWrapper(gym.Wrapper):
+    '''
+    Wrapper that adds an entry to the info dict of the step function's output that corresponds to whether
+    the current observation contains a door, locked or not
+    '''
+
+    def reset(self, **kwargs):
+        obs = self.env.reset(**kwargs)
+        return obs
+
+    def step(self, action):
+        obs, reward, done, info = self.env.step(action)
+        info['see_door'] = (None, 'door') in Grid.decode(obs['image'])
+        return obs, reward, done, info
+
+
+class SeeObjWrapper(gym.Wrapper):
+    '''
+    Wrapper that adds an entry to the info dict of the step function's output that corresponds to whether
+    the current observation contains a key, ball, or box
+    '''
+
+    def reset(self, **kwargs):
+        obs = self.env.reset(**kwargs)
+        return obs
+
+    def step(self, action):
+        obs, reward, done, info = self.env.step(action)
+        info['see_obj'] = any([obj in Grid.decode(obs['image']) for obj in
+                               ((None, 'key'), (None, 'ball'), (None, 'box'))
+                               ])
+        return obs, reward, done, info
+
+
+class InForntOfWhatWrapper(gym.Wrapper):
+    '''
+    Wrapper that adds an entry to the info dict of the step function's output that corresponds to which of
+    empty cell/wall/door/key/box/ball is in the cell right in front of the agent
+    '''
+
+    def reset(self, **kwargs):
+        obs = self.env.reset(**kwargs)
+        return obs
+
+    def step(self, action):
+        obs, reward, done, info = self.env.step(action)
+        cell_in_front = self.env.unwrapped.grid.get(*self.env.unwrapped.front_pos)
+        info['in_front_of_what'] =
OBJECT_TO_IDX[cell_in_front.type] if cell_in_front else 0 # int 0--8 + return obs, reward, done, info + + +class ObjInInstrWrapper(gym.Wrapper): + ''' + Wrapper that adds an entry to the info dic of the step function's output that corresponds to whether an object + described in the instruction appears in the current observation + ''' + + def reset(self, **kwargs): + obs = self.env.reset(**kwargs) + return obs + + def obj_in_mission(self, instr): + if isinstance(instr, PutNextInstr): + return [(instr.desc_fixed.color, instr.desc_fixed.type), + (instr.desc_move.color, instr.desc_move.type)] + if isinstance(instr, SeqInstr): + return self.obj_in_mission(instr.instr_a) + self.obj_in_mission(instr.instr_b) + else: + return [(instr.desc.color, instr.desc.type)] + + def step(self, action): + obs, reward, done, info = self.env.step(action) + info['obj_in_instr'] = any([obj in Grid.decode(obs['image']) + for obj in self.obj_in_mission(self.env.unwrapped.instrs)]) + return obs, reward, done, info + + +class BotActionWrapper(gym.Wrapper): + ''' + Wrapper that adds an entry to the info dic of the step function's output that corresponds to whether + the action taken corresponds to the action the GOFAI bot would have taken + ''' + + def reset(self, **kwargs): + obs = self.env.reset(**kwargs) + self.expert = Bot(self.env.unwrapped) + return obs + + def step(self, action): + obs, reward, done, info = self.env.step(action) + + try: + expert_action = self.expert.step() + except: + expert_action = None + + info['bot_action'] = action == expert_action + + return obs, reward, done, info diff --git a/babyai/model.py b/babyai/model.py index 92fc39df..d31ac793 100644 --- a/babyai/model.py +++ b/babyai/model.py @@ -5,6 +5,8 @@ from torch.distributions.categorical import Categorical from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence import babyai.rl +from babyai.rl.utils.supervised_losses import required_heads + # Function from https://github.com/ikostrikov/pytorch-a2c-ppo-acktr/blob/master/model.py def initialize_parameters(m): @@ -15,15 +17,16 @@ def initialize_parameters(m): if m.bias is not None: m.bias.data.fill_(0) + # Inspired by FiLMedBlock from https://arxiv.org/abs/1709.07871 class AgentControllerFiLM(nn.Module): def __init__(self, in_features, out_features, in_channels, imm_channels): super().__init__() self.conv = nn.Sequential( - nn.Conv2d(in_channels=in_channels, out_channels=imm_channels, kernel_size=(1,1)), + nn.Conv2d(in_channels=in_channels, out_channels=imm_channels, kernel_size=(1, 1)), nn.ReLU(), - nn.Conv2d(in_channels=imm_channels, out_channels=64, kernel_size=(1,1)), + nn.Conv2d(in_channels=imm_channels, out_channels=64, kernel_size=(1, 1)), nn.ReLU() ) self.weight = nn.Linear(in_features, out_features) @@ -34,12 +37,13 @@ def __init__(self, in_features, out_features, in_channels, imm_channels): def forward(self, x, y): return self.conv(x) * self.weight(y).unsqueeze(2).unsqueeze(3) + self.bias(y).unsqueeze(2).unsqueeze(3) + class ExpertControllerFiLM(nn.Module): def __init__(self, in_features, out_features, in_channels, imm_channels): super().__init__() - self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=imm_channels, kernel_size=(3,3), padding=1) + self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=imm_channels, kernel_size=(3, 3), padding=1) self.bn1 = nn.BatchNorm2d(imm_channels) - self.conv2 = nn.Conv2d(in_channels=imm_channels, out_channels=out_features, kernel_size=(3,3), padding=1) + self.conv2 = nn.Conv2d(in_channels=imm_channels, 
out_channels=out_features, kernel_size=(3, 3), padding=1) self.bn2 = nn.BatchNorm2d(out_features) self.weight = nn.Linear(in_features, out_features) @@ -55,6 +59,7 @@ def forward(self, x, y): out = F.relu(out) return out + class ImageBOWEmbedding(nn.Module): def __init__(self, num_embeddings, embedding_dim, padding_idx=None, reduce_fn=torch.mean): super(ImageBOWEmbedding, self).__init__() @@ -70,9 +75,11 @@ def forward(self, inputs): embeddings = torch.transpose(torch.transpose(embeddings, 1, 3), 2, 3) return embeddings + class ACModel(nn.Module, babyai.rl.RecurrentACModel): def __init__(self, obs_space, action_space, - image_dim=128, memory_dim=128, use_instr=False, lang_model="gru", use_memory=False, arch="cnn1"): + image_dim=128, memory_dim=128, use_instr=False, lang_model="gru", use_memory=False, arch="cnn1", + aux_info=None): super().__init__() # Decide which components are enabled @@ -80,10 +87,10 @@ def __init__(self, obs_space, action_space, self.use_memory = use_memory self.arch = arch self.lang_model = lang_model + self.aux_info = aux_info self.image_dim = image_dim self.memory_dim = memory_dim - self.obs_space = obs_space if arch == "cnn1": @@ -127,13 +134,13 @@ def __init__(self, obs_space, action_space, nn.Conv2d(in_channels=3, out_channels=128, kernel_size=(2, 2), padding=1), nn.BatchNorm2d(128), nn.ReLU(), - nn.MaxPool2d(kernel_size=(2,2), stride=2), + nn.MaxPool2d(kernel_size=(2, 2), stride=2), nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(3, 3), padding=1), nn.BatchNorm2d(128), nn.ReLU(), - nn.MaxPool2d(kernel_size=(2,2), stride=2) + nn.MaxPool2d(kernel_size=(2, 2), stride=2) ) - self.film_pool = nn.MaxPool2d(kernel_size=(2,2), stride=2) + self.film_pool = nn.MaxPool2d(kernel_size=(2, 2), stride=2) elif arch == 'embcnn1': self.image_conv = nn.Sequential( ImageBOWEmbedding(obs_space["image"], embedding_dim=16, padding_idx=0, reduce_fn=torch.mean), @@ -158,10 +165,11 @@ def __init__(self, obs_space, action_space, instr_embedding_size = self.instr_embedding_size if self.lang_model == 'bigru': instr_embedding_size = instr_embedding_size // 2 - self.instr_rnn = nn.GRU(self.word_embedding_size, instr_embedding_size, batch_first=True, bidirectional=(self.lang_model == 'bigru')) + self.instr_rnn = nn.GRU(self.word_embedding_size, instr_embedding_size, batch_first=True, + bidirectional=(self.lang_model == 'bigru')) else: kernel_dim = 64 - kernel_sizes = [3,4] + kernel_sizes = [3, 4] self.instr_convs = nn.ModuleList([nn.Conv2d(1, kernel_dim, (K, self.word_embedding_size)) for K in kernel_sizes]) self.instr_embedding_size = kernel_dim * len(kernel_sizes) @@ -195,7 +203,7 @@ def __init__(self, obs_space, action_space, if arch == "expert_filmcnn": num_module = 2 else: - num_module = int(arch[(arch.rfind('_')+1):]) + num_module = int(arch[(arch.rfind('_') + 1):]) self.controllers = [] for ni in range(num_module): if ni < num_module-1: @@ -226,6 +234,48 @@ def __init__(self, obs_space, action_space, # Initialize parameters correctly self.apply(initialize_parameters) + # Define head for extra info + if self.aux_info: + self.extra_heads = None + self.add_heads() + + def add_heads(self): + ''' + When using auxiliary tasks, the environment yields at each step some binary, continous, or multiclass + information. The agent needs to predict those information. This function add extra heads to the model + that output the predictions. There is a head per extra information (the head type depends on the extra + information type). 
+ ''' + self.extra_heads = nn.ModuleDict() + for info in self.aux_info: + if required_heads[info] == 'binary': + self.extra_heads[info] = nn.Linear(self.embedding_size, 1) + elif required_heads[info].startswith('multiclass'): + n_classes = int(required_heads[info].split('multiclass')[-1]) + self.extra_heads[info] = nn.Linear(self.embedding_size, n_classes) + elif required_heads[info].startswith('continuous'): + if required_heads[info].endswith('01'): + self.extra_heads[info] = nn.Sequential(nn.Linear(self.embedding_size, 1), nn.Sigmoid()) + else: + raise ValueError('Only continous01 is implemented') + else: + raise ValueError('Type not supported') + # initializing these parameters independently is done in order to have consistency of results when using + # supervised-loss-coef = 0 and when not using any extra binary information + self.extra_heads[info].apply(initialize_parameters) + + def add_extra_heads_if_necessary(self, aux_info): + ''' + This function allows using a pre-trained model without aux_info and add aux_info to it and still make + it possible to finetune. + ''' + try: + if not hasattr(self, 'aux_info') or not set(self.aux_info) == set(aux_info): + self.aux_info = aux_info + self.add_heads() + except Exception: + raise ValueError('Could not add extra heads') + @property def memory_size(self): return 2 * self.semi_memory_size @@ -268,13 +318,18 @@ def forward(self, obs, memory): if self.use_instr and self.arch != "filmcnn" and not self.arch.startswith("expert_filmcnn"): embedding = torch.cat((embedding, embed_instr), dim=1) + if hasattr(self, 'aux_info') and self.aux_info: + extra_predictions = {info: self.extra_heads[info](embedding) for info in self.extra_heads} + else: + extra_predictions = dict() + x = self.actor(embedding) dist = Categorical(logits=F.log_softmax(x, dim=1)) x = self.critic(embedding) value = x.squeeze(1) - return dist, value, memory + return {'dist': dist, 'value': value, 'memory': memory, 'extra_predictions': extra_predictions} def _get_embed_instr(self, instr): if self.lang_model == 'gru': @@ -302,7 +357,7 @@ def _get_embed_instr(self, instr): instr = instr[:, 0:lengths[0]] outputs, h_n = self.instr_rnn(self.word_embedding(instr)) iperm_idx = None - h_n = h_n.transpose(0,1).contiguous() + h_n = h_n.transpose(0, 1).contiguous() h_n = h_n.view(h_n.shape[0], -1) if iperm_idx is not None: outputs, _ = pad_packed_sequence(outputs, batch_first=True) @@ -317,7 +372,7 @@ def _get_embed_instr(self, instr): return outputs, hidden, masks elif self.lang_model == 'conv': - inputs = self.word_embedding(instr).unsqueeze(1) # (B,1,T,D) + inputs = self.word_embedding(instr).unsqueeze(1) # (B,1,T,D) inputs = [F.relu(conv(inputs)).squeeze(3) for conv in self.instr_convs] inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs] diff --git a/babyai/rl/algos/a2c.py b/babyai/rl/algos/a2c.py index d604d395..d7b3fc4a 100644 --- a/babyai/rl/algos/a2c.py +++ b/babyai/rl/algos/a2c.py @@ -46,7 +46,10 @@ def update_parameters(self): # Compute loss - dist, value, memory = self.acmodel(sb.obs, memory * sb.mask) + model_results = self.acmodel(sb.obs, memory * sb.mask) + dist = model_results['dist'] + value = model_results['value'] + memory = model_results['memory'] entropy = dist.entropy().mean() diff --git a/babyai/rl/algos/base.py b/babyai/rl/algos/base.py index fea34bbf..afff104f 100644 --- a/babyai/rl/algos/base.py +++ b/babyai/rl/algos/base.py @@ -4,12 +4,14 @@ from babyai.rl.format import default_preprocess_obss from babyai.rl.utils import DictList, ParallelEnv +from 
babyai.rl.utils.supervised_losses import ExtraInfoCollector + class BaseAlgo(ABC): """The base class for RL algorithms.""" def __init__(self, envs, acmodel, num_frames_per_proc, discount, lr, gae_lambda, entropy_coef, - value_loss_coef, max_grad_norm, recurrence, preprocess_obss, reshape_reward): + value_loss_coef, max_grad_norm, recurrence, preprocess_obss, reshape_reward, aux_info): """ Initializes a `BaseAlgo` instance. @@ -42,6 +44,9 @@ def __init__(self, envs, acmodel, num_frames_per_proc, discount, lr, gae_lambda, reshape_reward : function a function that shapes the reward, takes an (observation, action, reward, done) tuple as an input + aux_info : list + a list of strings corresponding to the name of the extra information + retrieved from the environment for supervised auxiliary losses """ # Store parameters @@ -59,6 +64,7 @@ def __init__(self, envs, acmodel, num_frames_per_proc, discount, lr, gae_lambda, self.recurrence = recurrence self.preprocess_obss = preprocess_obss or default_preprocess_obss self.reshape_reward = reshape_reward + self.aux_info = aux_info # Store helpers values @@ -87,6 +93,9 @@ def __init__(self, envs, acmodel, num_frames_per_proc, discount, lr, gae_lambda, self.advantages = torch.zeros(*shape, device=self.device) self.log_probs = torch.zeros(*shape, device=self.device) + if self.aux_info: + self.aux_info_collector = ExtraInfoCollector(self.aux_info, shape, self.device) + # Initialize log values self.log_episode_return = torch.zeros(self.num_procs, device=self.device) @@ -124,10 +133,18 @@ def collect_experiences(self): preprocessed_obs = self.preprocess_obss(self.obs, device=self.device) with torch.no_grad(): - dist, value, memory = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1)) + model_results = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1)) + dist = model_results['dist'] + value = model_results['value'] + memory = model_results['memory'] + extra_predictions = model_results['extra_predictions'] + action = dist.sample() - obs, reward, done, _ = self.env.step(action.cpu().numpy()) + obs, reward, done, env_info = self.env.step(action.cpu().numpy()) + if self.aux_info: + env_info = self.aux_info_collector.process(env_info) + # env_info = self.process_aux_info(env_info) # Update experiences values @@ -150,6 +167,9 @@ def collect_experiences(self): self.rewards[i] = torch.tensor(reward, device=self.device) self.log_probs[i] = dist.log_prob(action) + if self.aux_info: + self.aux_info_collector.fill_dictionaries(i, env_info, extra_predictions) + # Update log values self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float) @@ -171,7 +191,7 @@ def collect_experiences(self): preprocessed_obs = self.preprocess_obss(self.obs, device=self.device) with torch.no_grad(): - _, next_value, _ = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1)) + next_value = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1))['value'] for i in reversed(range(self.num_frames_per_proc)): next_mask = self.masks[i+1] if i < self.num_frames_per_proc - 1 else self.mask @@ -204,6 +224,9 @@ def collect_experiences(self): exps.returnn = exps.value + exps.advantage exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1) + if self.aux_info: + exps = self.aux_info_collector.end_collection(exps) + # Preprocess experiences exps.obs = self.preprocess_obss(exps.obs, device=self.device) diff --git a/babyai/rl/algos/ppo.py b/babyai/rl/algos/ppo.py index 93c2ca08..0d780755 100644 --- 
a/babyai/rl/algos/ppo.py +++ b/babyai/rl/algos/ppo.py @@ -3,23 +3,29 @@ import torch.nn.functional as F from babyai.rl.algos.base import BaseAlgo +from babyai.rl.utils.supervised_losses import SupervisedLossUpdater class PPOAlgo(BaseAlgo): """The class for the Proximal Policy Optimization algorithm ([Schulman et al., 2015](https://arxiv.org/abs/1707.06347)).""" - def __init__(self, envs, acmodel, num_frames_per_proc=None, discount=0.99, lr=7e-4, beta1=0.9, beta2=0.999, gae_lambda=0.95, + def __init__(self, envs, acmodel, num_frames_per_proc=None, discount=0.99, lr=7e-4, beta1=0.9, beta2=0.999, + gae_lambda=0.95, entropy_coef=0.01, value_loss_coef=0.5, max_grad_norm=0.5, recurrence=4, adam_eps=1e-5, clip_eps=0.2, epochs=4, batch_size=256, preprocess_obss=None, - reshape_reward=None): + reshape_reward=None, aux_info=None, supervised_loss_coef=None): num_frames_per_proc = num_frames_per_proc or 128 super().__init__(envs, acmodel, num_frames_per_proc, discount, lr, gae_lambda, entropy_coef, - value_loss_coef, max_grad_norm, recurrence, preprocess_obss, reshape_reward) + value_loss_coef, max_grad_norm, recurrence, preprocess_obss, reshape_reward, + aux_info) self.clip_eps = clip_eps self.epochs = epochs self.batch_size = batch_size + self.supervised_loss_coef = supervised_loss_coef + + self.supervised_loss_updater = SupervisedLossUpdater(aux_info, supervised_loss_coef, recurrence, self.device) assert self.batch_size % self.recurrence == 0 @@ -30,6 +36,18 @@ def update_parameters(self): # Collect experiences exps, logs = self.collect_experiences() + ''' + exps is a DictList with the following keys ['obs', 'memory', 'mask', 'action', 'value', 'reward', + 'advantage', 'returnn', 'log_prob'] and ['collected_info', 'extra_predictions'] if we use aux_info + exps.obs is a DictList with the following keys ['image', 'instr'] + exps.obj.image is a (n_procs * n_frames_per_proc) x image_size 4D tensor + exps.obs.instr is a (n_procs * n_frames_per_proc) x (max number of words in an instruction) 2D tensor + exps.memory is a (n_procs * n_frames_per_proc) x (memory_size = 2*image_embedding_size) 2D tensor + exps.mask is (n_procs * n_frames_per_proc) x 1 2D tensor + if we use aux_info: exps.collected_info and exps.extra_predictions are DictLists with keys + being the added information. They are either (n_procs * n_frames_per_proc) 1D tensors or + (n_procs * n_frames_per_proc) x k 2D tensors where k is the number of classes for multiclass classification + ''' for _ in range(self.epochs): # Initialize log values @@ -40,7 +58,20 @@ def update_parameters(self): log_value_losses = [] log_grad_norms = [] + log_losses = [] + if self.aux_info: + self.supervised_loss_updater.init_epoch() + + ''' + For each epoch, we create int(total_frames / batch_size + 1) batches, each of size batch_size (except + maybe the last one. Each batch is divided into sub-batches of size recurrence (frames are contiguous in + a sub-batch), but the position of each sub-batch in a batch and the position of each batch in the whole + list of frames is random thanks to self._get_batches_starting_indexes(). 
+ ''' + for inds in self._get_batches_starting_indexes(): + # inds is a numpy array of indices that correspond to the beginning of a sub-batch + # there are as many inds as there are batches # Initialize batch values batch_entropy = 0 @@ -48,6 +79,8 @@ def update_parameters(self): batch_policy_loss = 0 batch_value_loss = 0 batch_loss = 0 + if self.aux_info: + self.supervised_loss_updater.init_batch() # Initialize memory @@ -55,12 +88,15 @@ def update_parameters(self): for i in range(self.recurrence): # Create a sub-batch of experience - sb = exps[inds + i] # Compute loss - dist, value, memory = self.acmodel(sb.obs, memory * sb.mask) + model_results = self.acmodel(sb.obs, memory * sb.mask) + dist = model_results['dist'] + value = model_results['value'] + memory = model_results['memory'] + extra_predictions = model_results['extra_predictions'] entropy = dist.entropy().mean() @@ -76,6 +112,10 @@ def update_parameters(self): loss = policy_loss - self.entropy_coef * entropy + self.value_loss_coef * value_loss + if self.aux_info: + supervised_loss = self.supervised_loss_updater.eval_subbatch(extra_predictions, sb) + loss += supervised_loss + # Update batch values batch_entropy += entropy.item() @@ -96,6 +136,8 @@ def update_parameters(self): batch_policy_loss /= self.recurrence batch_value_loss /= self.recurrence batch_loss /= self.recurrence + if self.aux_info: + self.supervised_loss_updater.update_batch_values() # Update actor-critic @@ -112,6 +154,9 @@ def update_parameters(self): log_policy_losses.append(batch_policy_loss) log_value_losses.append(batch_value_loss) log_grad_norms.append(grad_norm) + log_losses.append(batch_loss.item()) + if self.aux_info: + self.supervised_loss_updater.update_epoch_logs() # Log some values @@ -120,6 +165,9 @@ def update_parameters(self): logs["policy_loss"] = numpy.mean(log_policy_losses) logs["value_loss"] = numpy.mean(log_value_losses) logs["grad_norm"] = numpy.mean(log_grad_norms) + logs["loss"] = numpy.mean(log_losses) + if self.aux_info: + logs = self.supervised_loss_updater.end_training(logs) return logs @@ -137,6 +185,6 @@ def _get_batches_starting_indexes(self): indexes = numpy.random.permutation(indexes) num_indexes = self.batch_size // self.recurrence - batches_starting_indexes = [indexes[i:i+num_indexes] for i in range(0, len(indexes), num_indexes)] + batches_starting_indexes = [indexes[i:i + num_indexes] for i in range(0, len(indexes), num_indexes)] return batches_starting_indexes diff --git a/babyai/rl/utils/supervised_losses.py b/babyai/rl/utils/supervised_losses.py new file mode 100644 index 00000000..16e3fa02 --- /dev/null +++ b/babyai/rl/utils/supervised_losses.py @@ -0,0 +1,177 @@ +import torch + +import torch.nn.functional as F +import numpy +from babyai.rl.utils import DictList + +# dictionary that defines what head is required for each extra info used for auxiliary supervision +required_heads = {'seen_state': 'binary', + 'see_door': 'binary', + 'see_obj': 'binary', + 'obj_in_instr': 'binary', + 'in_front_of_what': 'multiclass9', # multi class classifier with 9 possible classes + 'visit_proportion': 'continuous01', # continous regressor with outputs in [0, 1] + 'bot_action': 'binary' + } + +class ExtraInfoCollector: + ''' + This class, used in rl.algos.base, allows connecting the extra information from the environment, and the + corresponding predictions using the specific heads in the model. 
It transforms them so that they are easy to use when evaluating losses.
+    '''
+    def __init__(self, aux_info, shape, device):
+        self.aux_info = aux_info
+        self.shape = shape
+        self.device = device
+
+        self.collected_info = dict()
+        self.extra_predictions = dict()
+        for info in self.aux_info:
+            self.collected_info[info] = torch.zeros(*shape, device=self.device)
+            if required_heads[info] == 'binary' or required_heads[info].startswith('continuous'):
+                # we predict one number only
+                self.extra_predictions[info] = torch.zeros(*shape, 1, device=self.device)
+            elif required_heads[info].startswith('multiclass'):
+                # this is a multi-class classification, so we need to predict the whole probability distribution
+                n_classes = int(required_heads[info].replace('multiclass', ''))
+                self.extra_predictions[info] = torch.zeros(*shape, n_classes, device=self.device)
+            else:
+                raise ValueError("{} not supported".format(required_heads[info]))
+
+    def process(self, env_info):
+        # env_info is now a tuple of dicts
+        env_info = [{k: v for k, v in dic.items() if k in self.aux_info} for dic in env_info]
+        env_info = {k: [env_info[_][k] for _ in range(len(env_info))] for k in env_info[0].keys()}
+        # env_info is now a dict of lists
+        return env_info
+
+    def fill_dictionaries(self, index, env_info, extra_predictions):
+        for info in self.aux_info:
+            dtype = torch.long if required_heads[info].startswith('multiclass') else torch.float
+            self.collected_info[info][index] = torch.tensor(env_info[info], dtype=dtype, device=self.device)
+            self.extra_predictions[info][index] = extra_predictions[info]
+
+    def end_collection(self, exps):
+        collected_info = dict()
+        extra_predictions = dict()
+        for info in self.aux_info:
+            # T x P -> P x T -> P * T
+            collected_info[info] = self.collected_info[info].transpose(0, 1).reshape(-1)
+            if required_heads[info] == 'binary' or required_heads[info].startswith('continuous'):
+                # T x P x 1 -> P x T x 1 -> P * T
+                extra_predictions[info] = self.extra_predictions[info].transpose(0, 1).reshape(-1)
+            elif required_heads[info].startswith('multiclass'):
+                # T x P x k -> P x T x k -> (P * T) x k
+                k = int(required_heads[info].replace('multiclass', ''))  # number of classes
+                extra_predictions[info] = self.extra_predictions[info].transpose(0, 1).reshape(-1, k)
+        # convert the dicts to DictLists, and add them to the exps DictList.
+        exps.collected_info = DictList(collected_info)
+        exps.extra_predictions = DictList(extra_predictions)
+
+        return exps
+
+
+class SupervisedLossUpdater:
+    '''
+    This class, used by PPO, allows the evaluation of the supervised loss when using extra information from the
+    environment. It also handles logging accuracies/L2 distances/etc...
+ ''' + def __init__(self, aux_info, supervised_loss_coef, recurrence, device): + self.aux_info = aux_info + self.supervised_loss_coef = supervised_loss_coef + self.recurrence = recurrence + self.device = device + + self.log_supervised_losses = [] + self.log_supervised_accuracies = [] + self.log_supervised_L2_losses = [] + self.log_supervised_prevalences = [] + + self.batch_supervised_loss = 0 + self.batch_supervised_accuracy = 0 + self.batch_supervised_L2_loss = 0 + self.batch_supervised_prevalence = 0 + + def init_epoch(self): + self.log_supervised_losses = [] + self.log_supervised_accuracies = [] + self.log_supervised_L2_losses = [] + self.log_supervised_prevalences = [] + + def init_batch(self): + self.batch_supervised_loss = 0 + self.batch_supervised_accuracy = 0 + self.batch_supervised_L2_loss = 0 + self.batch_supervised_prevalence = 0 + + def eval_subbatch(self, extra_predictions, sb): + supervised_loss = torch.tensor(0., device=self.device) + supervised_accuracy = torch.tensor(0., device=self.device) + supervised_L2_loss = torch.tensor(0., device=self.device) + supervised_prevalence = torch.tensor(0., device=self.device) + + binary_classification_tasks = 0 + classification_tasks = 0 + regression_tasks = 0 + + for pos, info in enumerate(self.aux_info): + coef = self.supervised_loss_coef[pos] + pred = extra_predictions[info] + target = dict.__getitem__(sb.collected_info, info) + if required_heads[info] == 'binary': + binary_classification_tasks += 1 + classification_tasks += 1 + supervised_loss += coef * F.binary_cross_entropy_with_logits(pred.reshape(-1), target) + supervised_accuracy += ((pred.reshape(-1) > 0).float() == target).float().mean() + supervised_prevalence += target.mean() + elif required_heads[info].startswith('continuous'): + regression_tasks += 1 + mse = F.mse_loss(pred.reshape(-1), target) + supervised_loss += coef * mse + supervised_L2_loss += mse + elif required_heads[info].startswith('multiclass'): + classification_tasks += 1 + supervised_accuracy += (pred.argmax(1).float() == target).float().mean() + supervised_loss += coef * F.cross_entropy(pred, target.long()) + else: + raise ValueError("{} not supported".format(required_heads[info])) + if binary_classification_tasks > 0: + supervised_prevalence /= binary_classification_tasks + else: + supervised_prevalence = torch.tensor(-1) + if classification_tasks > 0: + supervised_accuracy /= classification_tasks + else: + supervised_accuracy = torch.tensor(-1) + if regression_tasks > 0: + supervised_L2_loss /= regression_tasks + else: + supervised_L2_loss = torch.tensor(-1) + + self.batch_supervised_loss += supervised_loss.item() + self.batch_supervised_accuracy += supervised_accuracy.item() + self.batch_supervised_L2_loss += supervised_L2_loss.item() + self.batch_supervised_prevalence += supervised_prevalence.item() + + return supervised_loss + + def update_batch_values(self): + self.batch_supervised_loss /= self.recurrence + self.batch_supervised_accuracy /= self.recurrence + self.batch_supervised_L2_loss /= self.recurrence + self.batch_supervised_prevalence /= self.recurrence + + def update_epoch_logs(self): + self.log_supervised_losses.append(self.batch_supervised_loss) + self.log_supervised_accuracies.append(self.batch_supervised_accuracy) + self.log_supervised_L2_losses.append(self.batch_supervised_L2_loss) + self.log_supervised_prevalences.append(self.batch_supervised_prevalence) + + def end_training(self, logs): + logs["supervised_loss"] = numpy.mean(self.log_supervised_losses) + logs["supervised_accuracy"] = 
numpy.mean(self.log_supervised_accuracies) + logs["supervised_L2_loss"] = numpy.mean(self.log_supervised_L2_losses) + logs["supervised_prevalence"] = numpy.mean(self.log_supervised_prevalences) + + return logs diff --git a/babyai/utils/agent.py b/babyai/utils/agent.py index d196d28b..6b76add4 100644 --- a/babyai/utils/agent.py +++ b/babyai/utils/agent.py @@ -53,7 +53,10 @@ def act_batch(self, many_obs): preprocessed_obs = self.obss_preprocessor(many_obs, device=self.device) with torch.no_grad(): - dist, value, self.memory = self.model(preprocessed_obs, self.memory) + model_results = self.model(preprocessed_obs, self.memory) + dist = model_results['dist'] + value = model_results['value'] + self.memory = model_results['memory'] if self.argmax: action = dist.probs.max(1, keepdim=True)[1] diff --git a/environment.yaml b/environment.yaml index 4e12830b..12ad1315 100644 --- a/environment.yaml +++ b/environment.yaml @@ -4,7 +4,7 @@ channels: - defaults dependencies: - python=3.6 - - pytorch=0.4 + - pytorch=0.4.1 - torchvision - pyqt - numpy diff --git a/scripts/enjoy.py b/scripts/enjoy.py index bfb0e37d..0820975e 100644 --- a/scripts/enjoy.py +++ b/scripts/enjoy.py @@ -62,7 +62,6 @@ print("Mission: {}".format(obs["mission"])) # Define agent - agent = utils.load_agent(args, env) # Run the agent diff --git a/scripts/make_agent_demos.py b/scripts/make_agent_demos.py index 59e7c628..668720c5 100644 --- a/scripts/make_agent_demos.py +++ b/scripts/make_agent_demos.py @@ -152,7 +152,7 @@ def generate_demos(n_episodes, valid, seed, shift=0): command += sys.argv for i in range(args.jobs): cmd_i = list(map(str, - command + command + ['--seed', args.seed + i] + ['--demos', job_demo_names[i]] + ['--episodes', demos_per_job] @@ -194,4 +194,4 @@ def generate_demos(n_episodes, valid, seed, shift=0): # Validation demos if args.valid_episodes: - generate_demos(args.valid_episodes, True, 0) + generate_demos(args.valid_episodes, True, 0) \ No newline at end of file diff --git a/scripts/train_rl.py b/scripts/train_rl.py index 57d30bd9..4092baa6 100644 --- a/scripts/train_rl.py +++ b/scripts/train_rl.py @@ -21,6 +21,7 @@ import babyai.rl from babyai.model import ACModel from babyai.levels import curriculums, create_menvs +from babyai.levels.supervised_losses import wrap_env from babyai.evaluate import batch_evaluate from babyai.utils.agent import ModelAgent @@ -95,6 +96,14 @@ help="don't use memory in the model") parser.add_argument("--arch", default='expert_filmcnn', help="image embedding architecture") +parser.add_argument("--aux-loss", nargs='*', default=[], + help="List of extra information that the environment yields at each step" + "The agent tries to learn that using a supervised loss. If not specified, no info is used" + "Possible infos: seen_state, see_door, see_obj, in_front_of_what, visit_proportion, " + "obj_in_instr, bot_action") +parser.add_argument("--aux-loss-coef", nargs='*', type=float, default=[], + help="Coefficients for the auxiliary supervised loss. There should be as many as extra infos" + "If not specified, they will all be set to 1") parser.add_argument("--test-seed", type=int, default=0, help="random seed for testing (default: 0)") parser.add_argument("--test-episodes", type=int, default=200, @@ -106,6 +115,10 @@ assert args.env is not None or args.curriculum is not None, "--env or --curriculum must be specified." +if len(args.aux_loss_coef) == 0: + args.aux_loss_coef = [1.] 
* len(args.aux_loss) +assert len(args.aux_loss) == len(args.aux_loss_coef) + # Set seed for all randomness sources if args.seed == 0: @@ -122,6 +135,8 @@ envs = [] for i in range(args.procs): env = gym.make(args.env) + if args.aux_loss: + env = wrap_env(env, args.aux_loss) env.seed(100 * args.seed + i) envs.append(env) else: @@ -140,8 +155,13 @@ 'instr': instr, 'mem': mem, 'seed': args.seed, + 'info': '', + 'coef': '', 'suffix': suffix} -default_model_name = "{env}_{algo}_{arch}_{instr}_{mem}_seed{seed}_{suffix}".format(**model_name_parts) +if len(args.aux_loss) > 0: + model_name_parts['info'] = '_' + ''.join([info[0].upper() for info in args.aux_loss]) + model_name_parts['coef'] = '_' + '-'.join(map(str, args.aux_loss_coef)) +default_model_name = "{env}_{algo}_{arch}_{instr}_{mem}_seed{seed}{info}{coef}_{suffix}".format(**model_name_parts) model_name = args.model.format(**model_name_parts) if args.model else default_model_name if args.pretrained_model: model_name = args.pretrained_model + '_pretrained_' + default_model_name @@ -164,11 +184,12 @@ if acmodel is None: acmodel = ACModel(obss_preprocessor.obs_space, envs[0].action_space, args.image_dim, args.memory_dim, - not args.no_instr, args.instr_arch, not args.no_mem, args.arch) + not args.no_instr, args.instr_arch, not args.no_mem, args.arch, args.aux_loss) utils.save_model(acmodel, model_name) if torch.cuda.is_available(): acmodel.cuda() - +if len(args.aux_loss) > 0: + acmodel.add_extra_heads_if_necessary(args.aux_loss) # Define actor-critic algo @@ -181,10 +202,17 @@ algo = babyai.rl.PPOAlgo(envs, acmodel, args.frames_per_proc, args.discount, args.lr, args.beta1, args.beta2, args.gae_lambda, args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps, args.epochs, args.batch_size, obss_preprocessor, - reshape_reward) + reshape_reward, args.aux_loss, args.aux_loss_coef) else: raise ValueError("Incorrect algorithm name: {}".format(args.algo)) +# When using extra binary information, more tensors (model params) are initialized compared to when we don't use that. +# Thus, there starts to be a difference in the random state. If we want to avoid it, in order to make sure that +# the results of supervised-loss-coef=0. and extra-binary-info=0 match, we need to reseed here. 
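# A small self-contained illustration -- not part of the patch -- of why the re-seeding
# just below is needed: building the extra heads consumes random numbers, so without a
# reseed, runs with and without aux_info would diverge from this point on.
import torch
import torch.nn as nn

torch.manual_seed(0)
nn.Linear(128, 1)                 # initializing an extra head advances the RNG state
with_head = torch.rand(1)

torch.manual_seed(0)
without_head = torch.rand(1)      # same seed, but no extra head was initialized

print(torch.equal(with_head, without_head))  # False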
+ +utils.seed(args.seed) + + # Restore training status status_path = os.path.join(utils.get_log_dir(model_name), 'status.json') @@ -201,7 +229,9 @@ + ["return_" + stat for stat in ['mean', 'std', 'min', 'max']] + ["success_rate"] + ["num_frames_" + stat for stat in ['mean', 'std', 'min', 'max']] - + ["entropy", "value", "policy_loss", "value_loss", "grad_norm"]) + + ["entropy", "value", "policy_loss", "value_loss", "loss", "grad_norm"]) +if args.aux_loss: + header += ["supervised_loss", "supervised_accuracy", "supervised_L2_loss", "supervised_prevalence"] if args.curriculum is not None: for env_key in curriculum: header.append("proba/{}".format(env_key)) @@ -274,16 +304,22 @@ success_per_episode['mean'], *num_frames_per_episode.values(), logs["entropy"], logs["value"], logs["policy_loss"], logs["value_loss"], - logs["grad_norm"]] + logs["loss"], logs["grad_norm"]] + format_str = ("U {} | F {:06} | FPS {:04.0f} | D {} | R:xsmM {: .2f} {: .2f} {: .2f} {: .2f} | " "S {:.2f} | F:xsmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | " - "pL {: .3f} | vL {:.3f} | gN {:.3f} | ") + "pL {: .3f} | vL {:.3f} | L {:.3f} | gN {:.3f} | ") + if args.aux_loss: + data += [logs["supervised_loss"], logs["supervised_accuracy"], logs["supervised_L2_loss"], + logs["supervised_prevalence"]] + format_str += "sL {: .3f} | sA {:.3f} | sL2 {: .3f} | sP {: .3f} | " if args.curriculum is not None: for env_id, _ in enumerate(curriculum): data.append(menv_head.dist[env_id]) data.append(menv_head.synthesized_returns.get(env_id, np.NaN)) format_str += "pr{} {:.2f} | ".format(env_id, menv_head.dist[env_id]) format_str += "R{} {:.2f} | ".format(env_id, menv_head.synthesized_returns.get(env_id, np.NaN)) + logger.info(format_str.format(*data)) if args.tb: assert len(header) == len(data) diff --git a/setup.py b/setup.py index cfdfcde4..8daf3ecd 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ 'gym>=0.9.6', 'numpy>=1.10.0', 'pyqt5>=5.10.1', - "torch>=0.4.0", + "torch>=0.4.1", 'gym_minigrid', 'blosc>=1.5.1' ],
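Taken together, these pieces are driven from scripts/train_rl.py by the new --aux-loss and --aux-loss-coef arguments. A minimal sketch of the per-head computation that SupervisedLossUpdater performs for one binary target (the tensor values are made up; the sub-batch size of 256 is only an example):

import torch
import torch.nn.functional as F

pred = torch.randn(256, 1)                    # logits from an extra head, e.g. 'seen_state'
target = torch.randint(0, 2, (256,)).float()  # targets collected by the corresponding wrapper
coef = 1.0                                    # matching entry of --aux-loss-coef

supervised_loss = coef * F.binary_cross_entropy_with_logits(pred.reshape(-1), target)
accuracy = ((pred.reshape(-1) > 0).float() == target).float().mean()
prevalence = target.mean()   # fraction of positive targets, logged as supervised_prevalence
# PPOAlgo adds supervised_loss to the usual policy/value/entropy loss before the backward pass.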