From bed8016e82d24b8eea8894d660a78da03306549f Mon Sep 17 00:00:00 2001 From: PaulDaoudi Date: Sat, 27 Jan 2024 18:12:58 +0100 Subject: [PATCH] Rllg branch (#68) * Add dosctrings, types and docker * Add dosctrings, types and docker --- .gitignore | 1 + RLLG/.dockerignore | 9 + RLLG/LICENSE | 40 +- RLLG/README.md | 106 +- RLLG/agents/algos/pag.py | 636 +++++++----- RLLG/agents/algos/pig.py | 575 ++++++----- RLLG/agents/algos/sac.py | 473 +++++---- RLLG/agents/algos/sag.py | 531 +++++----- RLLG/agents/common/config.py | 67 ++ RLLG/agents/common/creation_utils.py | 150 +++ RLLG/agents/common/model.py | 950 ++++++++++++------ RLLG/agents/common/replay_buffer.py | 583 +++++++---- RLLG/agents/common/sampler.py | 466 +++++---- RLLG/agents/common/utils.py | 187 ++-- RLLG/agents/common/visualization_helpers.py | 285 +++--- RLLG/docker/Dockerfile | 29 + RLLG/envs/ball_in_cup/confidence.py | 121 ++- RLLG/envs/ball_in_cup/create_ball_in_cup.py | 132 ++- RLLG/envs/ball_in_cup/local_expert_policy.py | 121 ++- .../ball_in_cup/models/near_expert_sac_650 | Bin 0 -> 22735 bytes .../ball_in_cup/models/near_expert_sac_780 | Bin 0 -> 22735 bytes .../bullet_small_reach/bullet_small_reach.py | 110 +- RLLG/envs/bullet_small_reach/confidence.py | 115 ++- .../create_bullet_small_reach.py | 88 +- .../bullet_small_reach/local_expert_policy.py | 152 ++- RLLG/envs/cartpole/confidence.py | 137 ++- RLLG/envs/cartpole/create_cartpole.py | 174 ++-- RLLG/envs/cartpole/local_expert_policy.py | 68 +- RLLG/envs/confidence.py | 107 +- RLLG/envs/creation.py | 158 +-- RLLG/envs/env_utils.py | 171 ++-- RLLG/envs/hirl_point_fall/confidence.py | 104 +- .../hirl_point_fall/create_hirl_point_fall.py | 92 +- .../hirl_point_fall/local_expert_policy.py | 122 ++- RLLG/envs/hirl_point_fall/wrapper.py | 109 +- RLLG/envs/point_circle/confidence.py | 105 +- RLLG/envs/point_circle/create_point_circle.py | 103 +- RLLG/envs/point_circle/local_expert_policy.py | 121 ++- RLLG/envs/point_circle/point_circle.py | 106 +- RLLG/envs/point_mass/confidence.py | 120 ++- RLLG/envs/point_mass/create_point_mass.py | 139 ++- RLLG/envs/point_mass/local_expert_policy.py | 121 ++- RLLG/main.py | 260 ++--- RLLG/notebooks/Visualization.ipynb | 308 +++--- RLLG/notebooks/helpers.py | 145 ++- RLLG/notebooks/video_fn.py | 117 ++- RLLG/ray_config/ball_in_cup_cfg.yaml | 130 +-- RLLG/ray_config/bullet_small_reach_cfg.yaml | 142 +-- RLLG/ray_config/cartpole_cfg.yaml | 162 +-- RLLG/ray_config/hirl_point_fall_cfg.yaml | 132 +-- RLLG/ray_config/point_circle_cfg.yaml | 142 +-- RLLG/ray_config/point_mass_cfg.yaml | 132 +-- RLLG/requirements.txt | 23 + RLLG/sac_main_fn.py | 183 ++-- RLLG/setup.py | 62 +- 55 files changed, 6175 insertions(+), 3747 deletions(-) create mode 100644 RLLG/.dockerignore create mode 100644 RLLG/agents/common/config.py create mode 100644 RLLG/agents/common/creation_utils.py create mode 100644 RLLG/docker/Dockerfile create mode 100644 RLLG/envs/ball_in_cup/models/near_expert_sac_650 create mode 100644 RLLG/envs/ball_in_cup/models/near_expert_sac_780 create mode 100644 RLLG/requirements.txt diff --git a/.gitignore b/.gitignore index 85e7c1df..51429500 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /.idea/ +*.DS_Store diff --git a/RLLG/.dockerignore b/RLLG/.dockerignore new file mode 100644 index 00000000..bd17c07d --- /dev/null +++ b/RLLG/.dockerignore @@ -0,0 +1,9 @@ +__pycache__/ +ray_results +.idea/ +logs/ +.pytype/ +.vscode/ +# ignore for docker builds +.git/ +.mypy_cache/ \ No newline at end of file diff --git a/RLLG/LICENSE b/RLLG/LICENSE 
index 4ad327d9..e341872e 100644 --- a/RLLG/LICENSE +++ b/RLLG/LICENSE @@ -1,21 +1,21 @@ -MIT License - -Copyright (c) 2023 Paul Daoudi - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +MIT License + +Copyright (c) 2023 Paul Daoudi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/RLLG/README.md b/RLLG/README.md index 8649898e..c7859575 100644 --- a/RLLG/README.md +++ b/RLLG/README.md @@ -1,49 +1,57 @@ -# Enhancing Reinforcement Learning agents with Local Guides - -This is the official implementation of the techniques discussed in the paper [Enhancing Reinforcement Learning agents with Local Guides](https://hal.science/hal-04052358/file/Final_Reinforcement_Learning_with_Local_Guides.pdf). - -## Steps to launch it for a new environment - -- In the folder `envs`, create a folder with the name of the environment with 3 files: - - `create_env_name` to create the environment - - `local_expert_policy` for the local expert - - `confidence` for the confidence function $\lambda$ -- Add the environment in the global files `creation` and `confidence` in `envs` -- Add a config file in `ray_config` -- Modify the `main` file to include the new environment -- Enjoy :) - -## Notes regarding the Point-Reach environment - -PointReach is based on [Bullet-Safety-Gym](https://github.com/SvenGronauer/Bullet-Safety-Gym), and has been modified internally (directly in their source code) to make it more difficult. - -All the details can be found in Appendix B of the paper. 
- -## Visualization - -All the results are saved in a ray tune `Experimentanalysis`. You can plot them in the `Visualization.ipynb` notebook. - -## License - -We follow MIT License. Please see the [License](./LICENSE) file for more information. - -**Disclaimer:** This is not an officially supported Huawei Product. - - -## Credits - -This code is built upon the [SimpleSAC Github](https://github.com/young-geng/SimpleSAC), and some wrappers of [gym](https://github.com/openai/gym/tree/master). - - -## Cite us - -If you find this technique useful and you use it in a project, please cite it: -``` -@inproceedings{daoudi2023enhancing, - title={Enhancing Reinforcement Learning Agents with Local Guides}, - author={Daoudi, Paul and Robu, Bogdan and Prieur, Christophe and Dos Santos, Ludovic and Barlier, Merwan}, - booktitle={Proceedings of the 2023 International Conference on Autonomous Agents and Multiagent Systems}, - pages={829--838}, - year={2023} -} -``` +# Enhancing Reinforcement Learning agents with Local Guides + +This is the official implementation of the techniques discussed in the paper [Enhancing Reinforcement Learning agents with Local Guides](https://hal.science/hal-04052358/file/Final_Reinforcement_Learning_with_Local_Guides.pdf). + +## Create the conda virtual environment + +``` +conda create --name rllg python=3.8 +conda activate rllg +pip install -e . +``` + +## Steps to launch it for a new environment + +- In the folder `envs`, create a folder with the name of the environment with 3 files: + - `create_env_name` to create the environment + - `local_expert_policy` for the local expert + - `confidence` for the confidence function $\lambda$ +- Add the environment in the global files `creation` and `confidence` in `envs` +- Add a config file in `ray_config` +- Modify the `main` file to include the new environment +- Enjoy :) + +## Notes regarding the Point-Reach environment + +PointReach is based on [Bullet-Safety-Gym](https://github.com/SvenGronauer/Bullet-Safety-Gym), and has been modified internally (directly in their source code) to make it more difficult. + +All the details can be found in Appendix B of the paper. + +## Visualization + +All the results are saved in a ray tune `Experimentanalysis`. You can plot them in the `Visualization.ipynb` notebook. + +## License + +We follow MIT License. Please see the [License](./LICENSE) file for more information. + +**Disclaimer:** This is not an officially supported Huawei Product. + + +## Credits + +This code is built upon the [SimpleSAC Github](https://github.com/young-geng/SimpleSAC), and some wrappers of [gym](https://github.com/openai/gym/tree/master). + + +## Cite us + +If you find this technique useful and you use it in a project, please cite it: +``` +@inproceedings{daoudi2023enhancing, + title={Enhancing Reinforcement Learning Agents with Local Guides}, + author={Daoudi, Paul and Robu, Bogdan and Prieur, Christophe and Dos Santos, Ludovic and Barlier, Merwan}, + booktitle={Proceedings of the 2023 International Conference on Autonomous Agents and Multiagent Systems}, + pages={829--838}, + year={2023} +} +``` diff --git a/RLLG/agents/algos/pag.py b/RLLG/agents/algos/pag.py index 1bf65f3e..3f6106b9 100644 --- a/RLLG/agents/algos/pag.py +++ b/RLLG/agents/algos/pag.py @@ -1,260 +1,376 @@ -# 2023.02.14-Changed for RLLG -# Huawei Technologies Co., Ltd. - -# Copyright (c) 2020 Xinyang Geng. - -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
- - -from ml_collections import ConfigDict -import numpy as np -import torch -import torch.optim as optim -import torch.nn.functional as F -from agents.common.model import Scalar, soft_target_update - - -class PAG(object): - - @staticmethod - def get_default_config(updates=None): - config = ConfigDict() - config.discount = 0.99 - config.reward_scale = 1.0 - config.alpha_multiplier = 1.0 - config.use_automatic_entropy_tuning = True - config.use_automatic_entropy_tuning_parametrized_perturbation = True - config.backup_entropy = True - config.target_entropy = 0.0 - config.policy_lr = 3e-4 - config.qf_lr = 3e-4 - config.optimizer_type = 'adam' - config.soft_target_update_rate = 5e-3 - config.target_update_period = 1 - - if updates is not None: - config.update(ConfigDict(updates).copy_and_resolve_references()) - return config - - def __init__(self, config, policy, sampler_policy, qf1, qf2, target_qf1, target_qf2, - use_local, local_expert, parametrized_perturbation, sampler_parametrized_perturbation): - self.config = PAG.get_default_config(config) - self.policy = policy - self.sampler_policy = sampler_policy - self.qf1 = qf1 - self.qf2 = qf2 - self.target_qf1 = target_qf1 - self.target_qf2 = target_qf2 - self.parametrized_perturbation = parametrized_perturbation - self.sampler_parametrized_perturbation = sampler_parametrized_perturbation - - # hyperparams - self.use_local = use_local - self.beta = 1. - self.local_expert = local_expert - - optimizer_class = { - 'adam': optim.Adam, - 'sgd': optim.SGD, - }[self.config.optimizer_type] - - self.policy_optimizer = optimizer_class( - self.policy.parameters(), self.config.policy_lr, - ) - self.qf_optimizer = optimizer_class( - list(self.qf1.parameters()) + list(self.qf2.parameters()), self.config.qf_lr - ) - self.parametrized_perturbation_optimizer = optimizer_class( - self.parametrized_perturbation.parameters(), self.config.policy_lr, - ) - - if self.config.use_automatic_entropy_tuning: - self.log_alpha = Scalar(0.0) - self.alpha_optimizer = optimizer_class( - self.log_alpha.parameters(), - lr=self.config.policy_lr, - ) - else: - self.log_alpha = None - - if self.config.use_automatic_entropy_tuning_parametrized_perturbation: - self.expert_log_alpha = Scalar(0.0) - self.expert_alpha_optimizer = optimizer_class( - self.expert_log_alpha.parameters(), - lr=self.config.policy_lr, - ) - else: - self.expert_log_alpha = None - - self.update_target_network(1.0) - self._total_steps = 0 - - def update_target_network(self, soft_target_update_rate): - soft_target_update(self.qf1, self.target_qf1, soft_target_update_rate) - soft_target_update(self.qf2, self.target_qf2, soft_target_update_rate) - - def train(self, batch, batch_success=None): - self._total_steps += 1 - - # classic obs - observations = batch['observations'] - actions = batch['actions'] - rewards = batch['rewards'] - next_observations = batch['next_observations'] - dones = batch['dones'] - - # retrieve local experts information - lambda_s_current = batch['use_local_current'] - lambda_s_next = batch['use_local_next'] - expert_actions = batch['expert_actions'] - next_expert_actions = batch['next_expert_actions'] - - new_actions, log_pi = self.policy(observations) - - if self.config.use_automatic_entropy_tuning: - alpha_loss = -(self.log_alpha() * (log_pi + self.config.target_entropy).detach()).mean() - alpha = self.log_alpha().exp() * self.config.alpha_multiplier - else: - alpha_loss = observations.new_tensor(0.0) - alpha = observations.new_tensor(self.config.alpha_multiplier) - - """ Policy loss """ - 
q_new_actions = torch.min( - self.qf1(observations, new_actions), - self.qf2(observations, new_actions), - ) - policy_loss = (alpha*log_pi - q_new_actions).mean() - - """ Parametrized noise loss""" - parametrized_perturbation_actions, expert_log_pi = self.parametrized_perturbation(observations, expert_actions) - - if self.config.use_automatic_entropy_tuning_parametrized_perturbation: - expert_alpha_loss = -(self.expert_log_alpha() * (expert_log_pi + self.config.target_entropy).detach()).mean() - expert_alpha = self.expert_log_alpha().exp() * self.config.expert_alpha_multiplier - else: - expert_alpha_loss = observations.new_tensor(0.0) - expert_alpha = observations.new_tensor(self.config.expert_alpha_multiplier) - - q_new_actions_perturbed = lambda_s_current * torch.min( - self.qf1(observations, parametrized_perturbation_actions), - self.qf2(observations, parametrized_perturbation_actions), - ) - parametrized_perturbation_loss = (expert_alpha * expert_log_pi - q_new_actions_perturbed).mean() - - """ Q function loss """ - q1_pred = self.qf1(observations, actions) - q2_pred = self.qf2(observations, actions) - - with torch.no_grad(): - new_next_actions, next_log_pi = self.policy(next_observations) - - next_log_pi = (1 - lambda_s_next) * next_log_pi - - expert_target_q_values = torch.min( - self.target_qf1(next_observations, next_expert_actions), - self.target_qf2(next_observations, next_expert_actions), - ) - classic_target_q_values = torch.min( - self.target_qf1(next_observations, new_next_actions), - self.target_qf2(next_observations, new_next_actions), - ) - target_q_values = lambda_s_next * expert_target_q_values + \ - (1 - lambda_s_next) * classic_target_q_values - - if self.config.backup_entropy: - target_q_values = target_q_values - alpha * next_log_pi - - q_target = self.config.reward_scale * rewards + (1. 
- dones) * self.config.discount * target_q_values - qf1_loss = F.mse_loss(q1_pred, q_target.detach()) - qf2_loss = F.mse_loss(q2_pred, q_target.detach()) - qf_loss = qf1_loss + qf2_loss - - if self.config.use_automatic_entropy_tuning: - self.alpha_optimizer.zero_grad() - alpha_loss.backward() - self.alpha_optimizer.step() - - if self.config.use_automatic_entropy_tuning_parametrized_perturbation: - self.expert_alpha_optimizer.zero_grad() - expert_alpha_loss.backward() - self.expert_alpha_optimizer.step() - - self.policy_optimizer.zero_grad() - policy_loss.backward() - self.policy_optimizer.step() - - self.parametrized_perturbation_optimizer.zero_grad() - parametrized_perturbation_loss.backward() - self.parametrized_perturbation_optimizer.step() - - self.qf_optimizer.zero_grad() - qf_loss.backward() - self.qf_optimizer.step() - - if self.total_steps % self.config.target_update_period == 0: - self.update_target_network( - self.config.soft_target_update_rate - ) - - metrics_to_return = dict( - log_pi=log_pi.mean().item(), - policy_loss=policy_loss.item(), - parametrized_perturbation_loss=parametrized_perturbation_loss.item(), - qf1_loss=qf1_loss.item(), - qf2_loss=qf2_loss.item(), - alpha_loss=alpha_loss.item(), - alpha=alpha.item(), - expert_alpha_loss=expert_alpha_loss.item(), - expert_alpha=expert_alpha.item(), - average_qf1=q1_pred.mean().item(), - average_qf2=q2_pred.mean().item(), - average_target_q=target_q_values.mean().item(), - total_steps=self.total_steps, - ) - - return metrics_to_return - - def torch_to_device(self, device): - for module in self.modules: - module.to(device) - - def get_action(self, - env, - observation, - deterministic=False, - add_local_information=False): - """ - In switched agent, the agent always picks the expert action if it is relevant. - """ - - action = self.sampler_policy( - np.expand_dims(observation, 0), deterministic=deterministic - )[0, :] - if add_local_information: - use_local = self.use_local.get_use_local(env, - observation) - expert_action_init = self.local_expert.get_action(observation, - init_action=action, - env=env) - expert_action = self.sampler_parametrized_perturbation( - np.expand_dims(observation, 0), np.expand_dims(expert_action_init, 0), - beta=self.beta, deterministic=deterministic - )[0, :] - if use_local: - return expert_action, use_local, expert_action - return action, use_local, expert_action - return action - - @property - def modules(self): - modules = [self.policy, self.qf1, self.qf2, self.target_qf1, self.target_qf2] - if self.config.use_automatic_entropy_tuning: - modules.append(self.log_alpha) - return modules - - @property - def total_steps(self): - return self._total_steps +# 2023.02.14-Changed for RLLG +# Huawei Technologies Co., Ltd. + +# Copyright (c) 2020 Xinyang Geng. + +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Optional, Tuple, Any, Dict, List, Union +from ml_collections import ConfigDict +import numpy as np +import torch +import torch.optim as optim +import torch.nn.functional as F +from agents.common.model import Scalar, soft_target_update, SamplerPolicy, ExpertSamplerPolicy + + +class PAG(object): + """ + Perturbed Action Guided (PAG) algorithm implementation. + + Parameters: + ----------- + config: dict + Configuration parameters for SAC. + policy: torch.nn.Module + The policy network. + sampler_policy: SamplerPolicy + The sampler policy network. 
+    qf1: torch.nn.Module
+        The first critic network.
+    qf2: torch.nn.Module
+        The second critic network.
+    target_qf1: torch.nn.Module
+        The target network for the first critic.
+    target_qf2: torch.nn.Module
+        The target network for the second critic.
+    use_local: float
+        Float confidence function indicating whether to trust local policies.
+    local_expert: Any
+        Local expert.
+    parametrized_perturbation: torch.nn.Module
+        The parametrized perturbation network.
+    sampler_parametrized_perturbation: ExpertSamplerPolicy
+        The network used for sampling perturbations.
+    """
+
+    @staticmethod
+    def get_default_config(updates: Optional[Dict] = None) -> ConfigDict:
+        """
+        Get the default configuration for PAG.
+
+        Parameters:
+        -----------
+        updates: dict, optional
+            Optional dictionary to update default configuration.
+
+        Returns:
+        --------
+        ConfigDict
+            Default configuration for PAG.
+        """
+        config = ConfigDict()
+        config.discount = 0.99
+        config.reward_scale = 1.0
+        config.alpha_multiplier = 1.0
+        config.use_automatic_entropy_tuning = True
+        config.use_automatic_entropy_tuning_parametrized_perturbation = True
+        config.backup_entropy = True
+        config.target_entropy = 0.0
+        config.policy_lr = 3e-4
+        config.qf_lr = 3e-4
+        config.optimizer_type = 'adam'
+        config.soft_target_update_rate = 5e-3
+        config.target_update_period = 1
+
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+
+    def __init__(self,
+                 config: Dict,
+                 policy: torch.nn.Module,
+                 sampler_policy: SamplerPolicy,
+                 qf1: torch.nn.Module,
+                 qf2: torch.nn.Module,
+                 target_qf1: torch.nn.Module,
+                 target_qf2: torch.nn.Module,
+                 use_local: float,
+                 local_expert: Any,
+                 parametrized_perturbation: torch.nn.Module,
+                 sampler_parametrized_perturbation: ExpertSamplerPolicy):
+        self.config = PAG.get_default_config(config)
+        self.policy = policy
+        self.sampler_policy = sampler_policy
+        self.qf1 = qf1
+        self.qf2 = qf2
+        self.target_qf1 = target_qf1
+        self.target_qf2 = target_qf2
+        self.parametrized_perturbation = parametrized_perturbation
+        self.sampler_parametrized_perturbation = sampler_parametrized_perturbation
+
+        # hyperparams
+        self.use_local = use_local
+        self.beta = 1.
+        self.local_expert = local_expert
+
+        optimizer_class = {
+            'adam': optim.Adam,
+            'sgd': optim.SGD,
+        }[self.config.optimizer_type]
+
+        self.policy_optimizer = optimizer_class(
+            self.policy.parameters(), self.config.policy_lr,
+        )
+        self.qf_optimizer = optimizer_class(
+            list(self.qf1.parameters()) + list(self.qf2.parameters()), self.config.qf_lr
+        )
+        self.parametrized_perturbation_optimizer = optimizer_class(
+            self.parametrized_perturbation.parameters(), self.config.policy_lr,
+        )
+
+        if self.config.use_automatic_entropy_tuning:
+            self.log_alpha = Scalar(0.0)
+            self.alpha_optimizer = optimizer_class(
+                self.log_alpha.parameters(),
+                lr=self.config.policy_lr,
+            )
+        else:
+            self.log_alpha = None
+
+        if self.config.use_automatic_entropy_tuning_parametrized_perturbation:
+            self.expert_log_alpha = Scalar(0.0)
+            self.expert_alpha_optimizer = optimizer_class(
+                self.expert_log_alpha.parameters(),
+                lr=self.config.policy_lr,
+            )
+        else:
+            self.expert_log_alpha = None
+
+        self.update_target_network(1.0)
+        self._total_steps = 0
+
+    def update_target_network(self, soft_target_update_rate: float) -> None:
+        """
+        Update the target networks with soft target updates.
+
+        Parameters:
+        -----------
+        soft_target_update_rate: float
+            Rate of soft target network updates.
+ """ + soft_target_update(self.qf1, self.target_qf1, soft_target_update_rate) + soft_target_update(self.qf2, self.target_qf2, soft_target_update_rate) + + def train(self, batch: Dict[str, Any], batch_success: Optional[Dict[str, torch.Tensor]] = None) -> Dict[ + str, Any]: + """ + Train the PAG agent on a batch of experiences. + + Parameters: + ----------- + batch: dict + A dictionary containing the the transitions. + batch_success: dict, optional + A dictionary containing the the transitions. + + Returns: + -------- + dict + A dictionary containing training metrics. + """ + self._total_steps += 1 + + # classic obs + observations = batch['observations'] + actions = batch['actions'] + rewards = batch['rewards'] + next_observations = batch['next_observations'] + dones = batch['dones'] + + # retrieve local experts information + lambda_s_current = batch['use_local_current'] + lambda_s_next = batch['use_local_next'] + expert_actions = batch['expert_actions'] + next_expert_actions = batch['next_expert_actions'] + + new_actions, log_pi = self.policy(observations) + + if self.config.use_automatic_entropy_tuning: + alpha_loss = -(self.log_alpha() * (log_pi + self.config.target_entropy).detach()).mean() + alpha = self.log_alpha().exp() * self.config.alpha_multiplier + else: + alpha_loss = observations.new_tensor(0.0) + alpha = observations.new_tensor(self.config.alpha_multiplier) + + """ Policy loss """ + q_new_actions = torch.min( + self.qf1(observations, new_actions), + self.qf2(observations, new_actions), + ) + policy_loss = (alpha*log_pi - q_new_actions).mean() + + """ Parametrized noise loss""" + parametrized_perturbation_actions, expert_log_pi = self.parametrized_perturbation(observations, expert_actions) + + if self.config.use_automatic_entropy_tuning_parametrized_perturbation: + expert_alpha_loss = -(self.expert_log_alpha() * (expert_log_pi + self.config.target_entropy).detach()).mean() + expert_alpha = self.expert_log_alpha().exp() * self.config.expert_alpha_multiplier + else: + expert_alpha_loss = observations.new_tensor(0.0) + expert_alpha = observations.new_tensor(self.config.expert_alpha_multiplier) + + q_new_actions_perturbed = lambda_s_current * torch.min( + self.qf1(observations, parametrized_perturbation_actions), + self.qf2(observations, parametrized_perturbation_actions), + ) + parametrized_perturbation_loss = (expert_alpha * expert_log_pi - q_new_actions_perturbed).mean() + + """ Q function loss """ + q1_pred = self.qf1(observations, actions) + q2_pred = self.qf2(observations, actions) + + with torch.no_grad(): + new_next_actions, next_log_pi = self.policy(next_observations) + + next_log_pi = (1 - lambda_s_next) * next_log_pi + + expert_target_q_values = torch.min( + self.target_qf1(next_observations, next_expert_actions), + self.target_qf2(next_observations, next_expert_actions), + ) + classic_target_q_values = torch.min( + self.target_qf1(next_observations, new_next_actions), + self.target_qf2(next_observations, new_next_actions), + ) + target_q_values = lambda_s_next * expert_target_q_values + \ + (1 - lambda_s_next) * classic_target_q_values + + if self.config.backup_entropy: + target_q_values = target_q_values - alpha * next_log_pi + + q_target = self.config.reward_scale * rewards + (1. 
- dones) * self.config.discount * target_q_values + qf1_loss = F.mse_loss(q1_pred, q_target.detach()) + qf2_loss = F.mse_loss(q2_pred, q_target.detach()) + qf_loss = qf1_loss + qf2_loss + + if self.config.use_automatic_entropy_tuning: + self.alpha_optimizer.zero_grad() + alpha_loss.backward() + self.alpha_optimizer.step() + + if self.config.use_automatic_entropy_tuning_parametrized_perturbation: + self.expert_alpha_optimizer.zero_grad() + expert_alpha_loss.backward() + self.expert_alpha_optimizer.step() + + self.policy_optimizer.zero_grad() + policy_loss.backward() + self.policy_optimizer.step() + + self.parametrized_perturbation_optimizer.zero_grad() + parametrized_perturbation_loss.backward() + self.parametrized_perturbation_optimizer.step() + + self.qf_optimizer.zero_grad() + qf_loss.backward() + self.qf_optimizer.step() + + if self.total_steps % self.config.target_update_period == 0: + self.update_target_network( + self.config.soft_target_update_rate + ) + + metrics_to_return = dict( + log_pi=log_pi.mean().item(), + policy_loss=policy_loss.item(), + parametrized_perturbation_loss=parametrized_perturbation_loss.item(), + qf1_loss=qf1_loss.item(), + qf2_loss=qf2_loss.item(), + alpha_loss=alpha_loss.item(), + alpha=alpha.item(), + expert_alpha_loss=expert_alpha_loss.item(), + expert_alpha=expert_alpha.item(), + average_qf1=q1_pred.mean().item(), + average_qf2=q2_pred.mean().item(), + average_target_q=target_q_values.mean().item(), + total_steps=self.total_steps, + ) + + return metrics_to_return + + def torch_to_device(self, device: torch.device) -> None: + """ + Move all modules to the specified device. + + Parameters: + ----------- + device: torch.device + The target device. + """ + for module in self.modules: + module.to(device) + + def get_action(self, + env: Any, + observation: np.ndarray, + deterministic: bool = False, + add_local_information: bool = False) -> Union[np.ndarray, Tuple[np.ndarray, float, np.ndarray]]: + """ + Get an action from the policy. + + Parameters: + ----------- + env: Any + The environment. + observation: np.ndarray + The current observation. + deterministic: bool, optional + Whether to sample a deterministic action. + add_local_information: bool, optional + Whether to add local information. + + Returns: + -------- + Tuple[np.ndarray, float, np.ndarray] + The action, local information, and expert action. + """ + + action = self.sampler_policy( + np.expand_dims(observation, 0), deterministic=deterministic + )[0, :] + if add_local_information: + use_local = self.use_local.get_use_local(env, + observation) + expert_action_init = self.local_expert.get_action(observation, + init_action=action, + env=env) + expert_action = self.sampler_parametrized_perturbation( + np.expand_dims(observation, 0), np.expand_dims(expert_action_init, 0), + beta=self.beta, deterministic=deterministic + )[0, :] + if use_local: + return expert_action, use_local, expert_action + return action, use_local, expert_action + return action + + @property + def modules(self) -> List[torch.nn.Module]: + """ + Get a list of modules. + + Returns: + -------- + List[nn.Module] + The list of modules including policy, q-functions, and optional log_alpha. + """ + modules = [self.policy, self.qf1, self.qf2, self.target_qf1, self.target_qf2] + if self.config.use_automatic_entropy_tuning: + modules.append(self.log_alpha) + return modules + + @property + def total_steps(self) -> int: + """ + Get the total number of steps taken. + + Returns: + -------- + int + The total number of steps. 
+ """ + return self._total_steps diff --git a/RLLG/agents/algos/pig.py b/RLLG/agents/algos/pig.py index 9437f962..073f6300 100644 --- a/RLLG/agents/algos/pig.py +++ b/RLLG/agents/algos/pig.py @@ -1,231 +1,344 @@ -# 2023.02.14-Changed for RLLG -# Huawei Technologies Co., Ltd. - -# Copyright (c) 2020 Xinyang Geng. - -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - - -from ml_collections import ConfigDict -import numpy as np -import torch -import torch.optim as optim -import torch.nn.functional as F - -from agents.common.model import Scalar, soft_target_update - - -class PIG(object): - - @staticmethod - def get_default_config(updates=None): - config = ConfigDict() - config.discount = 0.99 - config.reward_scale = 1.0 - config.alpha_multiplier = 1.0 - config.use_automatic_entropy_tuning = True - config.backup_entropy = True - config.target_entropy = 0.0 - config.policy_lr = 3e-4 - config.qf_lr = 3e-4 - config.optimizer_type = 'adam' - config.soft_target_update_rate = 5e-3 - config.target_update_period = 1 - config.use_automatic_beta_tuning = False - config.target_beta = 0.0 - - if updates is not None: - config.update(ConfigDict(updates).copy_and_resolve_references()) - return config - - def __init__(self, config, policy, sampler_policy, qf1, qf2, target_qf1, target_qf2, - use_local, local_expert, beta): - self.config = PIG.get_default_config(config) - self.policy = policy - self.sampler_policy = sampler_policy - self.qf1 = qf1 - self.qf2 = qf2 - self.target_qf1 = target_qf1 - self.target_qf2 = target_qf2 - - # hyper parameter - self.use_local = use_local - self.local_expert = local_expert - self.beta = beta - self.training = True - - optimizer_class = { - 'adam': optim.Adam, - 'sgd': optim.SGD, - }[self.config.optimizer_type] - - self.policy_optimizer = optimizer_class( - self.policy.parameters(), self.config.policy_lr, - ) - self.qf_optimizer = optimizer_class( - list(self.qf1.parameters()) + list(self.qf2.parameters()), self.config.qf_lr - ) - - if self.config.use_automatic_entropy_tuning: - self.log_alpha = Scalar(0.0) - self.alpha_optimizer = optimizer_class( - self.log_alpha.parameters(), - lr=self.config.policy_lr, - ) - else: - self.log_alpha = None - - if self.config.use_automatic_beta_tuning: - self.log_beta = Scalar(0.0) - self.beta_optimizer = optimizer_class( - self.log_beta.parameters(), - lr=self.config.policy_lr, - ) - else: - self.log_beta = None - - self.update_target_network(1.0) - self._total_steps = 0 - - def update_target_network(self, soft_target_update_rate): - soft_target_update(self.qf1, self.target_qf1, soft_target_update_rate) - soft_target_update(self.qf2, self.target_qf2, soft_target_update_rate) - - def train(self, batch, batch_success=None): - self._total_steps += 1 - - observations = batch['observations'] - actions = batch['actions'] - rewards = batch['rewards'] - next_observations = batch['next_observations'] - dones = batch['dones'] - - # retrieve local experts information - lambda_s_current = batch['use_local_current'] - # lambda_s_next = batch['use_local_next'] # no need - expert_actions = batch['expert_actions'] - # next_expert_actions = batch['next_expert_actions'] # no need - - new_actions, log_pi = self.policy(observations) - - if self.config.use_automatic_entropy_tuning: - alpha_loss = -(self.log_alpha() * (log_pi + self.config.target_entropy).detach()).mean() - alpha = self.log_alpha().exp() * self.config.alpha_multiplier - else: - alpha_loss = 
observations.new_tensor(0.0) - alpha = observations.new_tensor(self.config.alpha_multiplier) - - # It may be possible to tune beta according to a specific loss - if self.config.use_automatic_beta_tuning: - beta_loss = 1 - beta = self.log_beta().exp() - else: - beta_loss = observations.new_tensor(0.0) - beta = observations.new_tensor(self.beta) - - """ Policy loss """ - q_new_actions = torch.min( - self.qf1(observations, new_actions), - self.qf2(observations, new_actions), - ) - policy_loss = (alpha*log_pi - q_new_actions).mean() - - # PolicyGuided learning: loss is minus likelihood of expert action - guided_loss = beta * -(lambda_s_current * self.policy.log_prob(observations, expert_actions)).mean() - - policy_loss = policy_loss + guided_loss - - """ Q function loss """ - q1_pred = self.qf1(observations, actions) - q2_pred = self.qf2(observations, actions) - - with torch.no_grad(): - new_next_actions, next_log_pi = self.policy(next_observations) - - target_q_values = torch.min( - self.target_qf1(next_observations, new_next_actions), - self.target_qf2(next_observations, new_next_actions), - ) - - if self.config.backup_entropy: - target_q_values = target_q_values - alpha * next_log_pi - - q_target = self.config.reward_scale * rewards + (1. - dones) * self.config.discount * target_q_values - qf1_loss = F.mse_loss(q1_pred, q_target.detach()) - qf2_loss = F.mse_loss(q2_pred, q_target.detach()) - qf_loss = qf1_loss + qf2_loss - - if self.config.use_automatic_entropy_tuning: - self.alpha_optimizer.zero_grad() - alpha_loss.backward() - self.alpha_optimizer.step() - - self.policy_optimizer.zero_grad() - policy_loss.backward() - self.policy_optimizer.step() - - self.qf_optimizer.zero_grad() - qf_loss.backward() - self.qf_optimizer.step() - - if self.total_steps % self.config.target_update_period == 0: - self.update_target_network( - self.config.soft_target_update_rate - ) - - metrics_to_return = dict( - log_pi=log_pi.mean().item(), - policy_loss=policy_loss.item(), - guided_policy_loss=guided_loss.item(), - initial_policy_loss=policy_loss.item()-guided_loss.item(), - qf1_loss=qf1_loss.item(), - qf2_loss=qf2_loss.item(), - alpha_loss=alpha_loss.item(), - alpha=alpha.item(), - beta_loss=beta_loss.item(), - beta=beta.item(), - average_qf1=q1_pred.mean().item(), - average_qf2=q2_pred.mean().item(), - average_target_q=target_q_values.mean().item(), - total_steps=self.total_steps, - ) - - return metrics_to_return - - def torch_to_device(self, device): - for module in self.modules: - module.to(device) - - def get_action(self, - env, - observation, - deterministic=False, - add_local_information=False): - """ - The PolicyGuidedAgent always picks the agent generated by the policy. - """ - action = self.sampler_policy( - np.expand_dims(observation, 0), deterministic=deterministic - )[0, :] - if add_local_information: - use_local = self.use_local.get_use_local(env, - observation) - expert_action = self.local_expert.get_action(observation, - init_action=action, - env=env) - return action, use_local, expert_action - return action - - @property - def modules(self): - modules = [self.policy, self.qf1, self.qf2, self.target_qf1, self.target_qf2] - if self.config.use_automatic_entropy_tuning: - modules.append(self.log_alpha) - return modules - - @property - def total_steps(self): - return self._total_steps +# 2023.02.14-Changed for RLLG +# Huawei Technologies Co., Ltd. + +# Copyright (c) 2020 Xinyang Geng. + +# All rights reserved. 
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import Optional, Tuple, Any, Dict, List, Union
+from ml_collections import ConfigDict
+import numpy as np
+import torch
+import torch.optim as optim
+import torch.nn.functional as F
+
+from agents.common.model import Scalar, soft_target_update, SamplerPolicy
+
+
+class PIG(object):
+    """
+    Policy Improvement Guided (PIG) algorithm implementation.
+
+    Parameters:
+    -----------
+    config: dict
+        Configuration parameters for PIG.
+    policy: torch.nn.Module
+        The policy network.
+    sampler_policy: SamplerPolicy
+        The sampler policy network.
+    qf1: torch.nn.Module
+        The first critic network.
+    qf2: torch.nn.Module
+        The second critic network.
+    target_qf1: torch.nn.Module
+        The target network for the first critic.
+    target_qf2: torch.nn.Module
+        The target network for the second critic.
+    use_local: float
+        Float confidence function indicating whether to trust local policies.
+    local_expert: Any
+        Local expert.
+    beta: float
+        Strength of the regularization.
+    """
+
+    @staticmethod
+    def get_default_config(updates: Optional[Dict] = None) -> ConfigDict:
+        """
+        Get the default configuration for PIG.
+
+        Parameters:
+        -----------
+        updates: dict, optional
+            Optional dictionary to update default configuration.
+
+        Returns:
+        --------
+        ConfigDict
+            Default configuration for PIG.
+        """
+        config = ConfigDict()
+        config.discount = 0.99
+        config.reward_scale = 1.0
+        config.alpha_multiplier = 1.0
+        config.use_automatic_entropy_tuning = True
+        config.backup_entropy = True
+        config.target_entropy = 0.0
+        config.policy_lr = 3e-4
+        config.qf_lr = 3e-4
+        config.optimizer_type = 'adam'
+        config.soft_target_update_rate = 5e-3
+        config.target_update_period = 1
+        config.use_automatic_beta_tuning = False
+        config.target_beta = 0.0
+
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+
+    def __init__(self,
+                 config: Dict,
+                 policy: torch.nn.Module,
+                 sampler_policy: SamplerPolicy,
+                 qf1: torch.nn.Module,
+                 qf2: torch.nn.Module,
+                 target_qf1: torch.nn.Module,
+                 target_qf2: torch.nn.Module,
+                 use_local: float,
+                 local_expert: Any,
+                 beta: float):
+        self.config = PIG.get_default_config(config)
+        self.policy = policy
+        self.sampler_policy = sampler_policy
+        self.qf1 = qf1
+        self.qf2 = qf2
+        self.target_qf1 = target_qf1
+        self.target_qf2 = target_qf2
+
+        # hyper parameter
+        self.use_local = use_local
+        self.local_expert = local_expert
+        self.beta = beta
+        self.training = True
+
+        optimizer_class = {
+            'adam': optim.Adam,
+            'sgd': optim.SGD,
+        }[self.config.optimizer_type]
+
+        self.policy_optimizer = optimizer_class(
+            self.policy.parameters(), self.config.policy_lr,
+        )
+        self.qf_optimizer = optimizer_class(
+            list(self.qf1.parameters()) + list(self.qf2.parameters()), self.config.qf_lr
+        )
+
+        if self.config.use_automatic_entropy_tuning:
+            self.log_alpha = Scalar(0.0)
+            self.alpha_optimizer = optimizer_class(
+                self.log_alpha.parameters(),
+                lr=self.config.policy_lr,
+            )
+        else:
+            self.log_alpha = None
+
+        if self.config.use_automatic_beta_tuning:
+            self.log_beta = Scalar(0.0)
+            self.beta_optimizer = optimizer_class(
+                self.log_beta.parameters(),
+                lr=self.config.policy_lr,
+            )
+        else:
+            self.log_beta = None
+
+        self.update_target_network(1.0)
+        self._total_steps = 0
+
+    def update_target_network(self, soft_target_update_rate: float) -> None:
+        """
+        Update the target networks with soft target updates.
+ + Parameters: + ----------- + soft_target_update_rate: float + Rate of soft target network updates. + """ + soft_target_update(self.qf1, self.target_qf1, soft_target_update_rate) + soft_target_update(self.qf2, self.target_qf2, soft_target_update_rate) + + def train(self, batch: Dict[str, Any], batch_success: Optional[Dict[str, torch.Tensor]] = None) -> Dict[ + str, Any]: + """ + Train the PIG agent on a batch of experiences. + + Parameters: + ----------- + batch: dict + A dictionary containing the the transitions. + batch_success: dict, optional + A dictionary containing the the transitions. + + Returns: + -------- + dict + A dictionary containing training metrics. + """ + self._total_steps += 1 + + observations = batch['observations'] + actions = batch['actions'] + rewards = batch['rewards'] + next_observations = batch['next_observations'] + dones = batch['dones'] + + # retrieve local experts information + lambda_s_current = batch['use_local_current'] + # lambda_s_next = batch['use_local_next'] # no need + expert_actions = batch['expert_actions'] + # next_expert_actions = batch['next_expert_actions'] # no need + + new_actions, log_pi = self.policy(observations) + + if self.config.use_automatic_entropy_tuning: + alpha_loss = -(self.log_alpha() * (log_pi + self.config.target_entropy).detach()).mean() + alpha = self.log_alpha().exp() * self.config.alpha_multiplier + else: + alpha_loss = observations.new_tensor(0.0) + alpha = observations.new_tensor(self.config.alpha_multiplier) + + # It may be possible to tune beta according to a specific loss + if self.config.use_automatic_beta_tuning: + beta_loss = 1 + beta = self.log_beta().exp() + else: + beta_loss = observations.new_tensor(0.0) + beta = observations.new_tensor(self.beta) + + """ Policy loss """ + q_new_actions = torch.min( + self.qf1(observations, new_actions), + self.qf2(observations, new_actions), + ) + policy_loss = (alpha*log_pi - q_new_actions).mean() + + # PolicyGuided learning: loss is minus likelihood of expert action + guided_loss = beta * -(lambda_s_current * self.policy.log_prob(observations, expert_actions)).mean() + + policy_loss = policy_loss + guided_loss + + """ Q function loss """ + q1_pred = self.qf1(observations, actions) + q2_pred = self.qf2(observations, actions) + + with torch.no_grad(): + new_next_actions, next_log_pi = self.policy(next_observations) + + target_q_values = torch.min( + self.target_qf1(next_observations, new_next_actions), + self.target_qf2(next_observations, new_next_actions), + ) + + if self.config.backup_entropy: + target_q_values = target_q_values - alpha * next_log_pi + + q_target = self.config.reward_scale * rewards + (1. 
- dones) * self.config.discount * target_q_values + qf1_loss = F.mse_loss(q1_pred, q_target.detach()) + qf2_loss = F.mse_loss(q2_pred, q_target.detach()) + qf_loss = qf1_loss + qf2_loss + + if self.config.use_automatic_entropy_tuning: + self.alpha_optimizer.zero_grad() + alpha_loss.backward() + self.alpha_optimizer.step() + + self.policy_optimizer.zero_grad() + policy_loss.backward() + self.policy_optimizer.step() + + self.qf_optimizer.zero_grad() + qf_loss.backward() + self.qf_optimizer.step() + + if self.total_steps % self.config.target_update_period == 0: + self.update_target_network( + self.config.soft_target_update_rate + ) + + metrics_to_return = dict( + log_pi=log_pi.mean().item(), + policy_loss=policy_loss.item(), + guided_policy_loss=guided_loss.item(), + initial_policy_loss=policy_loss.item()-guided_loss.item(), + qf1_loss=qf1_loss.item(), + qf2_loss=qf2_loss.item(), + alpha_loss=alpha_loss.item(), + alpha=alpha.item(), + beta_loss=beta_loss.item(), + beta=beta.item(), + average_qf1=q1_pred.mean().item(), + average_qf2=q2_pred.mean().item(), + average_target_q=target_q_values.mean().item(), + total_steps=self.total_steps, + ) + + return metrics_to_return + + def torch_to_device(self, device: torch.device) -> None: + """ + Move all modules to the specified device. + + Parameters: + ----------- + device: torch.device + The target device. + """ + for module in self.modules: + module.to(device) + + def get_action(self, + env: Any, + observation: np.ndarray, + deterministic: bool = False, + add_local_information: bool = False) -> Union[np.ndarray, Tuple[np.ndarray, float, np.ndarray]]: + """ + Get an action from the policy. + + Parameters: + ----------- + env: Any + The environment. + observation: np.ndarray + The current observation. + deterministic: bool, optional + Whether to sample a deterministic action. + add_local_information: bool, optional + Whether to add local information. + + Returns: + -------- + Tuple[np.ndarray, float, np.ndarray] + The action, local information, and expert action. + """ + action = self.sampler_policy( + np.expand_dims(observation, 0), deterministic=deterministic + )[0, :] + if add_local_information: + use_local = self.use_local.get_use_local(env, + observation) + expert_action = self.local_expert.get_action(observation, + init_action=action, + env=env) + return action, use_local, expert_action + return action + + @property + def modules(self) -> List[torch.nn.Module]: + """ + Get a list of modules. + + Returns: + -------- + List[nn.Module] + The list of modules including policy, q-functions, and optional log_alpha. + """ + modules = [self.policy, self.qf1, self.qf2, self.target_qf1, self.target_qf2] + if self.config.use_automatic_entropy_tuning: + modules.append(self.log_alpha) + return modules + + @property + def total_steps(self) -> int: + """ + Get the total number of steps taken. + + Returns: + -------- + int + The total number of steps. + """ + return self._total_steps diff --git a/RLLG/agents/algos/sac.py b/RLLG/agents/algos/sac.py index 3aa94d11..393c45aa 100644 --- a/RLLG/agents/algos/sac.py +++ b/RLLG/agents/algos/sac.py @@ -1,184 +1,289 @@ -# 2023.02.14-Changed for RLLG -# Huawei Technologies Co., Ltd. - -# Copyright (c) 2020 Xinyang Geng. - -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
- - -from ml_collections import ConfigDict -import numpy as np -import torch -import torch.optim as optim -import torch.nn.functional as F - -from agents.common.model import Scalar, soft_target_update - - -class SAC(object): - - @staticmethod - def get_default_config(updates=None): - config = ConfigDict() - config.discount = 0.99 - config.reward_scale = 1.0 - config.alpha_multiplier = 1.0 - config.use_automatic_entropy_tuning = True - config.backup_entropy = True - config.target_entropy = 0.0 - config.policy_lr = 3e-4 - config.qf_lr = 3e-4 - config.optimizer_type = 'adam' - config.soft_target_update_rate = 5e-3 - config.target_update_period = 1 - - if updates is not None: - config.update(ConfigDict(updates).copy_and_resolve_references()) - return config - - def __init__(self, config, policy, sampler_policy, qf1, qf2, target_qf1, target_qf2): - self.config = SAC.get_default_config(config) - self.policy = policy - self.sampler_policy = sampler_policy - self.qf1 = qf1 - self.qf2 = qf2 - self.target_qf1 = target_qf1 - self.target_qf2 = target_qf2 - - optimizer_class = { - 'adam': optim.Adam, - 'sgd': optim.SGD, - }[self.config.optimizer_type] - - self.policy_optimizer = optimizer_class( - self.policy.parameters(), self.config.policy_lr, - ) - self.qf_optimizer = optimizer_class( - list(self.qf1.parameters()) + list(self.qf2.parameters()), self.config.qf_lr - ) - - if self.config.use_automatic_entropy_tuning: - self.log_alpha = Scalar(0.0) - self.alpha_optimizer = optimizer_class( - self.log_alpha.parameters(), - lr=self.config.policy_lr, - ) - else: - self.log_alpha = None - - self.update_target_network(1.0) - self._total_steps = 0 - - def update_target_network(self, soft_target_update_rate): - soft_target_update(self.qf1, self.target_qf1, soft_target_update_rate) - soft_target_update(self.qf2, self.target_qf2, soft_target_update_rate) - - def train(self, batch, batch_success=None): - self._total_steps += 1 - - # classic obs - observations = batch['observations'] - actions = batch['actions'] - rewards = batch['rewards'] - next_observations = batch['next_observations'] - dones = batch['dones'] - - new_actions, log_pi = self.policy(observations) - - if self.config.use_automatic_entropy_tuning: - alpha_loss = -(self.log_alpha() * (log_pi + self.config.target_entropy).detach()).mean() - alpha = self.log_alpha().exp() * self.config.alpha_multiplier - else: - alpha_loss = observations.new_tensor(0.0) - alpha = observations.new_tensor(self.config.alpha_multiplier) - - """ Policy loss """ - q_new_actions = torch.min( - self.qf1(observations, new_actions), - self.qf2(observations, new_actions), - ) - policy_loss = (alpha*log_pi - q_new_actions).mean() - - """ Q function loss """ - q1_pred = self.qf1(observations, actions) - q2_pred = self.qf2(observations, actions) - - with torch.no_grad(): - new_next_actions, next_log_pi = self.policy(next_observations) - - target_q_values = torch.min( - self.target_qf1(next_observations, new_next_actions), - self.target_qf2(next_observations, new_next_actions), - ) - - if self.config.backup_entropy: - target_q_values = target_q_values - alpha * next_log_pi - - q_target = self.config.reward_scale * rewards + (1. 
- dones) * self.config.discount * target_q_values - qf1_loss = F.mse_loss(q1_pred, q_target.detach()) - qf2_loss = F.mse_loss(q2_pred, q_target.detach()) - qf_loss = qf1_loss + qf2_loss - - if self.config.use_automatic_entropy_tuning: - self.alpha_optimizer.zero_grad() - alpha_loss.backward() - self.alpha_optimizer.step() - - self.policy_optimizer.zero_grad() - policy_loss.backward() - self.policy_optimizer.step() - - self.qf_optimizer.zero_grad() - qf_loss.backward() - self.qf_optimizer.step() - - if self.total_steps % self.config.target_update_period == 0: - self.update_target_network( - self.config.soft_target_update_rate - ) - - metrics_to_return = dict( - log_pi=log_pi.mean().item(), - policy_loss=policy_loss.item(), - qf1_loss=qf1_loss.item(), - qf2_loss=qf2_loss.item(), - alpha_loss=alpha_loss.item(), - alpha=alpha.item(), - average_qf1=q1_pred.mean().item(), - average_qf2=q2_pred.mean().item(), - average_target_q=target_q_values.mean().item(), - total_steps=self.total_steps, - ) - - return metrics_to_return - - - def torch_to_device(self, device): - for module in self.modules: - module.to(device) - - def get_action(self, - env, - observation, - deterministic=False, - add_local_information=False): - action = self.sampler_policy( - np.expand_dims(observation, 0), deterministic=deterministic - )[0, :] - if add_local_information: - return action, 0, np.zeros(action.shape) - return action - - @property - def modules(self): - modules = [self.policy, self.qf1, self.qf2, self.target_qf1, self.target_qf2] - if self.config.use_automatic_entropy_tuning: - modules.append(self.log_alpha) - return modules - - @property - def total_steps(self): - return self._total_steps +# 2023.02.14-Changed for RLLG +# Huawei Technologies Co., Ltd. + +# Copyright (c) 2020 Xinyang Geng. + +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Optional, Tuple, Any, Dict, List, Union +from ml_collections import ConfigDict +import numpy as np +import torch +import torch.optim as optim +import torch.nn.functional as F +from agents.common.model import Scalar, soft_target_update, SamplerPolicy + + +class SAC(object): + """ + Soft Actor-Critic (SAC) algorithm implementation. + + Parameters: + ----------- + config: dict + Configuration parameters for SAC. + policy: torch.nn.Module + The policy network. + sampler_policy: SamplerPolicy + The sampler policy network. + qf1: torch.nn.Module + The first critic network. + qf2: torch.nn.Module + The second critic network. + target_qf1: torch.nn.Module + The target network for the first critic. + target_qf2: torch.nn.Module + The target network for the second critic. + """ + + @staticmethod + def get_default_config(updates: Optional[Dict] = None) -> ConfigDict: + """ + Get the default configuration for SAC. + + Parameters: + ----------- + updates: dict, optional + Optional dictionary to update default configuration. + + Returns: + -------- + ConfigDict + Default configuration for SAC. 
+ """ + config = ConfigDict() + config.discount = 0.99 + config.reward_scale = 1.0 + config.alpha_multiplier = 1.0 + config.use_automatic_entropy_tuning = True + config.backup_entropy = True + config.target_entropy = 0.0 + config.policy_lr = 3e-4 + config.qf_lr = 3e-4 + config.optimizer_type = 'adam' + config.soft_target_update_rate = 5e-3 + config.target_update_period = 1 + + if updates is not None: + config.update(ConfigDict(updates).copy_and_resolve_references()) + return config + + def __init__(self, + config: Dict, + policy: torch.nn.Module, + sampler_policy: SamplerPolicy, + qf1: torch.nn.Module, + qf2: torch.nn.Module, + target_qf1: torch.nn.Module, + target_qf2: torch.nn.Module): + self.config = SAC.get_default_config(config) + self.policy = policy + self.sampler_policy = sampler_policy + self.qf1 = qf1 + self.qf2 = qf2 + self.target_qf1 = target_qf1 + self.target_qf2 = target_qf2 + + optimizer_class = { + 'adam': optim.Adam, + 'sgd': optim.SGD, + }[self.config.optimizer_type] + + self.policy_optimizer = optimizer_class( + self.policy.parameters(), self.config.policy_lr, + ) + self.qf_optimizer = optimizer_class( + list(self.qf1.parameters()) + list(self.qf2.parameters()), self.config.qf_lr + ) + + if self.config.use_automatic_entropy_tuning: + self.log_alpha = Scalar(0.0) + self.alpha_optimizer = optimizer_class( + self.log_alpha.parameters(), + lr=self.config.policy_lr, + ) + else: + self.log_alpha = None + + self.update_target_network(1.0) + self._total_steps = 0 + + def update_target_network(self, soft_target_update_rate: float) -> None: + """ + Update the target networks with soft target updates. + + Parameters: + ----------- + soft_target_update_rate: float + Rate of soft target network updates. + """ + soft_target_update(self.qf1, self.target_qf1, soft_target_update_rate) + soft_target_update(self.qf2, self.target_qf2, soft_target_update_rate) + + def train(self, batch: Dict[str, Any], batch_success: Optional[Dict[str, torch.Tensor]] = None) -> Dict[ + str, Any]: + """ + Train the SAC (Soft Actor-Critic) agent on a batch of experiences. + + Parameters: + ----------- + batch: dict + A dictionary containing the the transitions. + batch_success: dict, optional + A dictionary containing the the transitions. + + Returns: + -------- + dict + A dictionary containing training metrics. 
+ """ + self._total_steps += 1 + + # classic obs + observations = batch['observations'] + actions = batch['actions'] + rewards = batch['rewards'] + next_observations = batch['next_observations'] + dones = batch['dones'] + + new_actions, log_pi = self.policy(observations) + + if self.config.use_automatic_entropy_tuning: + alpha_loss = -(self.log_alpha() * (log_pi + self.config.target_entropy).detach()).mean() + alpha = self.log_alpha().exp() * self.config.alpha_multiplier + else: + alpha_loss = observations.new_tensor(0.0) + alpha = observations.new_tensor(self.config.alpha_multiplier) + + """ Policy loss """ + q_new_actions = torch.min( + self.qf1(observations, new_actions), + self.qf2(observations, new_actions), + ) + policy_loss = (alpha*log_pi - q_new_actions).mean() + + """ Q function loss """ + q1_pred = self.qf1(observations, actions) + q2_pred = self.qf2(observations, actions) + + with torch.no_grad(): + new_next_actions, next_log_pi = self.policy(next_observations) + + target_q_values = torch.min( + self.target_qf1(next_observations, new_next_actions), + self.target_qf2(next_observations, new_next_actions), + ) + + if self.config.backup_entropy: + target_q_values = target_q_values - alpha * next_log_pi + + q_target = self.config.reward_scale * rewards + (1. - dones) * self.config.discount * target_q_values + qf1_loss = F.mse_loss(q1_pred, q_target.detach()) + qf2_loss = F.mse_loss(q2_pred, q_target.detach()) + qf_loss = qf1_loss + qf2_loss + + if self.config.use_automatic_entropy_tuning: + self.alpha_optimizer.zero_grad() + alpha_loss.backward() + self.alpha_optimizer.step() + + self.policy_optimizer.zero_grad() + policy_loss.backward() + self.policy_optimizer.step() + + self.qf_optimizer.zero_grad() + qf_loss.backward() + self.qf_optimizer.step() + + if self.total_steps % self.config.target_update_period == 0: + self.update_target_network( + self.config.soft_target_update_rate + ) + + metrics_to_return = dict( + log_pi=log_pi.mean().item(), + policy_loss=policy_loss.item(), + qf1_loss=qf1_loss.item(), + qf2_loss=qf2_loss.item(), + alpha_loss=alpha_loss.item(), + alpha=alpha.item(), + average_qf1=q1_pred.mean().item(), + average_qf2=q2_pred.mean().item(), + average_target_q=target_q_values.mean().item(), + total_steps=self.total_steps, + ) + + return metrics_to_return + + def torch_to_device(self, device: torch.device) -> None: + """ + Move all modules to the specified device. + + Parameters: + ----------- + device: torch.device + The target device. + """ + for module in self.modules: + module.to(device) + + def get_action(self, + env: Any, + observation: np.ndarray, + deterministic: bool = False, + add_local_information: bool = False) -> Union[np.ndarray, Tuple[np.ndarray, float, np.ndarray]]: + """ + Get an action from the policy. + + Parameters: + ----------- + env: Any + The environment. + observation: np.ndarray + The current observation. + deterministic: bool, optional + Whether to sample a deterministic action. + add_local_information: bool, optional + Whether to add local information. + + Returns: + -------- + Tuple[np.ndarray, float, np.ndarray] + The action, local information, and expert action. + """ + action = self.sampler_policy( + np.expand_dims(observation, 0), deterministic=deterministic + )[0, :] + if add_local_information: + return action, 0, np.zeros(action.shape) + return action + + @property + def modules(self) -> List[torch.nn.Module]: + """ + Get a list of modules. 
+ + Returns: + -------- + List[nn.Module] + The list of modules including policy, q-functions, and optional log_alpha. + """ + modules = [self.policy, self.qf1, self.qf2, self.target_qf1, self.target_qf2] + if self.config.use_automatic_entropy_tuning: + modules.append(self.log_alpha) + return modules + + @property + def total_steps(self) -> int: + """ + Get the total number of steps taken. + + Returns: + -------- + int + The total number of steps. + """ + return self._total_steps diff --git a/RLLG/agents/algos/sag.py b/RLLG/agents/algos/sag.py index db74a526..68a3e112 100644 --- a/RLLG/agents/algos/sag.py +++ b/RLLG/agents/algos/sag.py @@ -1,226 +1,305 @@ -# 2023.02.14-Changed for RLLG -# Huawei Technologies Co., Ltd. - -# Copyright (c) 2020 Xinyang Geng. - -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - - -from ml_collections import ConfigDict -import numpy as np -import torch -import torch.optim as optim -import torch.nn.functional as F - -from agents.common.model import Scalar, soft_target_update - - -class SAG(object): - - @staticmethod - def get_default_config(updates=None): - config = ConfigDict() - config.discount = 0.99 - config.reward_scale = 1.0 - config.alpha_multiplier = 1.0 - config.use_automatic_entropy_tuning = True - config.backup_entropy = True - config.target_entropy = 0.0 - config.policy_lr = 3e-4 - config.qf_lr = 3e-4 - config.optimizer_type = 'adam' - config.soft_target_update_rate = 5e-3 - config.target_update_period = 1 - - if updates is not None: - config.update(ConfigDict(updates).copy_and_resolve_references()) - return config - - def __init__(self, config, policy, sampler_policy, qf1, qf2, target_qf1, target_qf2, - use_local, local_expert): - self.config = SAG.get_default_config(config) - self.policy = policy - self.sampler_policy = sampler_policy - self.qf1 = qf1 - self.qf2 = qf2 - self.target_qf1 = target_qf1 - self.target_qf2 = target_qf2 - - # hyperparams - self.use_local = use_local - self.local_expert = local_expert - - optimizer_class = { - 'adam': optim.Adam, - 'sgd': optim.SGD, - }[self.config.optimizer_type] - - self.policy_optimizer = optimizer_class( - self.policy.parameters(), self.config.policy_lr, - ) - self.qf_optimizer = optimizer_class( - list(self.qf1.parameters()) + list(self.qf2.parameters()), self.config.qf_lr - ) - - if self.config.use_automatic_entropy_tuning: - self.log_alpha = Scalar(0.0) - self.alpha_optimizer = optimizer_class( - self.log_alpha.parameters(), - lr=self.config.policy_lr, - ) - else: - self.log_alpha = None - - self.update_target_network(1.0) - self._total_steps = 0 - - def update_target_network(self, soft_target_update_rate): - soft_target_update(self.qf1, self.target_qf1, soft_target_update_rate) - soft_target_update(self.qf2, self.target_qf2, soft_target_update_rate) - - def train(self, batch, batch_success=None): - self._total_steps += 1 - - # classic obs - observations = batch['observations'] - actions = batch['actions'] - rewards = batch['rewards'] - next_observations = batch['next_observations'] - dones = batch['dones'] - - # retrieve local experts information - lambda_s_current = batch['use_local_current'] - lambda_s_next = batch['use_local_next'] - expert_actions = batch['expert_actions'] - next_expert_actions = batch['next_expert_actions'] - - new_actions, log_pi = self.policy(observations) - - if self.config.use_automatic_entropy_tuning: - alpha_loss = -(self.log_alpha() * (log_pi + 
self.config.target_entropy).detach()).mean() - alpha = self.log_alpha().exp() * self.config.alpha_multiplier - else: - alpha_loss = observations.new_tensor(0.0) - alpha = observations.new_tensor(self.config.alpha_multiplier) - - """ Policy loss """ - if self.qf1.return_last_layer: - q_new_actions = torch.min( - self.qf1(observations, new_actions)[0], - self.qf2(observations, new_actions)[0], - ) - else: - q_new_actions = torch.min( - self.qf1(observations, new_actions), - self.qf2(observations, new_actions), - ) - policy_loss = (alpha*log_pi - q_new_actions).mean() - - """ Q function loss """ - if self.qf1.return_last_layer: - q1_pred, features_q1 = self.qf1(observations, actions) - q2_pred, features_q2 = self.qf2(observations, actions) - else: - q1_pred = self.qf1(observations, actions) - q2_pred = self.qf2(observations, actions) - - with torch.no_grad(): - new_next_actions, next_log_pi = self.policy(next_observations) - - # get new next actions from local experts --> REMEMBER THE POLICY IS SWITCHED - # new_next_actions = (new_next_actions.T * (1 - lambda_s_next)).T + \ - # (next_expert_actions.T * lambda_s_next).T - next_log_pi = (1 - lambda_s_next) * next_log_pi - - expert_target_q_values = torch.min( - self.target_qf1(next_observations, next_expert_actions), - self.target_qf2(next_observations, next_expert_actions), - ) - classic_target_q_values = torch.min( - self.target_qf1(next_observations, new_next_actions), - self.target_qf2(next_observations, new_next_actions), - ) - target_q_values = lambda_s_next * expert_target_q_values + \ - (1 - lambda_s_next) * classic_target_q_values - - if self.config.backup_entropy: - target_q_values = target_q_values - alpha * next_log_pi - - q_target = self.config.reward_scale * rewards + (1. - dones) * self.config.discount * target_q_values - qf1_loss = F.mse_loss(q1_pred, q_target.detach()) - qf2_loss = F.mse_loss(q2_pred, q_target.detach()) - qf_loss = qf1_loss + qf2_loss - - if self.config.use_automatic_entropy_tuning: - self.alpha_optimizer.zero_grad() - alpha_loss.backward() - self.alpha_optimizer.step() - - self.policy_optimizer.zero_grad() - policy_loss.backward() - self.policy_optimizer.step() - - self.qf_optimizer.zero_grad() - qf_loss.backward() - self.qf_optimizer.step() - - if self.total_steps % self.config.target_update_period == 0: - self.update_target_network( - self.config.soft_target_update_rate - ) - - metrics_to_return = dict( - log_pi=log_pi.mean().item(), - policy_loss=policy_loss.item(), - qf1_loss=qf1_loss.item(), - qf2_loss=qf2_loss.item(), - alpha_loss=alpha_loss.item(), - alpha=alpha.item(), - average_qf1=q1_pred.mean().item(), - average_qf2=q2_pred.mean().item(), - average_target_q=target_q_values.mean().item(), - total_steps=self.total_steps, - ) - - return metrics_to_return - - def torch_to_device(self, device): - for module in self.modules: - module.to(device) - - def get_action(self, - env, - observation, - deterministic=False, - add_local_information=False): - """ - In switched agent, the agent always picks the expert action if it is relevant. 
- """ - - action = self.sampler_policy( - np.expand_dims(observation, 0), deterministic=deterministic - )[0, :] - if add_local_information: - use_local = self.use_local.get_use_local(env, - observation) - expert_action = self.local_expert.get_action(observation, - init_action=action, - env=env) - if use_local: - return expert_action, use_local, expert_action - return action, use_local, expert_action - return action - - @property - def modules(self): - modules = [self.policy, self.qf1, self.qf2, self.target_qf1, self.target_qf2] - if self.config.use_automatic_entropy_tuning: - modules.append(self.log_alpha) - return modules - - @property - def total_steps(self): - return self._total_steps +# 2023.02.14-Changed for RLLG +# Huawei Technologies Co., Ltd. + +# Copyright (c) 2020 Xinyang Geng. + +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Optional, Tuple, Any, Dict, List, Union +from ml_collections import ConfigDict +import numpy as np +import torch +import torch.optim as optim +import torch.nn.functional as F + +from agents.common.model import Scalar, soft_target_update + + +class SAG(object): + + @staticmethod + def get_default_config(updates: Optional[Dict] = None) -> ConfigDict: + """ + Get the default configuration for SAG. + + Parameters: + ----------- + updates: dict, optional + Optional dictionary to update default configuration. + + Returns: + -------- + ConfigDict + Default configuration for SAC. + """ + config = ConfigDict() + config.discount = 0.99 + config.reward_scale = 1.0 + config.alpha_multiplier = 1.0 + config.use_automatic_entropy_tuning = True + config.backup_entropy = True + config.target_entropy = 0.0 + config.policy_lr = 3e-4 + config.qf_lr = 3e-4 + config.optimizer_type = 'adam' + config.soft_target_update_rate = 5e-3 + config.target_update_period = 1 + + if updates is not None: + config.update(ConfigDict(updates).copy_and_resolve_references()) + return config + + def __init__(self, config, policy, sampler_policy, qf1, qf2, target_qf1, target_qf2, + use_local, local_expert): + self.config = SAG.get_default_config(config) + self.policy = policy + self.sampler_policy = sampler_policy + self.qf1 = qf1 + self.qf2 = qf2 + self.target_qf1 = target_qf1 + self.target_qf2 = target_qf2 + + # hyperparams + self.use_local = use_local + self.local_expert = local_expert + + optimizer_class = { + 'adam': optim.Adam, + 'sgd': optim.SGD, + }[self.config.optimizer_type] + + self.policy_optimizer = optimizer_class( + self.policy.parameters(), self.config.policy_lr, + ) + self.qf_optimizer = optimizer_class( + list(self.qf1.parameters()) + list(self.qf2.parameters()), self.config.qf_lr + ) + + if self.config.use_automatic_entropy_tuning: + self.log_alpha = Scalar(0.0) + self.alpha_optimizer = optimizer_class( + self.log_alpha.parameters(), + lr=self.config.policy_lr, + ) + else: + self.log_alpha = None + + self.update_target_network(1.0) + self._total_steps = 0 + + def update_target_network(self, soft_target_update_rate: float) -> None: + """ + Update the target networks with soft target updates. + + Parameters: + ----------- + soft_target_update_rate: float + Rate of soft target network updates. 
+ """ + soft_target_update(self.qf1, self.target_qf1, soft_target_update_rate) + soft_target_update(self.qf2, self.target_qf2, soft_target_update_rate) + + def train(self, batch: Dict[str, Any], batch_success: Optional[Dict[str, torch.Tensor]] = None) -> Dict[ + str, Any]: + """ + Train the SAG agent on a batch of experiences. + + Parameters: + ----------- + batch: dict + A dictionary containing the the transitions. + batch_success: dict, optional + A dictionary containing the the transitions. + + Returns: + -------- + dict + A dictionary containing training metrics. + """ + self._total_steps += 1 + + # classic obs + observations = batch['observations'] + actions = batch['actions'] + rewards = batch['rewards'] + next_observations = batch['next_observations'] + dones = batch['dones'] + + # retrieve local experts information + lambda_s_current = batch['use_local_current'] + lambda_s_next = batch['use_local_next'] + expert_actions = batch['expert_actions'] + next_expert_actions = batch['next_expert_actions'] + + new_actions, log_pi = self.policy(observations) + + if self.config.use_automatic_entropy_tuning: + alpha_loss = -(self.log_alpha() * (log_pi + self.config.target_entropy).detach()).mean() + alpha = self.log_alpha().exp() * self.config.alpha_multiplier + else: + alpha_loss = observations.new_tensor(0.0) + alpha = observations.new_tensor(self.config.alpha_multiplier) + + """ Policy loss """ + if self.qf1.return_last_layer: + q_new_actions = torch.min( + self.qf1(observations, new_actions)[0], + self.qf2(observations, new_actions)[0], + ) + else: + q_new_actions = torch.min( + self.qf1(observations, new_actions), + self.qf2(observations, new_actions), + ) + policy_loss = (alpha*log_pi - q_new_actions).mean() + + """ Q function loss """ + if self.qf1.return_last_layer: + q1_pred, features_q1 = self.qf1(observations, actions) + q2_pred, features_q2 = self.qf2(observations, actions) + else: + q1_pred = self.qf1(observations, actions) + q2_pred = self.qf2(observations, actions) + + with torch.no_grad(): + new_next_actions, next_log_pi = self.policy(next_observations) + + # get new next actions from local experts --> REMEMBER THE POLICY IS SWITCHED + # new_next_actions = (new_next_actions.T * (1 - lambda_s_next)).T + \ + # (next_expert_actions.T * lambda_s_next).T + next_log_pi = (1 - lambda_s_next) * next_log_pi + + expert_target_q_values = torch.min( + self.target_qf1(next_observations, next_expert_actions), + self.target_qf2(next_observations, next_expert_actions), + ) + classic_target_q_values = torch.min( + self.target_qf1(next_observations, new_next_actions), + self.target_qf2(next_observations, new_next_actions), + ) + target_q_values = lambda_s_next * expert_target_q_values + \ + (1 - lambda_s_next) * classic_target_q_values + + if self.config.backup_entropy: + target_q_values = target_q_values - alpha * next_log_pi + + q_target = self.config.reward_scale * rewards + (1. 
- dones) * self.config.discount * target_q_values + qf1_loss = F.mse_loss(q1_pred, q_target.detach()) + qf2_loss = F.mse_loss(q2_pred, q_target.detach()) + qf_loss = qf1_loss + qf2_loss + + if self.config.use_automatic_entropy_tuning: + self.alpha_optimizer.zero_grad() + alpha_loss.backward() + self.alpha_optimizer.step() + + self.policy_optimizer.zero_grad() + policy_loss.backward() + self.policy_optimizer.step() + + self.qf_optimizer.zero_grad() + qf_loss.backward() + self.qf_optimizer.step() + + if self.total_steps % self.config.target_update_period == 0: + self.update_target_network( + self.config.soft_target_update_rate + ) + + metrics_to_return = dict( + log_pi=log_pi.mean().item(), + policy_loss=policy_loss.item(), + qf1_loss=qf1_loss.item(), + qf2_loss=qf2_loss.item(), + alpha_loss=alpha_loss.item(), + alpha=alpha.item(), + average_qf1=q1_pred.mean().item(), + average_qf2=q2_pred.mean().item(), + average_target_q=target_q_values.mean().item(), + total_steps=self.total_steps, + ) + + return metrics_to_return + + def torch_to_device(self, device: torch.device) -> None: + """ + Move all modules to the specified device. + + Parameters: + ----------- + device: torch.device + The target device. + """ + for module in self.modules: + module.to(device) + + def get_action(self, + env: Any, + observation: np.ndarray, + deterministic: bool = False, + add_local_information: bool = False) -> Union[np.ndarray, Tuple[np.ndarray, float, np.ndarray]]: + """ + Get an action from the policy. + + Parameters: + ----------- + env: Any + The environment. + observation: np.ndarray + The current observation. + deterministic: bool, optional + Whether to sample a deterministic action. + add_local_information: bool, optional + Whether to add local information. + + Returns: + -------- + Tuple[np.ndarray, float, np.ndarray] + The action, local information, and expert action. + """ + + action = self.sampler_policy( + np.expand_dims(observation, 0), deterministic=deterministic + )[0, :] + if add_local_information: + use_local = self.use_local.get_use_local(env, + observation) + expert_action = self.local_expert.get_action(observation, + init_action=action, + env=env) + if use_local: + return expert_action, use_local, expert_action + return action, use_local, expert_action + return action + + @property + def modules(self) -> List[torch.nn.Module]: + """ + Get a list of modules. + + Returns: + -------- + List[nn.Module] + The list of modules including policy, q-functions, and optional log_alpha. + """ + modules = [self.policy, self.qf1, self.qf2, self.target_qf1, self.target_qf2] + if self.config.use_automatic_entropy_tuning: + modules.append(self.log_alpha) + return modules + + @property + def total_steps(self) -> int: + """ + Get the total number of steps taken. + + Returns: + -------- + int + The total number of steps. + """ + return self._total_steps + diff --git a/RLLG/agents/common/config.py b/RLLG/agents/common/config.py new file mode 100644 index 00000000..ab5b6ccb --- /dev/null +++ b/RLLG/agents/common/config.py @@ -0,0 +1,67 @@ +from typing import Any, Dict, List, Optional, Tuple + + +def process_glob_config(config: Dict[str, Any]) \ + -> Tuple[List[str], Dict[str, Any], Dict[str, Any], Dict[str, Any], Dict[str, Any], List[float]]: + """ + Process a global configuration dictionary and extract relevant information. + + Parameters: + ---------- + config : Dict[str, Any] + The global configuration dictionary. 
+
+ Returns:
+ ----------
+ Tuple[List[str], Optional[Dict[str, Any]], Dict[str, Any], Dict[str, Any], Dict[str, Any], List[float]]
+ A tuple containing the extracted information:
+ - List of expert names.
+ - Dictionary for position tolerance (or None if not present).
+ - Dictionary for values for beta depending on the agent.
+ - Dictionary for values for delta depending on the agent.
+ - Dictionary for values for phi depending on the agent.
+ - List of decay parameters.
+ """
+ expert_names = config['local_experts']
+ del config['local_experts']
+ dict_pos_tol = None
+ if 'pos_tol' in config:
+ dict_pos_tol = config['pos_tol']
+ del config['pos_tol']
+ dict_beta = config['beta']
+ dict_delta = config['delta']
+ dict_phi = config['phi']
+ del config['beta']
+ del config['delta']
+ del config['phi']
+ decay_parameter_list = config['decay_parameter']
+ del config['decay_parameter']
+ return expert_names, dict_pos_tol, dict_beta, dict_delta, dict_phi, decay_parameter_list
+
+
+def process_config_per_agent(config: Dict[str, Any],
+ agent_name: str,
+ dict_beta: Dict[str, Any],
+ dict_delta: Dict[str, Any],
+ dict_phi: Dict[str, Any],
+ dict_pos_tol: Dict[str, Any]) -> None:
+ """
+ Process the configuration dictionary to make it dependent on the agent.
+
+ Parameters:
+ ----------
+ config : Dict[str, Any]
+ The global configuration dictionary, updated in place for the given agent.
+ dict_pos_tol : Optional[Dict[str, Any]]
+ Adds the pos_tol argument to the config dictionary; it is only useful for the safe cartpole environment.
+
+ Returns:
+ ----------
+ None
+ The function does not return anything.
+ """
+ if dict_pos_tol is not None:
+ config['pos_tol'] = dict_pos_tol[agent_name]
+ config['beta'] = dict_beta[agent_name]
+ config['delta'] = dict_delta[agent_name]
+ config['phi'] = dict_phi[agent_name]
diff --git a/RLLG/agents/common/creation_utils.py b/RLLG/agents/common/creation_utils.py
new file mode 100644
index 00000000..0e2af919
--- /dev/null
+++ b/RLLG/agents/common/creation_utils.py
@@ -0,0 +1,150 @@
+from typing import Type, Any, Dict, List, Optional, Tuple
+from envs.creation import get_env_and_control
+from agents.algos.sac import SAC
+from agents.algos.sag import SAG
+from agents.algos.pag import PAG
+from agents.algos.pig import PIG
+from agents.common.model import SamplerPolicy, ExpertSamplerPolicy
+from envs.cartpole.confidence import LambdaS
+import gym
+import torch
+
+
+dict_agents = {
+ 'SAC': SAC,
+ 'SAG': SAG,
+ 'PIG': PIG,
+ 'PAG': PAG,
+}
+
+
+def create_envs(cfg: Dict[str, Any]) -> Tuple[Type[gym.Env], Dict[str, Any], Type[gym.Env], Dict[str, Any]]:
+ """
+ Create training and testing environments with associated local control dictionaries based on the provided configuration.
+
+ Parameters:
+ ----------
+ cfg : Dict[str, Any]
+ The configuration dictionary.
+
+ Returns:
+ ----------
+ Tuple[Type[gym.Env], Dict[str, Any], Type[gym.Env], Dict[str, Any]]
+ A tuple containing the training and testing environments along with their respective local control dictionaries.
+ """ + limit_cart = None + reward_end = None + pos_tol = None + if 'limit_cart' in cfg: + limit_cart = cfg['limit_cart'] + if 'reward_end' in cfg: + reward_end = cfg['reward_end'] + if 'pos_tol' in cfg: + pos_tol = cfg['pos_tol'] + env_train, local_control_dict_train = get_env_and_control(name=cfg['env'], + orig_cwd=cfg['orig_cwd'], + device=cfg['device'], + limit_cart=limit_cart, + reward_end=reward_end, + pos_tol=pos_tol + ) + env_test, local_control_dict_test = get_env_and_control(name=cfg['env'], + orig_cwd=cfg['orig_cwd'], + device=cfg['device'], + limit_cart=limit_cart, + reward_end=reward_end, + pos_tol=pos_tol + ) + return env_train, local_control_dict_train, env_test, local_control_dict_test + + +def create_agent(cfg: Dict[str, Any], + agent_name: str, + policy: torch.nn.Module, + sampler_policy: SamplerPolicy, + qf1: torch.nn.Module, + qf2: torch.nn.Module, + target_qf1: torch.nn.Module, + target_qf2: torch.nn.Module, + lambda_s: Optional[LambdaS] = None, + local_expert: Optional[Any] = None, + parametrized_perturbation: Optional[Type[torch.nn.Module]] = None, + sampler_parametrized_perturbation: Optional[Type[ExpertSamplerPolicy]] = None) \ + -> Any: + """ + Create an instance of an RL agent based on the specified configuration and components. + + Parameters: + ---------- + cfg : Dict[str, Any] + The configuration dictionary. + agent_name : str + The name of the agent to be created. + policy : Type[torch.nn.Module] + The policy network. + sampler_policy : Type[SamplerPolicy] + The policy sampler. + qf1 : Type[torch.nn.Module] + The first critic network. + qf2 : Type[torch.nn.Module] + The second critic network. + target_qf1 : Type[torch.nn.Module] + The target network for the first critic. + target_qf2 : Type[torch.nn.Module] + The target network for the second critic. + lambda_s : Optional[Type[LambdaS]] + The lambda_s confidence class (optional). + local_expert : Optional[Type[Any]] + The local expert (optional, and can be under any form). + parametrized_perturbation : Optional[Type[torch.nn.Module]] + The parametrized perturbation network (optional). + sampler_parametrized_perturbation : Optional[Type[ExpertSamplerPolicy]] + The sampler for the parametrized perturbation network (optional). + + Returns: + ---------- + Any + An instance of the specified RL agent. + """ + if cfg['agent_name'] == 'SAC': + agent = dict_agents[agent_name](cfg, + policy, + sampler_policy, + qf1, + qf2, + target_qf1, + target_qf2) + elif cfg['agent_name'] == 'SAG': + agent = dict_agents[agent_name](cfg, + policy, + sampler_policy, + qf1, + qf2, + target_qf1, + target_qf2, + use_local=lambda_s, + local_expert=local_expert) + elif cfg['agent_name'] == 'PIG': + agent = dict_agents[agent_name](cfg, + policy, + sampler_policy, + qf1, + qf2, + target_qf1, + target_qf2, + use_local=lambda_s, + local_expert=local_expert, + beta=cfg['beta']) + else: + agent = dict_agents[agent_name](cfg, + policy, + sampler_policy, + qf1, + qf2, + target_qf1, + target_qf2, + use_local=lambda_s, + local_expert=local_expert, + parametrized_perturbation=parametrized_perturbation, + sampler_parametrized_perturbation=sampler_parametrized_perturbation) + return agent diff --git a/RLLG/agents/common/model.py b/RLLG/agents/common/model.py index dce82d22..717b8279 100644 --- a/RLLG/agents/common/model.py +++ b/RLLG/agents/common/model.py @@ -1,297 +1,653 @@ -# 2023.02.14-Changed for RLLG -# Huawei Technologies Co., Ltd. - -# Copyright (c) 2020 Xinyang Geng. - -# All rights reserved. 
- -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - - -import numpy as np -import torch -import torch.nn as nn -from torch.distributions import Normal -from torch.distributions.transformed_distribution import TransformedDistribution -from torch.distributions.transforms import TanhTransform - - -def extend_and_repeat(tensor, dim, repeat): - # Extend and repeast the tensor along dim axie and repeat it - ones_shape = [1 for _ in range(tensor.ndim + 1)] - ones_shape[dim] = repeat - return torch.unsqueeze(tensor, dim) * tensor.new_ones(ones_shape) - - -def soft_target_update(network, target_network, soft_target_update_rate): - target_network_params = {k: v for k, v in target_network.named_parameters()} - for k, v in network.named_parameters(): - target_network_params[k].data = ( - (1 - soft_target_update_rate) * target_network_params[k].data - + soft_target_update_rate * v.data - ) - - -class FullyConnectedNetwork(nn.Module): - - def __init__(self, input_dim, output_dim, arch='256-256', - activation="relu", return_last_layer=False): - super().__init__() - self.input_dim = input_dim - self.output_dim = output_dim - self.arch = arch - self.activation = activation - self.return_last_layer = return_last_layer - - d = input_dim - modules = [] - hidden_sizes = [int(h) for h in arch.split('-')] - - for hidden_size in hidden_sizes: - fc = nn.Linear(d, hidden_size) - modules.append(fc) - if self.activation == 'relu': - modules.append(nn.ReLU()) - elif self.activation == 'tanh': - modules.append(nn.Tanh()) - else: - raise NotImplementedError(f'activation is {self.activation}') - d = hidden_size - - last_fc = nn.Linear(d, output_dim) - - if self.return_last_layer: - self.network_but_last = nn.Sequential(*modules) - self.last_fc = last_fc - else: - modules.append(last_fc) - self.network = nn.Sequential(*modules) - - def forward(self, input_tensor): - if self.return_last_layer: - last_layer = self.network_but_last(input_tensor) - return self.last_fc(last_layer), last_layer.clone() - return self.network(input_tensor) - - -class ReparameterizedTanhGaussian(nn.Module): - - def __init__(self, log_std_min=-20.0, log_std_max=2.0, no_tanh=False): - super().__init__() - self.log_std_min = log_std_min - self.log_std_max = log_std_max - self.no_tanh = no_tanh - - def log_prob(self, mean, log_std, sample): - log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) - std = torch.exp(log_std) - if self.no_tanh: - action_distribution = Normal(mean, std) - else: - action_distribution = TransformedDistribution( - Normal(mean, std), TanhTransform(cache_size=1) - ) - return torch.sum(action_distribution.log_prob(sample), dim=-1) - - def forward(self, mean, log_std, deterministic=False): - log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) - std = torch.exp(log_std) - - if self.no_tanh: - action_distribution = Normal(mean, std) - else: - action_distribution = TransformedDistribution( - Normal(mean, std), TanhTransform(cache_size=1) - ) - - if deterministic: - action_sample = torch.tanh(mean) - else: - action_sample = action_distribution.rsample() - - log_prob = torch.sum( - action_distribution.log_prob(action_sample), dim=-1 - ) - - return action_sample, log_prob - - -class TanhGaussianPolicy(nn.Module): - - def __init__(self, observation_dim, action_dim, arch='256-256', - log_std_multiplier=1.0, log_std_offset=-1.0, no_tanh=False, - activation='relu'): - super().__init__() - self.observation_dim = observation_dim - 
self.action_dim = action_dim - self.arch = arch - self.no_tanh = no_tanh - - self.base_network = FullyConnectedNetwork( - observation_dim, 2 * action_dim, arch, - activation=activation - ) - self.log_std_multiplier = Scalar(log_std_multiplier) - self.log_std_offset = Scalar(log_std_offset) - self.tanh_gaussian = ReparameterizedTanhGaussian(no_tanh=no_tanh) - - def log_prob(self, observations, actions): - if actions.ndim == 3: - observations = extend_and_repeat(observations, 1, actions.shape[1]) - base_network_output = self.base_network(observations) - mean, log_std = torch.split(base_network_output, self.action_dim, dim=-1) - log_std = self.log_std_multiplier() * log_std + self.log_std_offset() - return self.tanh_gaussian.log_prob(mean, log_std, actions) - - def forward(self, observations, deterministic=False, repeat=None): - if repeat is not None: - observations = extend_and_repeat(observations, 1, repeat) - base_network_output = self.base_network(observations) - mean, log_std = torch.split(base_network_output, self.action_dim, dim=-1) - log_std = self.log_std_multiplier() * log_std + self.log_std_offset() - return self.tanh_gaussian(mean, log_std, deterministic) - - - -class ParametrizedPerturbationTanhGaussianPolicy(nn.Module): - - def __init__(self, - observation_dim, - action_dim, - arch='256-256', - log_std_multiplier=1.0, - log_std_offset=-1.0, - no_tanh=False, - activation='relu', - phi=0.5): - super().__init__() - self.observation_dim = observation_dim - self.action_dim = action_dim - self.arch = arch - self.no_tanh = no_tanh - self.phi = phi - - self.base_network = FullyConnectedNetwork( - observation_dim, 2 * action_dim, arch, - activation=activation - ) - self.log_std_multiplier = Scalar(log_std_multiplier) - self.log_std_offset = Scalar(log_std_offset) - self.tanh_gaussian = ReparameterizedTanhGaussian(no_tanh=no_tanh) - - def log_prob(self, observations, actions, expert_actions): - - if actions.ndim == 3: - observations = extend_and_repeat(observations, 1, actions.shape[1]) - base_network_output = self.base_network(observations) - mean, log_std = torch.split(base_network_output, self.action_dim, dim=-1) - log_std = self.log_std_multiplier() * log_std + self.log_std_offset() - - # get reversed actions to get the log prob of the expert parametrized policy - phi_actions = (actions - expert_actions) / self.phi - - return self.tanh_gaussian.log_prob(mean, log_std, phi_actions) - - def forward(self, observations, expert_actions, beta=0., deterministic=False, repeat=None): - if repeat is not None: - observations = extend_and_repeat(observations, 1, repeat) - base_network_output = self.base_network(observations) - mean, log_std = torch.split(base_network_output, self.action_dim, dim=-1) - log_std = self.log_std_multiplier() * log_std + self.log_std_offset() - actions, log_probs = self.tanh_gaussian(mean, log_std, deterministic) - return (expert_actions + self.phi * (1 - beta) * actions).clamp(-0.999, 0.999), log_probs - - -class SamplerPolicy(object): - - def __init__(self, policy, device, from_ext=False): - self.policy = policy - self.device = device - self.from_ext = from_ext - - def __call__(self, observations, deterministic=False): - with torch.no_grad(): - observations = torch.tensor( - observations, dtype=torch.float32, device=self.device - ) - actions, _ = self.policy(observations, deterministic) - actions = actions.cpu().numpy() - return np.clip(actions, a_min=-0.999, a_max=0.999) - - -class ExpertSamplerPolicy(object): - - def __init__(self, policy, device, from_ext=False): 
- self.policy = policy - self.device = device - self.from_ext = from_ext - - def __call__(self, observations, expert_actions, beta=1., deterministic=False): - with torch.no_grad(): - observations = torch.tensor( - observations, dtype=torch.float32, device=self.device - ) - expert_actions = torch.tensor( - expert_actions, dtype=torch.float32, device=self.device - ) - actions, _ = self.policy(observations, expert_actions, - beta=beta, - deterministic=deterministic) - actions = actions.cpu().numpy() - return np.clip(actions, a_min=-0.999, a_max=0.999) - - -class FullyConnectedQFunction(nn.Module): - - def __init__(self, observation_dim, action_dim, arch='256-256', - activation='relu', return_last_layer=False): - super().__init__() - self.observation_dim = observation_dim - self.action_dim = action_dim - self.arch = arch - self.return_last_layer = return_last_layer - self.network = FullyConnectedNetwork( - observation_dim + action_dim, 1, arch, activation=activation, - return_last_layer=return_last_layer - ) - - def forward(self, observations, actions): - if actions.ndim == 3 and observations.ndim == 2: - observations = extend_and_repeat(observations, 1, actions.shape[1]) - input_tensor = torch.cat([observations, actions], dim=-1) - if self.return_last_layer: - output, last_layer = self.network(input_tensor) - return torch.squeeze(output, dim=-1), last_layer - output = self.network(input_tensor) - return torch.squeeze(output, dim=-1) - - -class TD3Policy(nn.Module): - - def __init__(self, observation_dim, action_dim, arch='256-256'): - super(TD3Policy, self).__init__() - self.arch = arch - - self.base_network = FullyConnectedNetwork( - observation_dim, action_dim, arch - ) - - - def forward(self, observation, deterministic=False): - """ - Added the deterministic argument to be consitent with the code. - """ - a_init = self.base_network(observation) - return torch.tanh(a_init), None - - -class Scalar(nn.Module): - def __init__(self, init_value): - super().__init__() - self.constant = nn.Parameter( - torch.tensor(init_value, dtype=torch.float32) - ) - - def forward(self): - return self.constant +# 2023.02.14-Changed for RLLG +# Huawei Technologies Co., Ltd. + +# Copyright (c) 2020 Xinyang Geng. + +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Tuple, Optional, Union +import numpy as np +import torch +import torch.nn as nn +from torch.distributions import Normal +from torch.distributions.transformed_distribution import TransformedDistribution +from torch.distributions.transforms import TanhTransform + + +def extend_and_repeat(tensor: torch.Tensor, dim: int, repeat: int) -> torch.Tensor: + """ + Extend and repeat the tensor along the specified axis. + + Parameters: + ---------- + tensor : torch.Tensor + Input tensor. + dim : int + Dimension along which to extend and repeat. + repeat : int + Number of times to repeat the tensor. + + Returns: + ---------- + torch.Tensor + Extended and repeated tensor. + """ + # Extend and repeast the tensor along dim axie and repeat it + ones_shape = [1 for _ in range(tensor.ndim + 1)] + ones_shape[dim] = repeat + return torch.unsqueeze(tensor, dim) * tensor.new_ones(ones_shape) + + +def soft_target_update(network: nn.Module, target_network: nn.Module, soft_target_update_rate: float) -> None: + """ + Update the target network parameters using a soft update. + + Parameters: + ---------- + network : nn.Module + The source network. 
+ target_network : nn.Module + The target network to be updated. + soft_target_update_rate : float + The soft update rate. + + Returns: + ---------- + None + """ + target_network_params = {k: v for k, v in target_network.named_parameters()} + for k, v in network.named_parameters(): + target_network_params[k].data = ( + (1 - soft_target_update_rate) * target_network_params[k].data + + soft_target_update_rate * v.data + ) + + +class FullyConnectedNetwork(nn.Module): + """ + Fully connected neural network module. + + Parameters: + ---------- + input_dim : int + Dimension of the input. + output_dim : int + Dimension of the output. + arch : str, optional + Architecture of the network (default is '256-256'). + activation : str, optional + Activation function (default is 'relu'). + return_last_layer : bool, optional + Whether to return only the last layer (default is False). + """ + + def __init__(self, input_dim: int, output_dim: int, arch: Optional[str] = '256-256', + activation: Optional[str] = "relu", return_last_layer: Optional[bool] = False): + super().__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.arch = arch + self.activation = activation + self.return_last_layer = return_last_layer + + d = input_dim + modules = [] + hidden_sizes = [int(h) for h in arch.split('-')] + + for hidden_size in hidden_sizes: + fc = nn.Linear(d, hidden_size) + modules.append(fc) + if self.activation == 'relu': + modules.append(nn.ReLU()) + elif self.activation == 'tanh': + modules.append(nn.Tanh()) + else: + raise NotImplementedError(f'activation is {self.activation}') + d = hidden_size + + last_fc = nn.Linear(d, output_dim) + + if self.return_last_layer: + self.network_but_last = nn.Sequential(*modules) + self.last_fc = last_fc + else: + modules.append(last_fc) + self.network = nn.Sequential(*modules) + + def forward(self, input_tensor: torch.Tensor) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + """ + Forward pass through the network. + + Parameters: + ---------- + input_tensor : torch.Tensor + Input tensor. + + Returns: + ---------- + torch.Tensor or Tuple[torch.Tensor, torch.Tensor] + The output of the network, and optionally, the output of the last layer. + """ + if self.return_last_layer: + last_layer = self.network_but_last(input_tensor) + return self.last_fc(last_layer), last_layer.clone() + return self.network(input_tensor) + + +class ReparameterizedTanhGaussian(nn.Module): + """ + Tanh Gaussian distribution with reparametrized trick. + + Parameters: + ---------- + log_std_min : Optional[float], optional + Minimum value for the log standard deviation (default is -20.0). + log_std_max : Optional[float], optional + Maximum value for the log standard deviation (default is 2.0). + no_tanh : Optional[bool], optional + Whether to skip applying tanh to the sampled actions (default is False). + """ + + def __init__(self, log_std_min: Optional[float] = -20.0, + log_std_max: Optional[float] = 2.0, + no_tanh: Optional[bool] = False): + super().__init__() + self.log_std_min = log_std_min + self.log_std_max = log_std_max + self.no_tanh = no_tanh + + def log_prob(self, mean: torch.Tensor, log_std: torch.Tensor, sample: torch.Tensor) -> torch.Tensor: + """ + Compute the log probability of a sample under the distribution. + + Parameters: + ---------- + mean : torch.Tensor + Mean of the distribution. + log_std : torch.Tensor + Log standard deviation of the distribution. + sample : torch.Tensor + Sample to compute the log probability for. 
+ + Returns: + ---------- + torch.Tensor + Log probability of the sample. + """ + log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) + std = torch.exp(log_std) + if self.no_tanh: + action_distribution = Normal(mean, std) + else: + action_distribution = TransformedDistribution( + Normal(mean, std), TanhTransform(cache_size=1) + ) + return torch.sum(action_distribution.log_prob(sample), dim=-1) + + def forward(self, mean: torch.Tensor, log_std: torch.Tensor, deterministic: Optional[bool] = False) -> Tuple[ + torch.Tensor, torch.Tensor]: + """ + Generate a sample and compute the log probability. + + Parameters: + ---------- + mean : torch.Tensor + Mean of the distribution. + log_std : torch.Tensor + Log standard deviation of the distribution. + deterministic : bool, optional + Flag indicating whether to sample deterministically (default is False). + + Returns: + ---------- + Tuple[torch.Tensor, torch.Tensor] + Generated action sample and its log probability. + """ + log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) + std = torch.exp(log_std) + + if self.no_tanh: + action_distribution = Normal(mean, std) + else: + action_distribution = TransformedDistribution( + Normal(mean, std), TanhTransform(cache_size=1) + ) + + if deterministic: + action_sample = torch.tanh(mean) + else: + action_sample = action_distribution.rsample() + + log_prob = torch.sum( + action_distribution.log_prob(action_sample), dim=-1 + ) + + return action_sample, log_prob + + +class TanhGaussianPolicy(nn.Module): + """ + Policy module representing a Tanh Gaussian policy. + + Parameters: + ---------- + observation_dim : int + Dimensionality of the observation space. + action_dim : int + Dimensionality of the action space. + arch : str, optional + Architecture of the base network (default is '256-256'). + log_std_multiplier : float, optional + Multiplier for the log standard deviation (default is 1.0). + log_std_offset : float, optional + Offset for the log standard deviation (default is -1.0). + no_tanh : bool, optional + Whether to skip applying tanh to the sampled actions (default is False). + activation : str, optional + Activation function used in the base network (default is 'relu'). + """ + + def __init__(self, observation_dim: int, action_dim: int, arch: Optional[str] = '256-256', + log_std_multiplier: Optional[float] = 1.0, log_std_offset: Optional[float] = -1.0, + no_tanh: Optional[bool] = False, activation: Optional[str] = 'relu'): + super().__init__() + self.observation_dim = observation_dim + self.action_dim = action_dim + self.arch = arch + self.no_tanh = no_tanh + + self.base_network = FullyConnectedNetwork( + observation_dim, 2 * action_dim, arch, + activation=activation + ) + self.log_std_multiplier = Scalar(log_std_multiplier) + self.log_std_offset = Scalar(log_std_offset) + self.tanh_gaussian = ReparameterizedTanhGaussian(no_tanh=no_tanh) + + def log_prob(self, observations: torch.Tensor, actions: torch.Tensor) -> torch.Tensor: + """ + Compute the log probability of a given set of actions. + + Parameters: + ---------- + observations : torch.Tensor + Observations to condition the policy on. + actions : torch.Tensor + Actions for which to compute the log probability. + + Returns: + ---------- + torch.Tensor + Log probability of the given actions. 
+ """ + if actions.ndim == 3: + observations = extend_and_repeat(observations, 1, actions.shape[1]) + base_network_output = self.base_network(observations) + mean, log_std = torch.split(base_network_output, self.action_dim, dim=-1) + log_std = self.log_std_multiplier() * log_std + self.log_std_offset() + return self.tanh_gaussian.log_prob(mean, log_std, actions) + + def forward(self, observations: torch.Tensor, deterministic: bool = False, repeat: Optional[int] = None) -> Tuple[ + torch.Tensor, torch.Tensor]: + """ + Generate a sample and compute the log probability. + + Parameters: + ---------- + observations : torch.Tensor + Observations to condition the policy on. + deterministic : bool, optional + Flag indicating whether to sample deterministically (default is False). + repeat : Optional[int], optional + Number of times to repeat the action sampling (default is None). + + Returns: + ---------- + Tuple[torch.Tensor, torch.Tensor] + Generated action sample and its log probability. + """ + if repeat is not None: + observations = extend_and_repeat(observations, 1, repeat) + base_network_output = self.base_network(observations) + mean, log_std = torch.split(base_network_output, self.action_dim, dim=-1) + log_std = self.log_std_multiplier() * log_std + self.log_std_offset() + return self.tanh_gaussian(mean, log_std, deterministic) + + + +class ParametrizedPerturbationTanhGaussianPolicy(nn.Module): + """ + Policy module representing the parametrized perturbation Tanh Gaussian policy. + + Parameters: + ---------- + observation_dim : int + Dimensionality of the observation space. + action_dim : int + Dimensionality of the action space. + arch : str, optional + Architecture of the base network (default is '256-256'). + log_std_multiplier : float, optional + Multiplier for the log standard deviation (default is 1.0). + log_std_offset : float, optional + Offset for the log standard deviation (default is -1.0). + no_tanh : bool, optional + Whether to skip applying tanh to the sampled actions (default is False). + activation : str, optional + Activation function used in the base network (default is 'relu'). + phi : float, optional + Phi parameter for the perturbation (default is 0.5). + """ + + def __init__(self, + observation_dim: int, + action_dim: int, + arch: Optional[str] = '256-256', + log_std_multiplier: Optional[float] = 1.0, + log_std_offset: Optional[float] = -1.0, + no_tanh: Optional[bool] = False, + activation: Optional[str] = 'relu', + phi: Optional[float] = 0.5): + super().__init__() + self.observation_dim = observation_dim + self.action_dim = action_dim + self.arch = arch + self.no_tanh = no_tanh + self.phi = phi + + self.base_network = FullyConnectedNetwork( + observation_dim, 2 * action_dim, arch, + activation=activation + ) + self.log_std_multiplier = Scalar(log_std_multiplier) + self.log_std_offset = Scalar(log_std_offset) + self.tanh_gaussian = ReparameterizedTanhGaussian(no_tanh=no_tanh) + + def log_prob(self, observations: torch.Tensor, actions: torch.Tensor, expert_actions: torch.Tensor) -> torch.Tensor: + """ + Compute the log probability of a given set of actions with respect to expert actions. + + Parameters: + ---------- + observations : torch.Tensor + Observations to condition the policy on. + actions : torch.Tensor + Actions for which to compute the log probability. + expert_actions : torch.Tensor + Expert actions to condition the policy on. + + Returns: + ---------- + torch.Tensor + Log probability of the given actions with respect to expert actions. 
+ """ + if actions.ndim == 3: + observations = extend_and_repeat(observations, 1, actions.shape[1]) + base_network_output = self.base_network(observations) + mean, log_std = torch.split(base_network_output, self.action_dim, dim=-1) + log_std = self.log_std_multiplier() * log_std + self.log_std_offset() + + # get reversed actions to get the log prob of the expert parametrized policy + phi_actions = (actions - expert_actions) / self.phi + + return self.tanh_gaussian.log_prob(mean, log_std, phi_actions) + + def forward(self, observations: torch.Tensor, expert_actions: torch.Tensor, beta: float = 0., + deterministic: bool = False, repeat: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Generate a sample and compute the log probability with respect to expert actions. + + Parameters: + ---------- + observations : torch.Tensor + Observations to condition the policy on. + expert_actions : torch.Tensor + Expert actions to condition the policy on. + beta : float, optional + Beta parameter for the perturbation (default is 0.). + deterministic : bool, optional + Flag indicating whether to sample deterministically (default is False). + repeat : Optional[int], optional + Number of times to repeat the action sampling (default is None). + + Returns: + ---------- + Tuple[torch.Tensor, torch.Tensor] + Generated action sample and its log probability. + """ + if repeat is not None: + observations = extend_and_repeat(observations, 1, repeat) + base_network_output = self.base_network(observations) + mean, log_std = torch.split(base_network_output, self.action_dim, dim=-1) + log_std = self.log_std_multiplier() * log_std + self.log_std_offset() + actions, log_probs = self.tanh_gaussian(mean, log_std, deterministic) + return (expert_actions + self.phi * (1 - beta) * actions).clamp(-0.999, 0.999), log_probs + + +class SamplerPolicy(object): + """ + Wrapper class for creating a callable policy for action sampling. + + Parameters: + ---------- + policy : nn.Module + Policy module used for action sampling. + device : torch.device + Device on which to perform the action sampling. + from_ext : bool, optional + Flag indicating whether the policy is from an external source (default is False). + """ + + def __init__(self, policy: nn.Module, device: torch.device, from_ext: bool = False): + self.policy = policy + self.device = device + self.from_ext = from_ext + + def __call__(self, observations: Union[torch.Tensor, np.ndarray], deterministic: bool = False) -> np.ndarray: + """ + Sample actions from the policy. + + Parameters: + ---------- + observations : Union[torch.Tensor, np.ndarray] + Observations to condition the policy on. + deterministic : bool, optional + Flag indicating whether to sample deterministically (default is False). + + Returns: + ---------- + np.ndarray + Sampled actions. + """ + with torch.no_grad(): + observations = torch.tensor( + observations, dtype=torch.float32, device=self.device + ) + actions, _ = self.policy(observations, deterministic) + actions = actions.cpu().numpy() + return np.clip(actions, a_min=-0.999, a_max=0.999) + + +class ExpertSamplerPolicy(object): + """ + Wrapper class for creating a callable policy for expert action sampling. + + Parameters: + ---------- + policy : nn.Module + Policy module used for expert action sampling. + device : torch.device + Device on which to perform the expert action sampling. + from_ext : bool, optional + Flag indicating whether the policy is from an external source (default is False). 
+ """ + + def __init__(self, policy: nn.Module, device: torch.device, from_ext: bool = False): + self.policy = policy + self.device = device + self.from_ext = from_ext + + def __call__(self, observations: Union[torch.Tensor, np.ndarray], + expert_actions: Union[torch.Tensor, np.ndarray], beta: float = 1., + deterministic: bool = False) -> np.ndarray: + """ + Sample expert actions from the policy. + + Parameters: + ---------- + observations : Union[torch.Tensor, np.ndarray] + Observations to condition the expert policy on. + expert_actions : Union[torch.Tensor, np.ndarray] + Expert actions to condition the expert policy on. + beta : float, optional + Weighting factor for blending expert actions (default is 1.). + deterministic : bool, optional + Flag indicating whether to sample expert actions deterministically (default is False). + + Returns: + ---------- + np.ndarray + Sampled expert actions. + """ + with torch.no_grad(): + observations = torch.tensor( + observations, dtype=torch.float32, device=self.device + ) + expert_actions = torch.tensor( + expert_actions, dtype=torch.float32, device=self.device + ) + actions, _ = self.policy(observations, expert_actions, + beta=beta, + deterministic=deterministic) + actions = actions.cpu().numpy() + return np.clip(actions, a_min=-0.999, a_max=0.999) + + +class FullyConnectedQFunction(nn.Module): + """ + Fully connected Q-function neural network. + + Parameters: + ---------- + observation_dim : int + Dimension of the observation space. + action_dim : int + Dimension of the action space. + arch : str, optional + Architecture configuration for the fully connected layers (default is '256-256'). + activation : str, optional + Activation function to use in the hidden layers (default is 'relu'). + return_last_layer : bool, optional + Whether to return the activations of the last hidden layer (default is False). + """ + + def __init__(self, observation_dim: int, action_dim: int, arch: Optional[str] = '256-256', + activation: Optional[str] = 'relu', return_last_layer: Optional[bool] = False): + super().__init__() + self.observation_dim = observation_dim + self.action_dim = action_dim + self.arch = arch + self.return_last_layer = return_last_layer + self.network = FullyConnectedNetwork( + observation_dim + action_dim, 1, arch, activation=activation, + return_last_layer=return_last_layer + ) + + def forward(self, observations: torch.Tensor, actions: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the Q-function. + + Parameters: + ---------- + observations : torch.Tensor + Input observations. + actions : torch.Tensor + Input actions. + + Returns: + ---------- + torch.Tensor + Q-values for the given observations and actions. + """ + if actions.ndim == 3 and observations.ndim == 2: + observations = extend_and_repeat(observations, 1, actions.shape[1]) + input_tensor = torch.cat([observations, actions], dim=-1) + if self.return_last_layer: + output, last_layer = self.network(input_tensor) + return torch.squeeze(output, dim=-1), last_layer + output = self.network(input_tensor) + return torch.squeeze(output, dim=-1) + + +class TD3Policy(nn.Module): + """ + Twin Delayed DDPG (TD3) policy network. + + Parameters: + ---------- + observation_dim : int + Dimension of the observation space. + action_dim : int + Dimension of the action space. + arch : str, optional + Architecture configuration for the fully connected layers (default is '256-256'). 
+ """ + + def __init__(self, observation_dim: int, action_dim: int, arch: str = '256-256'): + super(TD3Policy, self).__init__() + self.arch = arch + + self.base_network = FullyConnectedNetwork( + observation_dim, action_dim, arch + ) + + def forward(self, observation: torch.Tensor, deterministic: Optional[bool] = False) -> Tuple[torch.Tensor, None]: + """ + Forward pass of the TD3 policy network. + + Parameters: + ---------- + observation : torch.Tensor + Input observation. + deterministic : bool, optional + Whether to use deterministic policy (default is False). Added it for code consistency. + + Returns: + ---------- + Tuple[torch.Tensor, None] + Tuple containing the action tensor and None (no auxiliary information). + """ + a_init = self.base_network(observation) + return torch.tanh(a_init), None + + +class Scalar(nn.Module): + """ + Scalar value represented as a learnable parameter. + + Parameters: + ---------- + init_value : float + Initial value for the scalar. + """ + + def __init__(self, init_value: float): + super().__init__() + self.constant = nn.Parameter( + torch.tensor(init_value, dtype=torch.float32) + ) + + def forward(self) -> torch.Tensor: + """ + Forward pass to retrieve the scalar value. + + Returns: + ---------- + torch.Tensor + Learnable scalar value. + """ + return self.constant diff --git a/RLLG/agents/common/replay_buffer.py b/RLLG/agents/common/replay_buffer.py index 4c5e1c6d..8e04be60 100644 --- a/RLLG/agents/common/replay_buffer.py +++ b/RLLG/agents/common/replay_buffer.py @@ -1,180 +1,403 @@ -# 2023.02.14-Changed for RLLG -# Huawei Technologies Co., Ltd. - -# Copyright (c) 2020 Xinyang Geng. - -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
- - - -import numpy as np -import torch - - -class ReplayBuffer(object): - def __init__(self, max_size, data=None, nb_local_experts=0): - self._max_size = max_size - self._next_idx = 0 - self._size = 0 - self._initialized = False - self._total_steps = 0 - self.nb_local_experts = nb_local_experts - - if data is not None: - if self._max_size < data['observations'].shape[0]: - self._max_size = data['observations'].shape[0] - self.add_batch(data) - - def __len__(self): - return self._size - - def _init_storage(self, observation_dim, action_dim): - self._observation_dim = observation_dim - self._action_dim = action_dim - self._observations = np.zeros((self._max_size, observation_dim), dtype=np.float32) - self._next_observations = np.zeros((self._max_size, observation_dim), dtype=np.float32) - self._actions = np.zeros((self._max_size, action_dim), dtype=np.float32) - self._rewards = np.zeros(self._max_size, dtype=np.float32) - self._dones = np.zeros(self._max_size, dtype=np.float32) - self._use_local_current = np.zeros(self._max_size, dtype=np.float32) - self._use_local_next = np.zeros(self._max_size, dtype=np.float32) - self._expert_actions = np.zeros((self._max_size, action_dim), dtype=np.float32) - self._next_expert_actions = np.zeros((self._max_size, action_dim), dtype=np.float32) - self._next_idx = 0 - self._size = 0 - self._initialized = True - - def add_sample(self, observation, action, reward, next_observation, done, - use_local_current, use_local_next, expert_actions, next_expert_actions): - if not self._initialized: - self._init_storage(observation.size, action.size) - - self._observations[self._next_idx, :] = np.array(observation, dtype=np.float32) - self._next_observations[self._next_idx, :] = np.array(next_observation, dtype=np.float32) - self._actions[self._next_idx, :] = np.array(action, dtype=np.float32) - self._rewards[self._next_idx] = reward - self._dones[self._next_idx] = float(done) - - # use locals - self._use_local_current[self._next_idx] = float(use_local_current) - self._use_local_next[self._next_idx] = float(use_local_next) - - # actions - self._expert_actions[self._next_idx] = np.array(expert_actions, dtype=np.float32) - self._next_expert_actions[self._next_idx] = np.array(next_expert_actions, dtype=np.float32) - - if self._size < self._max_size: - self._size += 1 - self._next_idx = (self._next_idx + 1) % self._max_size - self._total_steps += 1 - - def add_traj(self, observations, actions, rewards, next_observations, dones, - use_local_current, use_local_next, expert_actions, next_expert_actions): - for o, a, r, no, d, u_c, u_n, ea, nea in zip(observations, actions, rewards, next_observations, dones, - use_local_current, use_local_next, - expert_actions, next_expert_actions): - self.add_sample(o, a, r, no, d, u_c, u_n, ea, nea) - - def add_batch(self, batch): - self.add_traj( - batch['observations'], - batch['actions'], - batch['rewards'], - batch['next_observations'], - batch['dones'], - batch['use_local_current'], - batch['use_local_next'], - batch['expert_actions'], - batch['next_expert_actions'], - ) - - def sample(self, batch_size): - indices = np.random.randint(len(self), size=batch_size) - return self.select(indices) - - def select(self, indices): - # select expert if any - use_locals_current, use_locals_next = {}, {} - expert_actions, next_expert_actions = {}, {} - use_local_current = self._use_local_current[indices, ...] - use_local_next = self._use_local_next[indices, ...] - expert_actions = self._expert_actions[indices, ...] 
- next_expert_actions = self._next_expert_actions[indices, ...] - - return dict( - observations=self._observations[indices, ...], - actions=self._actions[indices, ...], - rewards=self._rewards[indices, ...], - next_observations=self._next_observations[indices, ...], - dones=self._dones[indices, ...], - use_local_current=use_local_current, - use_local_next=use_local_next, - expert_actions=expert_actions, - next_expert_actions=next_expert_actions - ) - - def generator(self, batch_size, n_batchs=None): - i = 0 - while n_batchs is None or i < n_batchs: - yield self.sample(batch_size) - i += 1 - - @property - def total_steps(self): - return self._total_steps - - @property - def data(self): - return dict( - observations=self._observations[:self._size, ...], - actions=self._actions[:self._size, ...], - rewards=self._rewards[:self._size, ...], - next_observations=self._next_observations[:self._size, ...], - dones=self._dones[:self._size, ...], - use_local_current=self._use_local_current[:self._size, ...], - use_local_next=self._use_local_next[:self._size, ...], - expert_actions=self._expert_actions[:self._size, ...], - next_expert_actions=self._next_expert_actions[:self._size, ...], - ) - - -def batch_to_torch(batch, device): - return { - k: torch.from_numpy(v).to(device=device, non_blocking=True) if type(v) is np.ndarray - else {nb: torch.from_numpy(v[nb]).to(device=device, non_blocking=True) for nb in range(len(v))} - for k, v in batch.items() - } - - -def subsample_batch(batch, size): - indices = np.random.randint(batch['observations'].shape[0], size=size) - - return dict( - observations=batch['observations'][indices, ...], - actions=batch['actions'][indices, ...], - rewards=batch['rewards'][indices, ...], - next_observations=batch['next_observations'][indices, ...], - dones=batch['dones'][indices, ...], - use_local_current=batch['use_local_current'][indices, ...], - use_local_next=batch['use_local_next'][indices, ...], - expert_actions=batch['expert_actions'][indices, ...], - next_expert_actions=batch['next_expert_actions'][indices, ...], - ) - - -def concatenate_batches(batches): - - return dict( - observations=np.concatenate([batch['observations'] for batch in batches], axis=0).astype(np.float32), - actions=np.concatenate([batch['actions'] for batch in batches], axis=0).astype(np.float32), - rewards=np.concatenate([batch['rewards'] for batch in batches], axis=0).astype(np.float32), - next_observations=np.concatenate([batch['next_observations'] for batch in batches], axis=0).astype(np.float32), - dones=np.concatenate([batch['dones'] for batch in batches], axis=0).astype(np.float32), - use_locals_current=np.concatenate([batch['use_locals_current'] for batch in batches], axis=0).astype(np.float32), - use_locals_next=np.concatenate([batch['use_locals_next'] for batch in batches], axis=0).astype(np.float32), - expert_actions=np.concatenate([batch['expert_actions'] for batch in batches], axis=0).astype(np.float32), - next_expert_actions=np.concatenate([batch['next_expert_actions'] for batch in batches], axis=0).astype(np.float32), - ) +# 2023.02.14-Changed for RLLG +# Huawei Technologies Co., Ltd. + +# Copyright (c) 2020 Xinyang Geng. + +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Optional, Dict, Union, List, Generator, Any +import numpy as np +import torch + + +class ReplayBuffer(object): + """ + Replay buffer for storing and sampling transitions. 
+ + Parameters: + ---------- + max_size : int + Maximum size of the replay buffer. + data : dict, optional + Initial data to populate the replay buffer. + nb_local_experts : int, optional + Number of local experts (default is 0). + """ + + def __init__(self, max_size: int, data: Optional[Dict[str, np.ndarray]] = None, nb_local_experts: Optional[int] = 0): + self._max_size = max_size + self._next_idx = 0 + self._size = 0 + self._initialized = False + self._total_steps = 0 + self.nb_local_experts = nb_local_experts + + if data is not None: + if self._max_size < data['observations'].shape[0]: + self._max_size = data['observations'].shape[0] + self.add_batch(data) + + def __len__(self) -> int: + """ + Get the current size of the replay buffer. + + Returns: + ---------- + int + Current size of the replay buffer. + """ + return self._size + + def _init_storage(self, observation_dim: int, action_dim: int) -> None: + """ + Initialize the storage arrays. + + Parameters: + ---------- + observation_dim : int + Dimensionality of the observations. + action_dim : int + Dimensionality of the actions. + + Returns: + ---------- + None + """ + self._observation_dim = observation_dim + self._action_dim = action_dim + self._observations = np.zeros((self._max_size, observation_dim), dtype=np.float32) + self._next_observations = np.zeros((self._max_size, observation_dim), dtype=np.float32) + self._actions = np.zeros((self._max_size, action_dim), dtype=np.float32) + self._rewards = np.zeros(self._max_size, dtype=np.float32) + self._dones = np.zeros(self._max_size, dtype=np.float32) + self._use_local_current = np.zeros(self._max_size, dtype=np.float32) + self._use_local_next = np.zeros(self._max_size, dtype=np.float32) + self._expert_actions = np.zeros((self._max_size, action_dim), dtype=np.float32) + self._next_expert_actions = np.zeros((self._max_size, action_dim), dtype=np.float32) + self._next_idx = 0 + self._size = 0 + self._initialized = True + + def add_sample(self, + observation: np.ndarray, + action: np.ndarray, + reward: float, + next_observation: np.ndarray, + done: bool, + use_local_current: float, + use_local_next: float, + expert_actions: np.ndarray, + next_expert_actions: np.ndarray): + """ + Add a single transition to the replay buffer. + + Parameters: + ---------- + observation : np.ndarray + Observation array. + action : np.ndarray + Action array. + reward : float + Reward value. + next_observation : np.ndarray + Next observation array. + done : bool + Whether the episode is done. + use_local_current : float + Confidence function for local expert for the current action. + use_local_next : float + Confidence function for local expert for the next action. + expert_actions : np.ndarray + Expert actions array. + next_expert_actions : np.ndarray + Next expert actions array. 
+ + Returns: + ---------- + None + """ + if not self._initialized: + self._init_storage(observation.size, action.size) + + self._observations[self._next_idx, :] = np.array(observation, dtype=np.float32) + self._next_observations[self._next_idx, :] = np.array(next_observation, dtype=np.float32) + self._actions[self._next_idx, :] = np.array(action, dtype=np.float32) + self._rewards[self._next_idx] = reward + self._dones[self._next_idx] = float(done) + + # use locals + self._use_local_current[self._next_idx] = float(use_local_current) + self._use_local_next[self._next_idx] = float(use_local_next) + + # actions + self._expert_actions[self._next_idx] = np.array(expert_actions, dtype=np.float32) + self._next_expert_actions[self._next_idx] = np.array(next_expert_actions, dtype=np.float32) + + if self._size < self._max_size: + self._size += 1 + self._next_idx = (self._next_idx + 1) % self._max_size + self._total_steps += 1 + + def add_traj(self, observations: np.ndarray, actions: np.ndarray, rewards: np.ndarray, + next_observations: np.ndarray, dones: np.ndarray, + use_local_current: np.ndarray, use_local_next: np.ndarray, + expert_actions: np.ndarray, next_expert_actions: np.ndarray): + """ + Add a trajectory to the replay buffer. + + Parameters: + ---------- + observations : np.ndarray + Array of observations. + actions : np.ndarray + Array of actions. + rewards : np.ndarray + Array of rewards. + next_observations : np.ndarray + Array of next observations. + dones : np.ndarray + Array of done flags. + use_local_current : np.ndarray + Array of flags for using local expert for the current action. + use_local_next : np.ndarray + Array of flags for using local expert for the next action. + expert_actions : np.ndarray + Array of expert actions. + next_expert_actions : np.ndarray + Array of next expert actions. + + Returns: + ---------- + None + """ + for o, a, r, no, d, u_c, u_n, ea, nea in zip(observations, actions, rewards, next_observations, dones, + use_local_current, use_local_next, + expert_actions, next_expert_actions): + self.add_sample(o, a, r, no, d, u_c, u_n, ea, nea) + + def add_batch(self, batch: Dict[str, np.ndarray]): + """ + Add a batch of data to the replay buffer. + + Parameters: + ---------- + batch : Dict[str, np.ndarray] + Dictionary containing arrays of observations, actions, rewards, next observations, + done flags, floats for the confidence function of the local expert for the current action, + floats for the confidence function of the local expert for the next action, expert actions, + and next expert actions. + + Returns: + ---------- + None + """ + self.add_traj( + batch['observations'], + batch['actions'], + batch['rewards'], + batch['next_observations'], + batch['dones'], + batch['use_local_current'], + batch['use_local_next'], + batch['expert_actions'], + batch['next_expert_actions'], + ) + + def sample(self, batch_size: int) -> Dict[str, np.ndarray]: + """ + Sample a batch of data from the replay buffer. + + Parameters: + ---------- + batch_size : int + The number of samples to be drawn. + + Returns: + ---------- + Dict[str, np.ndarray] + Dictionary containing arrays of observations, actions, rewards, next observations, + done flags, flags for using local expert for the current action, flags for using local + expert for the next action, expert actions, and next expert actions. 
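+
+        Example:
+        ----------
+        Minimal illustrative sketch (assumes buffer is a ReplayBuffer that already
+        holds at least 256 transitions added via add_sample or add_batch):
+
+            batch = buffer.sample(batch_size=256)
+            batch['observations'].shape        # (256, observation_dim)
+            batch['use_local_current'].shape   # (256,)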
+ """ + indices = np.random.randint(len(self), size=batch_size) + return self.select(indices) + + def select(self, indices: np.ndarray) -> Dict[str, np.ndarray]: + """ + Select samples from the replay buffer based on the given indices. + + Parameters: + ---------- + indices : np.ndarray + Array of indices to select samples from the replay buffer. + + Returns: + ---------- + Dict[str, np.ndarray] + Dictionary containing arrays of observations, actions, rewards, next observations, + done flags, flags for using local expert for the current action, flags for using local + expert for the next action, expert actions, and next expert actions. + """ + use_local_current = self._use_local_current[indices, ...] + use_local_next = self._use_local_next[indices, ...] + expert_actions = self._expert_actions[indices, ...] + next_expert_actions = self._next_expert_actions[indices, ...] + + return dict( + observations=self._observations[indices, ...], + actions=self._actions[indices, ...], + rewards=self._rewards[indices, ...], + next_observations=self._next_observations[indices, ...], + dones=self._dones[indices, ...], + use_local_current=use_local_current, + use_local_next=use_local_next, + expert_actions=expert_actions, + next_expert_actions=next_expert_actions + ) + + def generator(self, batch_size: int, n_batchs: Optional[int] = None) -> Generator[Dict[str, Any], None, None]: + """ + Generator function that yields batches of samples from the replay buffer. + + Parameters: + ---------- + batch_size : int + Size of each batch. + n_batchs : int, optional + Number of batches to generate (default is None for an infinite generator). + + Yields: + ---------- + Dict[str, Any] + Dictionary containing arrays of observations, actions, rewards, next observations, + done flags, flags for using local expert for the current action, flags for using local + expert for the next action, expert actions, and next expert actions. + """ + i = 0 + while n_batchs is None or i < n_batchs: + yield self.sample(batch_size) + i += 1 + + @property + def total_steps(self) -> int: + """ + Property to get the total number of steps taken by the replay buffer. + + Returns: + ---------- + int + Total number of steps. + """ + return self._total_steps + + @property + def data(self) -> Dict[str, Any]: + """ + Property to get a dictionary containing arrays of observations, actions, rewards, next observations, + done flags, confidence function for using local expert for the current action, + confidence function for using local expert for the next action, expert actions, and next expert actions. + + Returns: + ---------- + Dict[str, Any] + Dictionary containing arrays of observations, actions, rewards, next observations, + done flags, confidence function for using local expert for the current action, + confidence function for using local expert for the next action, expert actions, and next expert actions. 
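+
+        Example:
+        ----------
+        Illustrative sketch (assumes the buffer is non-empty), e.g. to snapshot the
+        stored transitions before saving them to disk:
+
+            snapshot = buffer.data
+            assert len(snapshot['observations']) == len(buffer)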
+ """ + return dict( + observations=self._observations[:self._size, ...], + actions=self._actions[:self._size, ...], + rewards=self._rewards[:self._size, ...], + next_observations=self._next_observations[:self._size, ...], + dones=self._dones[:self._size, ...], + use_local_current=self._use_local_current[:self._size, ...], + use_local_next=self._use_local_next[:self._size, ...], + expert_actions=self._expert_actions[:self._size, ...], + next_expert_actions=self._next_expert_actions[:self._size, ...], + ) + + +def batch_to_torch(batch: Dict[str, Union[np.ndarray, Dict[int, np.ndarray]]], device: str) \ + -> Dict[str, Union[torch.Tensor, Dict[int, torch.Tensor]]]: + """ + Convert a batch from NumPy arrays to PyTorch tensors. + + Parameters: + ---------- + batch : Dict[str, Union[np.ndarray, Dict[int, np.ndarray]]] + Dictionary containing NumPy arrays or dictionaries of NumPy arrays. + device : str + The device to which the tensors should be moved. + + Returns: + ---------- + Dict[str, Union[torch.Tensor, Dict[int, torch.Tensor]]] + Dictionary containing PyTorch tensors or dictionaries of PyTorch tensors. + """ + return { + k: torch.from_numpy(v).to(device=device, non_blocking=True) if type(v) is np.ndarray + else {nb: torch.from_numpy(v[nb]).to(device=device, non_blocking=True) for nb in range(len(v))} + for k, v in batch.items() + } + + +def subsample_batch(batch: Dict[str, np.ndarray], size: int) -> Dict[str, np.ndarray]: + """ + Subsample a batch with the given size. + + Parameters: + ---------- + batch : Dict[str, np.ndarray] + Dictionary containing NumPy arrays. + size : int + The size of the subsampled batch. + + Returns: + ---------- + Dict[str, np.ndarray] + Subsampled batch. + """ + indices = np.random.randint(batch['observations'].shape[0], size=size) + + return dict( + observations=batch['observations'][indices, ...], + actions=batch['actions'][indices, ...], + rewards=batch['rewards'][indices, ...], + next_observations=batch['next_observations'][indices, ...], + dones=batch['dones'][indices, ...], + use_local_current=batch['use_local_current'][indices, ...], + use_local_next=batch['use_local_next'][indices, ...], + expert_actions=batch['expert_actions'][indices, ...], + next_expert_actions=batch['next_expert_actions'][indices, ...], + ) + + +def concatenate_batches(batches: List[Dict[str, np.ndarray]]) -> Dict[str, np.ndarray]: + """ + Concatenate multiple batches into a single batch. + + Parameters: + ---------- + batches : List[Dict[str, np.ndarray]] + List of dictionaries, each containing NumPy arrays. + + Returns: + ---------- + Dict[str, np.ndarray] + Concatenated batch. 
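+
+    Example:
+    ----------
+    Illustrative sketch (assumes batch_a and batch_b are two batch dictionaries
+    with matching keys and array dimensions):
+
+        merged = concatenate_batches([batch_a, batch_b])
+        # merged['rewards'] now stacks batch_a and batch_b rewards along axis 0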
+ """ + return dict( + observations=np.concatenate([batch['observations'] for batch in batches], axis=0).astype(np.float32), + actions=np.concatenate([batch['actions'] for batch in batches], axis=0).astype(np.float32), + rewards=np.concatenate([batch['rewards'] for batch in batches], axis=0).astype(np.float32), + next_observations=np.concatenate([batch['next_observations'] for batch in batches], axis=0).astype(np.float32), + dones=np.concatenate([batch['dones'] for batch in batches], axis=0).astype(np.float32), + use_locals_current=np.concatenate([batch['use_locals_current'] for batch in batches], axis=0).astype(np.float32), + use_locals_next=np.concatenate([batch['use_locals_next'] for batch in batches], axis=0).astype(np.float32), + expert_actions=np.concatenate([batch['expert_actions'] for batch in batches], axis=0).astype(np.float32), + next_expert_actions=np.concatenate([batch['next_expert_actions'] for batch in batches], axis=0).astype(np.float32), + ) diff --git a/RLLG/agents/common/sampler.py b/RLLG/agents/common/sampler.py index 153cccb1..4af0d65a 100644 --- a/RLLG/agents/common/sampler.py +++ b/RLLG/agents/common/sampler.py @@ -1,193 +1,273 @@ -# 2023.02.14-Changed for RLLG -# Huawei Technologies Co., Ltd. - -# Copyright (c) 2020 Xinyang Geng. - -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - - - -import numpy as np - - -class StepSampler(object): - - def __init__(self, - env, - max_traj_length=1000): - self.max_traj_length = max_traj_length - self._env = env - self._traj_steps = 0 - self._current_observation = self.env.reset() - - def sample(self, agent, n_steps, deterministic=False, replay_buffer=None): - # general observations - observations = [] - actions = [] - rewards = [] - next_observations = [] - dones = [] - list_use_local_current = [] - list_use_local_next = [] - failures = [] - - for n_ in range(n_steps): - - self._traj_steps += 1 - observation = self._current_observation - - # get action and local information - if n_ == 0: - action, use_local_current, expert_action = agent.get_action(self.env, - observation, - deterministic=deterministic, - add_local_information=True) - else: - expert_action = next_expert_action.copy() - use_local_current = use_local_next - action = next_action.copy() - - # Apply next action and save transition - next_observation, reward, done, info = self.env.step(action) - observations.append(observation) - actions.append(action) - rewards.append(reward) - dones.append(done) - next_observations.append(next_observation) - if reward <= -500: - failures.append(1) - else: - failures.append(0) - - # Choose action according to local policies to record for both obs and next_obs - next_action, use_local_next, next_expert_action = agent.get_action(self.env, - next_observation, - deterministic=deterministic, - add_local_information=True) - - # add local information - list_use_local_current.append(use_local_current) - list_use_local_next.append(use_local_next) - - if replay_buffer is not None: - replay_buffer.add_sample( - observation, - action, - reward, - next_observation, - done, - use_local_current, - use_local_next, - expert_action, - next_expert_action - ) - - self._current_observation = next_observation - - if done or self._traj_steps >= self.max_traj_length: - self._current_observation = self.env.reset() - self._traj_steps = 0 - - metrics_to_return = dict( - observations=np.array(observations, dtype=np.float32), - actions=np.array(actions, 
dtype=np.float32), - rewards=np.array(rewards, dtype=np.float32), - next_observations=np.array(next_observations, dtype=np.float32), - dones=np.array(dones, dtype=np.float32), - list_use_local_current=np.array(list_use_local_current, dtype=np.float32), - list_use_local_next=np.array(list_use_local_next, dtype=np.float32), - failures=np.array(failures, dtype=np.float32), - ) - - return metrics_to_return - - @property - def env(self): - return self._env - - -class TrajSampler(object): - - def __init__(self, - env, - max_traj_length=1000): - self.max_traj_length = max_traj_length - self._env = env - - def sample(self, agent, n_trajs, deterministic=False, replay_buffer=None, replay_buffer_success=None): - - trajs = [] - - for _ in range(n_trajs): - observations = [] - actions = [] - rewards = [] - next_observations = [] - dones = [] - failures = [] - list_use_local_current = [] - list_use_local_next = [] - - observation = self.env.reset() - - for n_ in range(self.max_traj_length): - - # get action and local information - if n_ == 0: - action, use_local_current, expert_action = agent.get_action(self.env, - observation, - deterministic=deterministic, - add_local_information=True) - else: - expert_action = next_expert_action.copy() - use_local_current = use_local_next - action = next_action.copy() - - # Apply next action and save transition - next_observation, reward, done, info = self.env.step(action) - observations.append(observation) - actions.append(action) - rewards.append(reward) - dones.append(done) - next_observations.append(next_observation) - if reward <= -500: - failures.append(1) - else: - failures.append(0) - - # Choose action according to local policies to record for both obs and next_obs - next_action, use_local_next, next_expert_action = agent.get_action(self.env, - next_observation, - deterministic=deterministic, - add_local_information=True) - - # add local information - list_use_local_current.append(use_local_current) - list_use_local_next.append(use_local_next) - - observation = next_observation - - if done: - break - - metrics_to_return = dict( - observations=np.array(observations, dtype=np.float32), - actions=np.array(actions, dtype=np.float32), - rewards=np.array(rewards, dtype=np.float32), - next_observations=np.array(next_observations, dtype=np.float32), - dones=np.array(dones, dtype=np.float32), - list_use_local_current=np.array(list_use_local_current, dtype=np.float32), - list_use_local_next=np.array(list_use_local_next, dtype=np.float32), - failures=np.array(failures, dtype=np.float32), - ) - - trajs.append(metrics_to_return) - - return trajs - - @property - def env(self): - return self._env +# 2023.02.14-Changed for RLLG +# Huawei Technologies Co., Ltd. + +# Copyright (c) 2020 Xinyang Geng. + +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Any, Optional, Dict +from agents.common.replay_buffer import ReplayBuffer +import numpy as np + + +class StepSampler(object): + """ + StepSampler for collecting time-steps from an environment. + + Parameters: + ---------- + env : Any + The environment. + max_traj_length : int, optional + Maximum length of a trajectory (default is 1000). 
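+
+    Example:
+    ----------
+    Illustrative sketch (assumes env, agent and buffer are an already constructed
+    environment, agent and ReplayBuffer, as used in the training loop):
+
+        sampler = StepSampler(env, max_traj_length=1000)
+        metrics = sampler.sample(agent, n_steps=1000, replay_buffer=buffer)
+        metrics['rewards'].mean()   # average reward over the collected steps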
+    """
+
+    def __init__(self,
+                 env: Any,
+                 max_traj_length: Optional[int] = 1000):
+        self.max_traj_length = max_traj_length
+        self._env = env
+        self._traj_steps = 0
+        self._current_observation = self.env.reset()
+
+    def sample(self, agent: Any, n_steps: int,
+               deterministic: Optional[bool] = False,
+               replay_buffer: Optional[ReplayBuffer] = None) -> Dict:
+        """
+        Collect time-steps from the environment using the provided agent.
+
+        Parameters:
+        ----------
+        agent : Any
+            The agent used to interact with the environment.
+        n_steps : int
+            Number of steps to collect.
+        deterministic : bool, optional
+            Whether to use deterministic actions (default is False).
+        replay_buffer : ReplayBuffer, optional
+            The replay buffer to store the collected samples (default is None).
+
+        Returns:
+        ----------
+        Dict
+            Dictionary containing the collected time-step information.
+        """
+        # general observations
+        observations = []
+        actions = []
+        rewards = []
+        next_observations = []
+        dones = []
+        list_use_local_current = []
+        list_use_local_next = []
+        failures = []
+
+        for n_ in range(n_steps):
+
+            self._traj_steps += 1
+            observation = self._current_observation
+
+            # get action and local information
+            if n_ == 0:
+                action, use_local_current, expert_action = agent.get_action(self.env,
+                                                                             observation,
+                                                                             deterministic=deterministic,
+                                                                             add_local_information=True)
+            else:
+                expert_action = next_expert_action.copy()
+                use_local_current = use_local_next
+                action = next_action.copy()
+
+            # Apply next action and save transition
+            next_observation, reward, done, info = self.env.step(action)
+            observations.append(observation)
+            actions.append(action)
+            rewards.append(reward)
+            dones.append(done)
+            next_observations.append(next_observation)
+            if reward <= -500:
+                failures.append(1)
+            else:
+                failures.append(0)
+
+            # Choose action according to local policies to record for both obs and next_obs
+            next_action, use_local_next, next_expert_action = agent.get_action(self.env,
+                                                                               next_observation,
+                                                                               deterministic=deterministic,
+                                                                               add_local_information=True)
+
+            # add local information
+            list_use_local_current.append(use_local_current)
+            list_use_local_next.append(use_local_next)
+
+            if replay_buffer is not None:
+                replay_buffer.add_sample(
+                    observation,
+                    action,
+                    reward,
+                    next_observation,
+                    done,
+                    use_local_current,
+                    use_local_next,
+                    expert_action,
+                    next_expert_action
+                )
+
+            self._current_observation = next_observation
+
+            if done or self._traj_steps >= self.max_traj_length:
+                self._current_observation = self.env.reset()
+                self._traj_steps = 0
+
+        metrics_to_return = dict(
+            observations=np.array(observations, dtype=np.float32),
+            actions=np.array(actions, dtype=np.float32),
+            rewards=np.array(rewards, dtype=np.float32),
+            next_observations=np.array(next_observations, dtype=np.float32),
+            dones=np.array(dones, dtype=np.float32),
+            list_use_local_current=np.array(list_use_local_current, dtype=np.float32),
+            list_use_local_next=np.array(list_use_local_next, dtype=np.float32),
+            failures=np.array(failures, dtype=np.float32),
+        )
+
+        return metrics_to_return
+
+    @property
+    def env(self):
+        """
+        Get the environment associated with the StepSampler.
+
+        Returns:
+        ----------
+        gym.Env
+            The environment.
+        """
+        return self._env
+
+
+class TrajSampler(object):
+    """
+    TrajSampler for collecting full trajectories from an environment.
+
+    Parameters:
+    ----------
+    env : Any
+        The environment.
+    max_traj_length : int, optional
+        Maximum length of a trajectory (default is 1000).
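+
+    Example:
+    ----------
+    Illustrative sketch (assumes env and agent are an already constructed
+    evaluation environment and agent):
+
+        eval_sampler = TrajSampler(env, max_traj_length=1000)
+        trajs = eval_sampler.sample(agent, n_trajs=5, deterministic=True)
+        returns = [traj['rewards'].sum() for traj in trajs]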
+ """ + + def __init__(self, + env: Any, + max_traj_length : Optional[int] = 1000) -> None: + self.max_traj_length = max_traj_length + self._env = env + + def sample(self, agent: Any, n_trajs: int, deterministic: Optional[bool] = False, + replay_buffer: Optional[ReplayBuffer] = None, replay_buffer_success: Optional[ReplayBuffer] = None): + """ + Sample trajectories using the provided agent. + + Parameters: + ---------- + agent : Any + The agent used to sample trajectories. + n_trajs : int + Number of trajectories to sample. + deterministic : bool, optional + Whether to use deterministic actions (default is False). + replay_buffer : ReplayBuffer, optional + If provided, add samples to the replay buffer. + replay_buffer_success : ReplayBuffer, optional + If provided, add successful samples to this replay buffer. + + Returns: + ---------- + List[Dict] + List of dictionaries containing trajectory information. + """ + + trajs = [] + + for _ in range(n_trajs): + observations = [] + actions = [] + rewards = [] + next_observations = [] + dones = [] + failures = [] + list_use_local_current = [] + list_use_local_next = [] + + observation = self.env.reset() + + for n_ in range(self.max_traj_length): + + # get action and local information + if n_ == 0: + action, use_local_current, expert_action = agent.get_action(self.env, + observation, + deterministic=deterministic, + add_local_information=True) + else: + expert_action = next_expert_action.copy() + use_local_current = use_local_next + action = next_action.copy() + + # Apply next action and save transition + next_observation, reward, done, info = self.env.step(action) + observations.append(observation) + actions.append(action) + rewards.append(reward) + dones.append(done) + next_observations.append(next_observation) + if reward <= -500: + failures.append(1) + else: + failures.append(0) + + # Choose action according to local policies to record for both obs and next_obs + next_action, use_local_next, next_expert_action = agent.get_action(self.env, + next_observation, + deterministic=deterministic, + add_local_information=True) + + # add local information + list_use_local_current.append(use_local_current) + list_use_local_next.append(use_local_next) + + observation = next_observation + + if done: + break + + metrics_to_return = dict( + observations=np.array(observations, dtype=np.float32), + actions=np.array(actions, dtype=np.float32), + rewards=np.array(rewards, dtype=np.float32), + next_observations=np.array(next_observations, dtype=np.float32), + dones=np.array(dones, dtype=np.float32), + list_use_local_current=np.array(list_use_local_current, dtype=np.float32), + list_use_local_next=np.array(list_use_local_next, dtype=np.float32), + failures=np.array(failures, dtype=np.float32), + ) + + trajs.append(metrics_to_return) + + return trajs + + @property + def env(self): + """ + Get the environment associated with the StepSampler. + + Returns: + ---------- + gym.Env + The environment. + """ + return self._env diff --git a/RLLG/agents/common/utils.py b/RLLG/agents/common/utils.py index 4dbcb54d..3e2aa949 100644 --- a/RLLG/agents/common/utils.py +++ b/RLLG/agents/common/utils.py @@ -1,70 +1,117 @@ -# 2023.02.14-Changed for RLLG -# Huawei Technologies Co., Ltd. - -# Copyright (c) 2020 Xinyang Geng. - -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
-
-
-import random
-import time
-import numpy as np
-import torch
-
-
-class Timer(object):
-
-    def __init__(self):
-        self._time = None
-
-    def __enter__(self):
-        self._start_time = time.time()
-        return self
-
-    def __exit__(self, exc_type, exc_value, exc_tb):
-        self._time = time.time() - self._start_time
-
-    def __call__(self):
-        return self._time
-
-
-def set_random_seed(seed):
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
-    torch.cuda.manual_seed(seed)
-    np.random.seed(seed)
-    random.seed(seed)
-    torch.backends.cudnn.deterministic = True
-    torch.backends.cudnn.benchmark = False
-
-
-def prefix_metrics(metrics, prefix):
-    return {
-        '{}/{}'.format(prefix, key): value for key, value in metrics.items()
-    }
-
-
-def get_global_name(name):
-    """
-    In case one modifies the environment.
-    """
-    if 'cartpole' in name:
-        glob_name = 'cartpole'
-    elif 'point_mass' in name:
-        glob_name = 'point_mass'
-    elif 'hirl_point_fall' in name:
-        glob_name = 'hirl_point_fall'
-    else:
-        glob_name = name
-    return glob_name
-
-
-def get_global_agent_name(agent_name):
-    """
-    For variations of the same agent (for example, Naive or not).
-    """
-    glob_name = agent_name
-    return glob_name
+# 2023.02.14-Changed for RLLG
+# Huawei Technologies Co., Ltd.
+
+# Copyright (c) 2020 Xinyang Geng.
+
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import Dict, Any
+import random
+import time
+import numpy as np
+import torch
+
+
+class Timer(object):
+    """
+    A simple timer class to measure the execution time of a code block using the "with" statement.
+    """
+
+    def __init__(self):
+        self._time = None
+
+    def __enter__(self):
+        self._start_time = time.time()
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_tb):
+        self._time = time.time() - self._start_time
+
+    def __call__(self):
+        return self._time
+
+
+def set_random_seed(seed: int):
+    """
+    Set the random seed for reproducibility.
+
+    Parameters:
+    -----------
+    seed : int
+        The desired random seed.
+    """
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.cuda.manual_seed(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+
+def prefix_metrics(metrics: Dict[str, Any], prefix: str):
+    """
+    Prefix the keys of a dictionary of metrics.
+
+    Parameters:
+    -----------
+    metrics : dict
+        The dictionary of metrics.
+    prefix : str
+        The prefix to add to each key.
+
+    Returns:
+    --------
+    dict
+        The new dictionary with prefixed keys.
+    """
+    return {
+        '{}/{}'.format(prefix, key): value for key, value in metrics.items()
+    }
+
+
+def get_global_name(name: str) -> str:
+    """
+    Get the global environment name, in case one modifies the environment.
+
+    Parameters:
+    -----------
+    name : str
+        The name of the environment.
+
+    Returns:
+    --------
+    glob_name : str
+        The global name of the environment.
+    """
+    if 'cartpole' in name:
+        glob_name = 'cartpole'
+    elif 'point_mass' in name:
+        glob_name = 'point_mass'
+    elif 'hirl_point_fall' in name:
+        glob_name = 'hirl_point_fall'
+    else:
+        glob_name = name
+    return glob_name
+
+
+def get_global_agent_name(agent_name: str) -> str:
+    """
+    Get the global agent name, for variations of the same agent (for example, Naive or not).
+
+    Parameters:
+    -----------
+    agent_name : str
+        The name of the agent.
+
+    Returns:
+    --------
+    glob_name : str
+        The global name of the agent.
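+
+    Example:
+    --------
+    The agent name is currently returned unchanged (illustrative):
+
+        get_global_agent_name('SAC')   # 'SAC'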
+ """ + glob_name = agent_name + return glob_name diff --git a/RLLG/agents/common/visualization_helpers.py b/RLLG/agents/common/visualization_helpers.py index 44cbcb35..8b2912cb 100644 --- a/RLLG/agents/common/visualization_helpers.py +++ b/RLLG/agents/common/visualization_helpers.py @@ -1,130 +1,157 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - - -import os -import glob -import pandas as pd -import matplotlib.pyplot as plt -from ray.tune import ExperimentAnalysis - - -def plot_curves(analysis, hps, metric, to_plot="final", label="SAC", n_epochs=2000): - """ - analysis: - tune.ray.ExperimentAnalysis - hps: hyperparams to choose - list - metric: - str - to_plot: to plot best final mean or best overall - str: choose between final and overall - """ - group_by = [f'config/{hp}' for hp in hps if hp != 'repeat_run'] + ['epoch'] - dfs = analysis.trial_dataframes - conf = analysis.get_all_configs() - path = os.path.dirname(list(conf.keys())[0]) - conf = {k: {f'config/{_k}': _v for _k, _v in v.items()} for k, v in conf.items()} - df = pd.concat([dfs[k].assign(**conf[k]) for k in dfs.keys()]) - group = df.groupby(group_by) - mean = group.mean() - std = group.std() - - # if overall or final - if to_plot == "overall": - plot_max_idx = mean[metric].idxmax() - best_dict = {'mean': mean.loc[plot_max_idx], 'std': std.loc[plot_max_idx]} - else: - final_mean = mean.xs(n_epochs - 1, axis=0, level=len(group_by) - 1, drop_level=False) - final_std = std.xs(n_epochs - 1, axis=0, level=len(group_by) - 1, drop_level=False) - plot_max_idx = final_mean[metric].idxmax() - best_dict = {'mean': final_mean.loc[plot_max_idx], 'std': final_std.loc[plot_max_idx]} - - # plot it - idx_but_one = plot_max_idx[:-1] - plot_mean = mean.loc[(idx_but_one)][metric] - plot_std = std.loc[(idx_but_one)][metric] - - plt.plot(plot_mean, label=label) - plt.fill_between(plot_mean.index, - plot_mean - plot_std, - plot_mean + plot_std, - alpha=0.2) - - -def plot_all(env, - agents, - experts, - lambda_s_choices, - init_path="..", - hps=['lambda_s_eps'], - metric="mean_avg_return", - mode="max", - to_plot="final", - n_epochs=2000): - """ - env: - str - agents: - list of str - init_path: - str - hps: hyperparams to choose - list - metric: - str - mode: - str - to_plot: to plot best final mean or best overall - str: choose between final and overall - n_epochs: - int - """ - assert to_plot in ["overall", "final"] - - plt.figure(figsize=(8, 6)) - for agent in agents: - - experts_copy = experts.copy() - if agent == "SAC": - experts_copy = [experts_copy[0]] - - for expert in experts_copy: - - lambda_s_choices_copy = lambda_s_choices.copy() - if agent == 'SAC' or agent == 'SwitchedSAC': - lambda_s_choices_copy = [lambda_s_choices_copy[0]] - - for type_lambda_s in lambda_s_choices_copy: - - # get analysis - if agent == "SAC": - path = os.path.join(init_path, "ray_results", env, agent) - label = agent - elif agent == "SwitchedSAC": - path = os.path.join(init_path, "ray_results", env, agent, expert) - label = f"{agent}-{expert}" - else: - path = os.path.join(init_path, "ray_results", env, agent, expert, type_lambda_s) - label = f"{agent}-{expert}-{type_lambda_s}" - - main = 
sorted(glob.glob(f"{path}/*"), key=os.path.getmtime)[-1].split('/')[-1] - experiment_checkpoint_path = os.path.join(path, main) - analysis = ExperimentAnalysis(experiment_checkpoint_path, default_metric=metric, default_mode=mode) - - # plot one curve - plot_curves(analysis, - hps, - metric, - to_plot=to_plot, - label=label, - n_epochs=n_epochs) +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + + + +import os +import glob +import pandas as pd +import matplotlib.pyplot as plt +from ray.tune import ExperimentAnalysis +from typing import List, Union, Optional, Tuple + + +def plot_curves(analysis: ExperimentAnalysis, hps: List[str], metric: str, to_plot: Optional[str] = "final", + label: Optional[str] = "SAC", n_epochs: Optional[int] = 2000) -> None: + """ + Plots curves for the specified hyperparameters and metric. + + Parameters: + ----------- + analysis : tune.ray.ExperimentAnalysis + Analysis object from Ray Tune. + hps : List[str] + List of hyperparameters to choose. + metric : str + Metric to plot. + to_plot : str, optional + Choose between "final" and "overall" for best final mean or best overall. + Defaults to "final". + label : str, optional + Label for the plotted curve. Defaults to "SAC". + n_epochs : int, optional + Number of epochs. Defaults to 2000. + + Returns: + ----------- + None + """ + group_by = [f'config/{hp}' for hp in hps if hp != 'repeat_run'] + ['epoch'] + dfs = analysis.trial_dataframes + conf = analysis.get_all_configs() + path = os.path.dirname(list(conf.keys())[0]) + conf = {k: {f'config/{_k}': _v for _k, _v in v.items()} for k, v in conf.items()} + df = pd.concat([dfs[k].assign(**conf[k]) for k in dfs.keys()]) + group = df.groupby(group_by) + mean = group.mean() + std = group.std() + + # if overall or final + if to_plot == "overall": + plot_max_idx = mean[metric].idxmax() + best_dict = {'mean': mean.loc[plot_max_idx], 'std': std.loc[plot_max_idx]} + else: + final_mean = mean.xs(n_epochs - 1, axis=0, level=len(group_by) - 1, drop_level=False) + final_std = std.xs(n_epochs - 1, axis=0, level=len(group_by) - 1, drop_level=False) + plot_max_idx = final_mean[metric].idxmax() + best_dict = {'mean': final_mean.loc[plot_max_idx], 'std': final_std.loc[plot_max_idx]} + + # plot it + idx_but_one = plot_max_idx[:-1] + plot_mean = mean.loc[(idx_but_one)][metric] + plot_std = std.loc[(idx_but_one)][metric] + + plt.plot(plot_mean, label=label) + plt.fill_between(plot_mean.index, + plot_mean - plot_std, + plot_mean + plot_std, + alpha=0.2) + + +def plot_all(env: str, + agents: List[str], + experts: List[str], + lambda_s_choices: List[str], + init_path: Optional[str] = "..", + hps: Optional[List[str]] = ['lambda_s_eps'], + metric: Optional[str] = "mean_avg_return", + mode: Optional[str] = "max", + to_plot: Optional[str] = "final", + n_epochs: Optional[int] = 2000) -> None: + """ + Plots curves for different agents, experts, and lambda_s choices. + + Parameters: + ----------- + env : str + The environment name. + agents : List[str] + List of agent names. + experts : List[str] + List of expert names. + lambda_s_choices : List[str] + List of lambda_s choices. + init_path : str, optional + The initialization path. 
Defaults to "..". + hps : List[str], optional + List of hyperparameters to choose. Defaults to ['lambda_s_eps']. + metric : str, optional + Metric to plot. Defaults to "mean_avg_return". + mode : str, optional + Mode for metric comparison. Defaults to "max". + to_plot : str, optional + Choose between "overall" and "final" for best overall or best final mean. Defaults to "final". + n_epochs : int, optional + Number of epochs. Defaults to 2000. + + Returns: + ----------- + None + """ + assert to_plot in ["overall", "final"] + + plt.figure(figsize=(8, 6)) + for agent in agents: + + experts_copy = experts.copy() + if agent == "SAC": + experts_copy = [experts_copy[0]] + + for expert in experts_copy: + + lambda_s_choices_copy = lambda_s_choices.copy() + if agent == 'SAC' or agent == 'SwitchedSAC': + lambda_s_choices_copy = [lambda_s_choices_copy[0]] + + for type_lambda_s in lambda_s_choices_copy: + + # get analysis + if agent == "SAC": + path = os.path.join(init_path, "ray_results", env, agent) + label = agent + elif agent == "SwitchedSAC": + path = os.path.join(init_path, "ray_results", env, agent, expert) + label = f"{agent}-{expert}" + else: + path = os.path.join(init_path, "ray_results", env, agent, expert, type_lambda_s) + label = f"{agent}-{expert}-{type_lambda_s}" + + main = sorted(glob.glob(f"{path}/*"), key=os.path.getmtime)[-1].split('/')[-1] + experiment_checkpoint_path = os.path.join(path, main) + analysis = ExperimentAnalysis(experiment_checkpoint_path, default_metric=metric, default_mode=mode) + + # plot one curve + plot_curves(analysis, + hps, + metric, + to_plot=to_plot, + label=label, + n_epochs=n_epochs) plt.legend() \ No newline at end of file diff --git a/RLLG/docker/Dockerfile b/RLLG/docker/Dockerfile new file mode 100644 index 00000000..00f8cdc5 --- /dev/null +++ b/RLLG/docker/Dockerfile @@ -0,0 +1,29 @@ +# Use an official Python runtime as a parent image +FROM continuumio/miniconda3:4.10.3 + +# Set the working directory to /app +WORKDIR /app + +# Copy the current directory contents into the container at /app +COPY . /app + +# Create a new Conda environment +RUN conda create --name rllg python=3.8 + +# Activate the Conda environment +SHELL ["conda", "run", "-n", "rllg", "/bin/bash", "-c"] + +# Install the package and its dependencies using setup.py +RUN pip install -e . + +# Install gym +RUN pip install gym==0.21.0 + +# Make port 80 available to the world outside this container +EXPOSE 80 + +# Define environment variable +ENV NAME rllg + +# Run main.py when the container launches +# CMD ["conda", "run", "-n", "rllg", "python", "main.py"] diff --git a/RLLG/envs/ball_in_cup/confidence.py b/RLLG/envs/ball_in_cup/confidence.py index da13bef7..58f4ce8b 100644 --- a/RLLG/envs/ball_in_cup/confidence.py +++ b/RLLG/envs/ball_in_cup/confidence.py @@ -1,37 +1,84 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. 
-
-
-
-
-class LambdaS:
-
-    def __init__(self, pos_tol=None, speed_tol=None):
-        self.pos_tol = pos_tol
-        self.speed_tol = speed_tol
-
-
-    def get_use_local(self, env, observation):
-        # check if ball above cup or not, and check if it is inside the cup
-        cup_x, cup_z, ball_x, ball_z = observation[0], observation[1], observation[2], observation[3]
-        # below cup
-        if ball_z <= cup_z + 0.3:
-            return 1
-        # not inside cup when above cup
-        if 0.3 + cup_z <= ball_z <= cup_z + 0.35:
-            if ball_x > cup_z + 0.05 or ball_x < cup_z - 0.05:
-                return 1
-        return 0
-
-def ball_in_cup_lambda_s(expert,
-                         device="cpu",
-                         pos_tol=None,
-                         speed_tol=None,
-                         smoothed=None):
-    return LambdaS()
+# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved.
+
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the MIT license.
+
+# This program is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. See the MIT License for more details.
+
+from typing import Any, List, Optional
+
+
+class LambdaS:
+    """
+    Class representing the confidence function.
+
+    Parameters:
+    ----------
+    pos_tol : float or None, optional
+        Position tolerance (default is None).
+    speed_tol : float or None, optional
+        Speed tolerance (default is None).
+    """
+
+    def __init__(self, pos_tol: Optional[float] = None, speed_tol: Optional[float] = None):
+        self.pos_tol = pos_tol
+        self.speed_tol = speed_tol
+
+
+    def get_use_local(self, env: Any, observation: List) -> float:
+        """
+        Get the lambda_s value based on the environment and observation.
+
+        Parameters:
+        ----------
+        env : Any
+            The environment.
+        observation : list or np.ndarray
+            The observation.
+
+        Returns:
+        ----------
+        float
+            Use_local value (0 or 1).
+        """
+        # check if ball above cup or not, and check if it is inside the cup
+        cup_x, cup_z, ball_x, ball_z = observation[0], observation[1], observation[2], observation[3]
+        # below cup
+        if ball_z <= cup_z + 0.3:
+            return 1
+        # not inside cup when above cup
+        if 0.3 + cup_z <= ball_z <= cup_z + 0.35:
+            if ball_x > cup_z + 0.05 or ball_x < cup_z - 0.05:
+                return 1
+        return 0
+
+
+def ball_in_cup_lambda_s(expert: Any,
+                         device: str = "cpu",
+                         pos_tol: Optional[float] = None,
+                         speed_tol: Optional[float] = None,
+                         smoothed: Optional[bool] = None) -> LambdaS:
+    """
+    Returns the confidence LambdaS instance for the ball-in-cup environment.
+
+    Parameters:
+    ----------
+    expert : Any
+        Expert (not used, but here in case the lambda_s depends on the expert).
+    device : str, optional
+        Device for computation (default is 'cpu').
+    pos_tol : float or None, optional
+        Position tolerance (default is None).
+    speed_tol : float or None, optional
+        Speed tolerance (default is None).
+    smoothed : bool or None, optional
+        Whether to use the smoothed lambda_s (default is None).
+
+    Returns:
+    ----------
+    LambdaS
+        The LambdaS instance.
+    """
+    return LambdaS()
diff --git a/RLLG/envs/ball_in_cup/create_ball_in_cup.py b/RLLG/envs/ball_in_cup/create_ball_in_cup.py
index 17ed5a03..3f975510 100644
--- a/RLLG/envs/ball_in_cup/create_ball_in_cup.py
+++ b/RLLG/envs/ball_in_cup/create_ball_in_cup.py
@@ -1,55 +1,77 @@
-# 2023.02.14-Changed for RLLG
-# Huawei Technologies Co., Ltd.
-
-# Copyright (c) Deepmind dm-control.
-
-# All rights reserved.
-
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
- - - -import dmc2gym -from envs.ball_in_cup.local_expert_policy import SACExpert -import os -from types import MethodType - - -def create_ball_in_cup_and_control(orig_cwd='./', - device="cpu"): - # create env - env = dmc2gym.make('ball_in_cup', 'catch') - - # modify initialization - def new_initialize_episode(self, physics): - """Sets the state of the environment at the start of each episode. - Args: - physics: An instance of `Physics`. - """ - # Find a collision-free random initial position of the ball. - penetrating = True - while penetrating: - # Assign a random ball position. - physics.named.data.qpos['ball_x'] = self.random.uniform(-.2, .2) - physics.named.data.qpos['ball_z'] = self.random.uniform(.0, .25) - # Check for collisions. - physics.after_reset() - penetrating = physics.data.ncon > 0 - self.after_step(physics) - - try: - env.env._env._task.initialize_episode = MethodType(new_initialize_episode, env.env._env._task) - except AttributeError: - env.env.env._task.initialize_episode = MethodType(new_initialize_episode, env.env.env._task) - - path = os.path.join(orig_cwd, 'envs', 'ball_in_cup', "models") - - control_dict = { - "MediumSAC": { - "coord": None, - "local_expert": SACExpert(env, path, device) - }, - } - - return env, control_dict \ No newline at end of file +# 2023.02.14-Changed for RLLG +# Huawei Technologies Co., Ltd. + +# Copyright (c) Deepmind dm-control. + +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + + + +import dmc2gym +from envs.ball_in_cup.local_expert_policy import SACExpert +import os +from types import MethodType +from typing import Any, Tuple, Dict + + +def create_ball_in_cup_and_control(orig_cwd: str = './', + device: str = "cpu") -> Tuple[Any, Dict]: + """ + Create the ball in cup environment and its control (local expert) dictionary. + + Parameters: + ---------- + orig_cwd : str, optional + Original current working directory (default is './') + device : str, optional + Device (default is 'cpu') + + Returns: + ---------- + Any + The ball in cup environment. + dict + The control dictionary + """ + # create env + env = dmc2gym.make('ball_in_cup', 'catch') + + # modify initialization + def new_initialize_episode(self, physics: Any) -> None: + """ + Sets the state of the environment at the start of each episode. + + Parameters: + ---------- + physics: Any + An instance of `Physics` + """ + # Find a collision-free random initial position of the ball. + penetrating = True + while penetrating: + # Assign a random ball position. + physics.named.data.qpos['ball_x'] = self.random.uniform(-.2, .2) + physics.named.data.qpos['ball_z'] = self.random.uniform(.0, .25) + # Check for collisions. 
+ physics.after_reset() + penetrating = physics.data.ncon > 0 + self.after_step(physics) + + try: + env.env._env._task.initialize_episode = MethodType(new_initialize_episode, env.env._env._task) + except AttributeError: + env.env.env._task.initialize_episode = MethodType(new_initialize_episode, env.env.env._task) + + path = os.path.join(orig_cwd, 'envs', 'ball_in_cup', "models") + + control_dict = { + "MediumSAC": { + "coord": None, + "local_expert": SACExpert(env, path, device) + }, + } + + return env, control_dict diff --git a/RLLG/envs/ball_in_cup/local_expert_policy.py b/RLLG/envs/ball_in_cup/local_expert_policy.py index c0f71ffd..795c11ac 100644 --- a/RLLG/envs/ball_in_cup/local_expert_policy.py +++ b/RLLG/envs/ball_in_cup/local_expert_policy.py @@ -1,46 +1,75 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - - - -import numpy as np -import torch -import os - -class SACExpert: - - def __init__(self, env, path, device="cpu"): - - from agents.common.model import TanhGaussianPolicy, SamplerPolicy - # hyper-params - policy_arch = '64-64' - policy_log_std_multiplier = 1.0 - policy_log_std_offset = -1.0 - - # load expert policy - expert_policy = TanhGaussianPolicy( - env.observation_space.shape[0], - env.action_space.shape[0], - policy_arch, - log_std_multiplier=policy_log_std_multiplier, - log_std_offset=policy_log_std_offset, - ) - glob_path = os.path.join(path, 'medium_expert_sac') - expert_policy.load_state_dict(torch.load(glob_path)) - expert_policy.to(device) - self.sampling_expert_policy = SamplerPolicy(expert_policy, device=device) - - def get_action(self, observation, init_action=None, env=None): - with torch.no_grad(): - expert_action = self.sampling_expert_policy( - np.expand_dims(observation, 0), deterministic=True - )[0, :] - return np.clip(expert_action, a_min=-0.99, a_max=0.99) # expert_action +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + +from typing import Any, Optional +import numpy as np +import torch +import os + + + +class SACExpert: + """ + Soft Actor-Critic (SAC) Expert. + + Parameters: + ---------- + env : Any + The environment (usually dm control env, could be gym as well or others). + path : str + The path to the model. + device : str, optional + The device to run the expert policy (default is 'cpu'). 
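+
+    Example:
+    ----------
+    Illustrative sketch (assumes env exposes Gym-style observation_space and
+    action_space, and that path contains the saved medium_expert_sac weights):
+
+        expert = SACExpert(env, path, device="cpu")
+        action = expert.get_action(observation)   # clipped to [-0.99, 0.99]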
+ """ + + def __init__(self, env: Any, path: str, device: Optional[str] = "cpu") -> None: + from agents.common.model import TanhGaussianPolicy, SamplerPolicy + # hyper-params + policy_arch = '64-64' + policy_log_std_multiplier = 1.0 + policy_log_std_offset = -1.0 + + # load expert policy + expert_policy = TanhGaussianPolicy( + env.observation_space.shape[0], + env.action_space.shape[0], + policy_arch, + log_std_multiplier=policy_log_std_multiplier, + log_std_offset=policy_log_std_offset, + ) + glob_path = os.path.join(path, 'medium_expert_sac') + expert_policy.load_state_dict(torch.load(glob_path)) + expert_policy.to(device) + self.sampling_expert_policy = SamplerPolicy(expert_policy, device=device) + + def get_action(self, observation: np.ndarray, init_action: Optional[np.ndarray] = None, env: Optional[Any] = None) \ + -> np.ndarray: + """ + Get an action from the SAC expert policy. + + Parameters: + ---------- + observation : numpy.ndarray + The observation from the environment. + init_action : Any, optional + Initial action (default is None). + env : gym.Env, optional + The environment (default is None). + + Returns: + ---------- + numpy.ndarray + The clipped expert action. + """ + with torch.no_grad(): + expert_action = self.sampling_expert_policy( + np.expand_dims(observation, 0), deterministic=True + )[0, :] + return np.clip(expert_action, a_min=-0.99, a_max=0.99) diff --git a/RLLG/envs/ball_in_cup/models/near_expert_sac_650 b/RLLG/envs/ball_in_cup/models/near_expert_sac_650 new file mode 100644 index 0000000000000000000000000000000000000000..4b0ee88ec0f8b23d7689ea6e139ee30f024b70db GIT binary patch literal 22735 zcmb5V2{={n_dZPKDJ4@xNQFunE9X4x7)lBmN>PSP8KV=LBr_SB3{jawQX&nC{j9Cf zq|!i|C=Du2k|r8n-|y%BUH||0|GvNXcfIGj_O-9G*E;vU_w(#$?X}jvkE5-Suz-NL zxWND8GeJO3z}L2iME2te7S!hCq?q*o%ss? 
[GIT binary patch data omitted: the two new binary files
 RLLG/envs/ball_in_cup/models/near_expert_sac_650 and
 RLLG/envs/ball_in_cup/models/near_expert_sac_780 (near-expert SAC model
 checkpoints) are added here as base85-encoded blobs that are not reproduced.]

diff --git a/RLLG/envs/bullet_small_reach/bullet_small_reach.py b/RLLG/envs/bullet_small_reach/bullet_small_reach.py
index 0f42bc92..142e9caa 100644
--- a/RLLG/envs/bullet_small_reach/bullet_small_reach.py
+++ b/RLLG/envs/bullet_small_reach/bullet_small_reach.py
@@ -1,33 +1,77 @@
-# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved.
-
-# This program is free software; you can redistribute it and/or modify it under
-# the terms of the MIT license.
-
-# This program is distributed in the hope that it will be useful, but WITHOUT ANY
-# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-# PARTICULAR PURPOSE. See the MIT License for more details.
-
-
-
-
-class BulletBallSmallReach:
-
-    def __init__(self, env):
-        self.env = env
-        self.observation_space = self.env.observation_space
-        self.action_space = self.env.action_space
-
-    def step(self, action):
-        obs, reward, done, info = self.env.step(action)
-        if 'cost_collisions' in info:
-            if info['cost_collisions'] >= 0.5:
-                # print('catastrophic')
-                reward = -1000
-                done = True
-        return obs, reward, done, info
-
-    def render(self, mode="human"):
-        return self.env.render(mode)
-
-    def reset(self):
-        return self.env.reset()
+# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved.
+
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the MIT license.
+
+# This program is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. See the MIT License for more details.
+
+
+from typing import Any, Tuple, Dict, Optional
+import numpy as np
+
+
+class BulletBallSmallReach:
+    """
+    Wrapper for the Bullet Reach environment to change the constraint function into a bad reward.
+
+    Parameters:
+    ----------
+    env : Any
+        The environment to wrap.
+ """ + + def __init__(self, env: Any) -> None: + self.env = env + self.observation_space = self.env.observation_space + self.action_space = self.env.action_space + + def step(self, action: np.ndarray) -> Tuple[Any, float, bool, Dict]: + """ + Step through the environment dynamics and change reward function. + + Parameters: + ---------- + action : Any + The action to be executed. + + Returns: + ---------- + tuple + Observation, reward, done, and info. + """ + obs, reward, done, info = self.env.step(action) + if 'cost_collisions' in info: + if info['cost_collisions'] >= 0.5: + # print('catastrophic') + reward = -1000 + done = True + return obs, reward, done, info + + def render(self, mode: Optional[str] = "human") -> Any: + """ + Render the environment. + + Parameters: + ---------- + mode : str, optional + Rendering mode (default is "human"). + + Returns: + ---------- + Any + The rendering output. + """ + return self.env.render(mode) + + def reset(self) -> np.ndarray: + """ + Reset the environment. + + Returns: + ---------- + Any + The reset observation. + """ + return self.env.reset() diff --git a/RLLG/envs/bullet_small_reach/confidence.py b/RLLG/envs/bullet_small_reach/confidence.py index 3df7ab9c..d409c4bc 100644 --- a/RLLG/envs/bullet_small_reach/confidence.py +++ b/RLLG/envs/bullet_small_reach/confidence.py @@ -1,36 +1,79 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - - -import numpy as np - - -class LambdaS: - - def __init__(self, pos_tol=3.): - self.pos_tol = pos_tol - - def get_use_local(self, env, observation): - agent_pos = env.env.env.agent.get_position()[:2] - pos_los = [obstacle.get_position()[:2] for obstacle in env.env.obstacles] - min_distance = np.min(np.linalg.norm(np.vstack(pos_los) - agent_pos, axis=1)) - if abs(min_distance) <= self.pos_tol: - return 1 - return 0 - - -def bullet_small_reach_lambda_s(expert, - device="cpu", - pos_tol=None, - speed_tol=None, - smoothed=False - ): - return LambdaS() +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + + +from typing import Union, Any, Dict, List, Optional, Tuple, Callable +import numpy as np + + +class LambdaS: + """ + Class representing the confidence function. + + Parameters: + ---------- + pos_tol : float, optional + Position tolerance (default is 3.) + """ + + def __init__(self, pos_tol: float = 3.): + self.pos_tol = pos_tol + + def get_use_local(self, env: Any, observation: List) -> float: + """ + Get the lambda s value based on the environment and observation. + + Parameters: + ---------- + env : Any + The environment + observation : list of array + The observation. + + Returns: + ---------- + float + Use_local value (0 or 1). 
+ """ + agent_pos = env.env.env.agent.get_position()[:2] + pos_los = [obstacle.get_position()[:2] for obstacle in env.env.obstacles] + min_distance = np.min(np.linalg.norm(np.vstack(pos_los) - agent_pos, axis=1)) + if abs(min_distance) <= self.pos_tol: + return 1 + return 0 + + +def bullet_small_reach_lambda_s(expert: Any, + device: str = "cpu", + pos_tol: float = None, + speed_tol: float = None, + smoothed: bool = None) -> LambdaS: + """ + Returns the confidence LambdaS instance for the bullet reach environment. + + Parameters: + ---------- + expert : Any + Expert (not used, but here in case the lambda_s depends on the expert). + device : str, optional + Device for computation (default is 'cpu') + pos_tol : float or None, optional + Position tolerance (default is None) + speed_tol : float or None, optional + Speed tolerance (default is None) + smoothed : bool or None, optional + Whether to use smoothed lambda_s (default is None) + + Returns: + ---------- + LambdaS + The LambdaS instance + """ + return LambdaS() diff --git a/RLLG/envs/bullet_small_reach/create_bullet_small_reach.py b/RLLG/envs/bullet_small_reach/create_bullet_small_reach.py index c1263ede..2faba864 100644 --- a/RLLG/envs/bullet_small_reach/create_bullet_small_reach.py +++ b/RLLG/envs/bullet_small_reach/create_bullet_small_reach.py @@ -1,35 +1,53 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - - -try: - import gym - import bullet_safety_gym -except ModuleNotFoundError: - pass -from envs.bullet_small_reach.bullet_small_reach import BulletBallSmallReach -from envs.bullet_small_reach.local_expert_policy import SafeScripted - - -def create_bullet_small_reach_and_control(orig_cwd='./', - device="cpu"): - - env = BulletBallSmallReach(gym.make('SafetyBallSmallReach-v0')) - - # create controller - control_dict = { - "SafeScripted": { - "coord": None, - "local_expert": SafeScripted() - }, - } - - return env, control_dict +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + + + +try: + import gym + import bullet_safety_gym +except ModuleNotFoundError: + pass +from typing import Any, Tuple, Dict +from envs.bullet_small_reach.bullet_small_reach import BulletBallSmallReach +from envs.bullet_small_reach.local_expert_policy import SafeScripted + + +def create_bullet_small_reach_and_control(orig_cwd: str = './', + device: str = "cpu") -> Tuple[Any, Dict]: + """ + Create the Bullet Small Reach environment and its control dictionary. + + Parameters: + ---------- + orig_cwd : str, optional + Original current working directory (default is './') + device : str, optional + Device (default is 'cpu') + + Returns: + ---------- + Any + The Bullet Small Reach environment. + dict + The control dictionary. 
+ """ + + env = BulletBallSmallReach(gym.make('SafetyBallSmallReach-v0')) + + # create controller + control_dict = { + "SafeScripted": { + "coord": None, + "local_expert": SafeScripted() + }, + } + + return env, control_dict diff --git a/RLLG/envs/bullet_small_reach/local_expert_policy.py b/RLLG/envs/bullet_small_reach/local_expert_policy.py index 2ba72da7..7af98c03 100644 --- a/RLLG/envs/bullet_small_reach/local_expert_policy.py +++ b/RLLG/envs/bullet_small_reach/local_expert_policy.py @@ -1,52 +1,100 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - - -import numpy as np - - -class SafeScripted: - - def __init__(self): - pass - - def get_quarter_position(self, agent, obstacle): - pos_x, pos_y = agent.get_position()[:2] - obstacle_x, obstacle_y = obstacle.get_position()[:2] - if pos_x <= obstacle_x: - if pos_y <= obstacle_y: - return 'below-left' - return 'above-left' - if pos_y <= obstacle_y: - return 'below-right' - return 'above-right' - - def get_closest_obstacle(self, env): - agent_pos = env.env.env.agent.get_position()[:2] - pos_los = [obstacle.get_position()[:2] for obstacle in env.env.obstacles] - return np.argmin(np.linalg.norm(np.vstack(pos_los) - agent_pos, axis=1)) - - def get_action(self, observation, init_action=None, env=None): - - # get closest obstacle - id = self.get_closest_obstacle(env) - - # get quarter for chosen obstacle - quarter = self.get_quarter_position(env.env.env.agent, env.env.obstacles[id]) - - if quarter == 'below-left': - return np.array([ -0.999, -0.999 ]) - elif quarter == 'below-right': - return np.array([ 0.999, -0.999 ]) - elif quarter == 'above-left': - return np.array([ -0.999, 0.999 ]) - elif quarter == 'above-right': - return np.array([ 0.999, 0.999 ]) +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + +from typing import Any, Optional +import numpy as np + + +class SafeScripted: + """ + SafeScripted class for scripted control. + """ + + def __init__(self) -> None: + pass + + def get_quarter_position(self, agent: Any, obstacle: Any) -> str: + """ + Get the quarter position. + + Parameters: + ---------- + agent : Any + The agent object. + obstacle : Any + The obstacle object. + + Returns: + ---------- + str + The quarter position ('below-left', 'below-right', 'above-left', 'above-right'). + """ + pos_x, pos_y = agent.get_position()[:2] + obstacle_x, obstacle_y = obstacle.get_position()[:2] + if pos_x <= obstacle_x: + if pos_y <= obstacle_y: + return 'below-left' + return 'above-left' + if pos_y <= obstacle_y: + return 'below-right' + return 'above-right' + + def get_closest_obstacle(self, env: Any) -> int: + """ + Get the index of the closest obstacle. + + Parameters: + ---------- + env : Any + The environment object. + + Returns: + ---------- + int + The index of the closest obstacle. 
+ """ + agent_pos = env.env.env.agent.get_position()[:2] + pos_los = [obstacle.get_position()[:2] for obstacle in env.env.obstacles] + return np.argmin(np.linalg.norm(np.vstack(pos_los) - agent_pos, axis=1)) + + def get_action(self, observation: np.ndarray, init_action: Optional[Any] = None, env: Optional[Any] = None)\ + -> np.ndarray: + """ + Get the action for scripted control. + + Parameters: + ---------- + observation : Any + The observation. + init_action : Any, optional + The initial action (default is None). + env : Any, optional + The environment object (default is None). + + Returns: + ---------- + np.ndarray + The scripted action. + """ + + # get closest obstacle + id = self.get_closest_obstacle(env) + + # get quarter for chosen obstacle + quarter = self.get_quarter_position(env.env.env.agent, env.env.obstacles[id]) + + if quarter == 'below-left': + return np.array([ -0.999, -0.999 ]) + elif quarter == 'below-right': + return np.array([ 0.999, -0.999 ]) + elif quarter == 'above-left': + return np.array([ -0.999, 0.999 ]) + elif quarter == 'above-right': + return np.array([ 0.999, 0.999 ]) diff --git a/RLLG/envs/cartpole/confidence.py b/RLLG/envs/cartpole/confidence.py index cdb4bacf..0f861f5e 100644 --- a/RLLG/envs/cartpole/confidence.py +++ b/RLLG/envs/cartpole/confidence.py @@ -1,46 +1,91 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - - -from math import exp - - -class LambdaS: - - def __init__(self, - pos_tol=None, - speed_tol=None, - smoothed=False): - self.pos_tol = pos_tol - self.speed_tol = speed_tol - self.smoothed = smoothed - - def get_use_local(self, env, observation): - abs_pos = abs(observation[0]) - if self.smoothed: - if abs_pos < 0.5: - return 0 - elif abs_pos > 1.2: - return 1 - return exp(- 3 * (1.2 - abs_pos)) - else: - if 1.9 - abs_pos < abs(self.pos_tol): - return 1 - return 0 - - -def cartpole_lambda_s(expert, - device="cpu", - pos_tol=None, - speed_tol=None, - smoothed=False - ): - return LambdaS(pos_tol=pos_tol, speed_tol=speed_tol, smoothed=smoothed) +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + +from typing import Union, Any, Dict, List, Optional, Tuple, Callable +from math import exp + + +class LambdaS: + """ + Class representing the confidence function. + + Parameters: + ---------- + pos_tol : float or None, optional + Position tolerance (default is None) + speed_tol : float or None, optional + Speed tolerance (default is None) + smoothed : bool, optional + Have a smooth confidence function or not (default is 3.) 
+ """ + + def __init__(self, + pos_tol: float = None, + speed_tol: float = None, + smoothed: bool = False): + self.pos_tol = pos_tol + self.speed_tol = speed_tol + self.smoothed = smoothed + + def get_use_local(self, env: Any, observation: List) -> float: + """ + Get the lambda s value based on the environment and observation. + + Parameters: + ---------- + env : Any + The environment + observation : list of array + The observation. + + Returns: + ---------- + float + Use_local value (0 or 1). + """ + abs_pos = abs(observation[0]) + if self.smoothed: + if abs_pos < 0.5: + return 0 + elif abs_pos > 1.2: + return 1 + return exp(- 3 * (1.2 - abs_pos)) + else: + if 1.9 - abs_pos < abs(self.pos_tol): + return 1 + return 0 + + +def cartpole_lambda_s(expert: Any, + device: str = "cpu", + pos_tol: float = None, + speed_tol: float = None, + smoothed: bool = None) -> LambdaS: + """ + Returns the confidence LambdaS instance for the cartpole environment. + + Parameters: + ---------- + expert : Any + Expert (not used, but here in case the lambda_s depends on the expert). + device : str, optional + Device for computation (default is 'cpu') + pos_tol : float or None, optional + Position tolerance (default is None) + speed_tol : float or None, optional + Speed tolerance (default is None) + smoothed : bool or None, optional + Whether to use smoothed lambda_s (default is None) + + Returns: + ---------- + LambdaS + The LambdaS instance + """ + return LambdaS(pos_tol=pos_tol, speed_tol=speed_tol, smoothed=smoothed) diff --git a/RLLG/envs/cartpole/create_cartpole.py b/RLLG/envs/cartpole/create_cartpole.py index e6fd4ba9..6c4b3408 100644 --- a/RLLG/envs/cartpole/create_cartpole.py +++ b/RLLG/envs/cartpole/create_cartpole.py @@ -1,60 +1,114 @@ -# 2023.02.14-Changed for RLLG -# Huawei Technologies Co., Ltd. - -# Copyright (c) 2020 dm-control (https://github.com/deepmind/dm_control). - -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - -# The initialization and termination function of the environment has been slightly changed from the original one. 
- - -import dmc2gym -from envs.cartpole.local_expert_policy import SafeScripted -import os -from types import MethodType - -def new_get_termination(limit_cart=0.6): - def new_get_termination_fn(self, physics): - pos = physics.named.data.qpos['slider'][0] - if abs(pos) > limit_cart: - return 1 - return new_get_termination_fn - -def new_get_reward(limit_cart=0.6, reward_end=1): - def new_get_reward_fn(self, physics): - """Returns a sparse or a smooth reward, as specified in the constructor.""" - pos = physics.named.data.qpos['slider'][0] - if abs(pos) > limit_cart: - return -reward_end - return self._get_reward(physics, sparse=self._sparse) - return new_get_reward_fn - - -def create_cartpole_and_control(orig_cwd='./', - device="cpu", - task_name="swingup", - limit_cart=0.6, - reward_end=1, - pos_tol=1.): - - # create env - env = dmc2gym.make(domain_name="cartpole", task_name=task_name) - - # change termination and reward function - env.env.task.get_termination = MethodType(new_get_termination(limit_cart=limit_cart), env.env.task) - env.env.task.get_reward = MethodType(new_get_reward(limit_cart=limit_cart, - reward_end=reward_end), env.env.task) - - # create controller - path = os.path.join(orig_cwd, 'envs', 'cartpole', "models") - control_dict = { - "SafeScripted": { - "coord": None, - "local_expert": SafeScripted() - }, - } - - return env, control_dict +# 2023.02.14-Changed for RLLG +# Huawei Technologies Co., Ltd. + +# Copyright (c) 2020 dm-control (https://github.com/deepmind/dm_control). + +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# The initialization and termination function of the environment has been slightly changed from the original one. + + +import dmc2gym +from envs.cartpole.local_expert_policy import SafeScripted +import os +from types import MethodType +from typing import Callable, Tuple, Any, Dict + + +def new_get_termination(limit_cart: float = 0.6) -> Callable: + """ + Return a new termination function for the Cartpole environment. + + Parameters: + ---------- + limit_cart : float, optional + Cart position limit (default is 0.6). + + Returns: + ---------- + Callable + New termination function. + """ + def new_get_termination_fn(self, physics): + pos = physics.named.data.qpos['slider'][0] + if abs(pos) > limit_cart: + return 1 + return new_get_termination_fn + + +def new_get_reward(limit_cart: float = 0.6, reward_end: int = 1) -> Callable: + """ + Return a new reward function for the Cartpole environment + + Parameters: + ---------- + limit_cart : float, optional + Cart position limit (default is 0.6) + reward_end : int, optional + Reward value when the limit is reached (default is 1) + + Returns: + ---------- + Callable + New reward function + """ + def new_get_reward_fn(self, physics): + """Returns a sparse or a smooth reward, as specified in the constructor.""" + pos = physics.named.data.qpos['slider'][0] + if abs(pos) > limit_cart: + return -reward_end + return self._get_reward(physics, sparse=self._sparse) + return new_get_reward_fn + + +def create_cartpole_and_control(orig_cwd: str = './', + device: str = "cpu", + task_name: str = "swingup", + limit_cart: float = 0.6, + reward_end: int = 1, + pos_tol: float = 1.) 
-> Tuple[Any, Dict]: + """ + Create the Cartpole environment and its control dictionary + + Parameters: + ---------- + orig_cwd : str, optional + Original current working directory (default is './') + device : str, optional + Device (default is 'cpu') + task_name : str, optional + Task name (default is 'swingup') + limit_cart : float, optional + Cart position limit (default is 0.6) + reward_end : int, optional + Reward value when the limit is reached (default is 1) + pos_tol : float, optional + Position tolerance (default is 1.) + + Returns: + ---------- + Tuple[Any, dict] + The Cartpole environment and the control dictionary + """ + + # create env + env = dmc2gym.make(domain_name="cartpole", task_name=task_name) + + # change termination and reward function + env.env.task.get_termination = MethodType(new_get_termination(limit_cart=limit_cart), env.env.task) + env.env.task.get_reward = MethodType(new_get_reward(limit_cart=limit_cart, + reward_end=reward_end), env.env.task) + + # create controller + path = os.path.join(orig_cwd, 'envs', 'cartpole', "models") + control_dict = { + "SafeScripted": { + "coord": None, + "local_expert": SafeScripted() + }, + } + + return env, control_dict diff --git a/RLLG/envs/cartpole/local_expert_policy.py b/RLLG/envs/cartpole/local_expert_policy.py index 6e8f3d2b..8e49af76 100644 --- a/RLLG/envs/cartpole/local_expert_policy.py +++ b/RLLG/envs/cartpole/local_expert_policy.py @@ -1,25 +1,43 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - - -import numpy as np - - -class SafeScripted: - - def __init__(self): - pass - - def get_action(self, observation, init_action=None): - pos = observation[0] - if pos > 0: - return np.float32(np.array([-0.999])) - return np.float32(np.array([0.999])) +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + + +from typing import Optional +import numpy as np + + +class SafeScripted: + """ + SafeScripted class for scripted control. + """ + + def __init__(self) -> None: + pass + + def get_action(self, observation: np.ndarray, init_action: Optional[np.ndarray] = None) -> np.ndarray: + """ + Get the action for scripted control. + + Parameters: + ---------- + observation : np.ndarray + The observation. + init_action : Any, optional + The initial action (default is None). + + Returns: + ---------- + np.ndarray + The scripted action. + """ + pos = observation[0] + if pos > 0: + return np.float32(np.array([-0.999])) + return np.float32(np.array([0.999])) diff --git a/RLLG/envs/confidence.py b/RLLG/envs/confidence.py index 392c14b2..36d297bc 100644 --- a/RLLG/envs/confidence.py +++ b/RLLG/envs/confidence.py @@ -1,42 +1,65 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. 
- -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - -from envs.cartpole.confidence import cartpole_lambda_s -from envs.ball_in_cup.confidence import ball_in_cup_lambda_s -from envs.point_mass.confidence import point_mass_lambda_s -from envs.point_circle.confidence import point_circle_lambda_s -from envs.bullet_small_reach.confidence import bullet_small_reach_lambda_s -from envs.hirl_point_fall.confidence import hirl_point_fall_lambda_s - - -dict_norm_to_expert = { - 'cartpole': cartpole_lambda_s, - 'ball_in_cup': ball_in_cup_lambda_s, - 'point_mass': point_mass_lambda_s, - 'point_circle': point_circle_lambda_s, - 'hirl_point_fall': hirl_point_fall_lambda_s, - 'bullet_small_reach': bullet_small_reach_lambda_s, -} - - -def global_lambda_s(glob_name, - experts, - device="cpu", - pos_tol=None, - speed_tol=None, - smoothed=False - ): - return dict_norm_to_expert[glob_name](experts, - device=device, - pos_tol=pos_tol, - speed_tol=speed_tol, - smoothed=smoothed - ) +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + +from typing import Union, Any, List +import torch +from envs.cartpole.confidence import cartpole_lambda_s +from envs.ball_in_cup.confidence import ball_in_cup_lambda_s +from envs.point_mass.confidence import point_mass_lambda_s +from envs.point_circle.confidence import point_circle_lambda_s +from envs.bullet_small_reach.confidence import bullet_small_reach_lambda_s +from envs.hirl_point_fall.confidence import hirl_point_fall_lambda_s + + +dict_norm_to_expert = { + 'cartpole': cartpole_lambda_s, + 'ball_in_cup': ball_in_cup_lambda_s, + 'point_mass': point_mass_lambda_s, + 'point_circle': point_circle_lambda_s, + 'hirl_point_fall': hirl_point_fall_lambda_s, + 'bullet_small_reach': bullet_small_reach_lambda_s, +} + + +def global_lambda_s(glob_name: str, + experts: List[torch.nn.Module], + device: str = "cpu", + pos_tol: Union[float, None] = None, + speed_tol: Union[float, None] = None, + smoothed: bool = False) -> Any: + """ + Returns the confidence lambda_s function based on the specified environment type. + + Parameters: + ---------- + glob_name : str + Name representing the environment type. + experts : List[torch.nn.Module] + List of expert models. + device : str, optional + Device for computation (default is 'cpu') + pos_tol : float or None, optional + Position tolerance (default is None) + speed_tol : float or None, optional + Speed tolerance (default is None) + smoothed : bool, optional + Whether to use smoothed lambda_s (default is False) + + Returns: + ---------- + Any + The global confidence lambda_s function. 
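# Illustrative call of this dispatcher together with get_env_and_control from
# envs/creation.py (also in this patch). A sketch, not part of the patch; it assumes
# dmc2gym and dm_control are installed so the cartpole environment can be built.
from envs.creation import get_env_and_control
from envs.confidence import global_lambda_s

env, control_dict = get_env_and_control(name='cartpole-swingup', limit_cart=0.6, pos_tol=1.0)
lambda_s = global_lambda_s('cartpole', experts=[], pos_tol=1.0, smoothed=False)
obs = env.reset()
use_local = lambda_s.get_use_local(env, obs)  # 0 or 1 depending on the cart position and pos_tol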
+ """ + return dict_norm_to_expert[glob_name](experts, + device=device, + pos_tol=pos_tol, + speed_tol=speed_tol, + smoothed=smoothed + ) diff --git a/RLLG/envs/creation.py b/RLLG/envs/creation.py index b7876083..cd08aa5d 100644 --- a/RLLG/envs/creation.py +++ b/RLLG/envs/creation.py @@ -1,71 +1,87 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - -from envs.cartpole.create_cartpole import create_cartpole_and_control -from envs.ball_in_cup.create_ball_in_cup import create_ball_in_cup_and_control -from envs.point_mass.create_point_mass import create_point_mass_and_control -from envs.point_circle.create_point_circle import create_point_cirlce_and_control -from envs.bullet_small_reach.create_bullet_small_reach import create_bullet_small_reach_and_control -from envs.hirl_point_fall.create_hirl_point_fall import create_hirl_point_fall_and_control - - -dict_fn = { - 'cartpole': create_cartpole_and_control, - 'ball_in_cup': create_ball_in_cup_and_control, - 'point_mass': create_point_mass_and_control, - 'point_circle': create_point_cirlce_and_control, - 'bullet_small_reach': create_bullet_small_reach_and_control, - 'hirl_point_fall': create_hirl_point_fall_and_control, -} - - -def get_env_and_control(name='ball_in_cup', - orig_cwd='./', - device='cpu', - limit_cart=0.6, - reward_end=1, - pos_tol=1.): - """ - Returns required env and local(s) controller. - The env is a Gym environment. - The local controller is a dictionary with: - - key: point where the linearization happened - - value: control function (taking the state as an argument) - """ - kwargs = {} - - # get glob name - if 'pendulum' in name: - glob_name = 'pendulum' - elif 'cartpole' in name: - glob_name = 'cartpole' - kwargs.update({'limit_cart': limit_cart, 'reward_end': reward_end, 'pos_tol': pos_tol}) - elif 'point_mass' in name: - glob_name = 'point_mass' - elif 'hirl_point_fall' in name: - glob_name = 'hirl_point_fall' - if 'move_block_only' in name: - kwargs = {'move_block_only': True} - else: - glob_name = name - - if "sparse" in name: - kwargs.update({'sparse': True}) - - if "cartpole" in name: - kwargs.update({'task_name': name.split('-')[-1]}) - - # get env and control - env, dict_control = dict_fn[glob_name](orig_cwd=orig_cwd, - device=device, - **kwargs) - - return env, dict_control +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. 
+ + +from typing import Union, Any, Dict, List, Optional, Tuple, Callable +from envs.cartpole.create_cartpole import create_cartpole_and_control +from envs.ball_in_cup.create_ball_in_cup import create_ball_in_cup_and_control +from envs.point_mass.create_point_mass import create_point_mass_and_control +from envs.point_circle.create_point_circle import create_point_cirlce_and_control +from envs.bullet_small_reach.create_bullet_small_reach import create_bullet_small_reach_and_control +from envs.hirl_point_fall.create_hirl_point_fall import create_hirl_point_fall_and_control + + +dict_fn = { + 'cartpole': create_cartpole_and_control, + 'ball_in_cup': create_ball_in_cup_and_control, + 'point_mass': create_point_mass_and_control, + 'point_circle': create_point_cirlce_and_control, + 'bullet_small_reach': create_bullet_small_reach_and_control, + 'hirl_point_fall': create_hirl_point_fall_and_control, +} + + +def get_env_and_control(name: str = 'ball_in_cup', + orig_cwd: str = './', + device: str = 'cpu', + limit_cart: Optional[float] = 0.6, + reward_end: Optional[int] = 1, + pos_tol: Optional[float] = 1.) -> Tuple[Any, Dict[Union[str, Tuple[float, float]], Callable]]: + """ + Returns the environment and local controller. + + Parameters: + ---------- + name : str, optional + Name of the environment (default is 'ball_in_cup') + orig_cwd : str, optional + Original working directory (default is './') + device : str, optional + Device for computation (default is 'cpu') + limit_cart : float, optional + Limit for the cart (default is 0.6) + reward_end : int, optional + Reward at the end (default is 1) + pos_tol : float, optional + Position tolerance (default is 1.0) + + Returns: + ---------- + Tuple[Any, Dict[Union[str, Tuple[float, float]], Callable]] + The environment and local controller. + """ + kwargs = {} + + # get glob name + if 'pendulum' in name: + glob_name = 'pendulum' + elif 'cartpole' in name: + glob_name = 'cartpole' + kwargs.update({'limit_cart': limit_cart, 'reward_end': reward_end, 'pos_tol': pos_tol}) + elif 'point_mass' in name: + glob_name = 'point_mass' + elif 'hirl_point_fall' in name: + glob_name = 'hirl_point_fall' + if 'move_block_only' in name: + kwargs = {'move_block_only': True} + else: + glob_name = name + + if "sparse" in name: + kwargs.update({'sparse': True}) + + if "cartpole" in name: + kwargs.update({'task_name': name.split('-')[-1]}) + + # get env and control + env, dict_control = dict_fn[glob_name](orig_cwd=orig_cwd, + device=device, + **kwargs) + + return env, dict_control diff --git a/RLLG/envs/env_utils.py b/RLLG/envs/env_utils.py index 47ebb58a..93444a26 100644 --- a/RLLG/envs/env_utils.py +++ b/RLLG/envs/env_utils.py @@ -1,56 +1,115 @@ -# 2023.02.14-Changed for RLLG -# Huawei Technologies Co., Ltd. - -# Copyright (c) 2016 OpenAI (https://openai.com). - -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - -# This is taken from the gym repository - -import gym - - -# https://github.com/openai/gym/blob/master/gym/core.py -class NormalizedEnv(gym.ActionWrapper): - """ Normalize action space """ - - def __init__(self, env): - super(NormalizedEnv, self).__init__(env) - - def action(self, action): - act_k = (self.action_space.high - self.action_space.low) / 2. - act_b = (self.action_space.high + self.action_space.low) / 2. - return act_k * action + act_b - - def reverse_action(self, action): - act_k_inv = 2. 
/ (self.action_space.high - self.action_space.low) - act_b = (self.action_space.high + self.action_space.low) / 2. - return act_k_inv * (action - act_b) - - -class ForcedTimeLimit(gym.Wrapper): - - def __init__(self, env, max_episode_steps=None): - super().__init__(env) - if max_episode_steps is None and self.env.spec is not None: - max_episode_steps = env.spec.max_episode_steps - if self.env.spec is not None: - self.env.spec.max_episode_steps = max_episode_steps - self._max_episode_steps = max_episode_steps - self._elapsed_steps = None - - def step(self, action): - observation, reward, done, info = self.env.step(action) - done = False - self._elapsed_steps += 1 - if self._elapsed_steps >= self._max_episode_steps: - info["TimeLimit.truncated"] = not done - done = True - return observation, reward, done, info - - def reset(self, **kwargs): - self._elapsed_steps = 0 - return self.env.reset(**kwargs) +# 2023.02.14-Changed for RLLG +# Huawei Technologies Co., Ltd. + +# Copyright (c) 2016 OpenAI (https://openai.com). + +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# This is taken from the gym repository + +from typing import Any, Dict, List, Tuple +import gym +import numpy as np + + +# https://github.com/openai/gym/blob/master/gym/core.py +class NormalizedEnv(gym.ActionWrapper): + """ Normalize action space """ + + def __init__(self, env: gym.Env) -> None: + super(NormalizedEnv, self).__init__(env) + + def action(self, action: np.ndarray) -> np.ndarray: + """ + Normalize the action. + + Parameters: + ---------- + action : np.ndarray + The original action + + Returns: + ---------- + np.ndarray + The normalized action. + """ + act_k = (self.action_space.high - self.action_space.low) / 2. + act_b = (self.action_space.high + self.action_space.low) / 2. + return act_k * action + act_b + + def reverse_action(self, action: np.ndarray) -> np.ndarray: + """ + Reverse the normalized action. + + Parameters: + ---------- + action : np.ndarray + The normalized action. + + Returns: + ---------- + np.ndarray + The original action. + """ + act_k_inv = 2. / (self.action_space.high - self.action_space.low) + act_b = (self.action_space.high + self.action_space.low) / 2. + return act_k_inv * (action - act_b) + + +class ForcedTimeLimit(gym.Wrapper): + """ + A wrapper for enforcing a maximum number of steps in an episode. + + Parameters: + ---------- + env : gym.Env + The underlying environment. + max_episode_steps : Optional[int] + The maximum number of steps in an episode. + """ + + def __init__(self, env: gym.Env, max_episode_steps: int = None) -> None: + super().__init__(env) + if max_episode_steps is None and self.env.spec is not None: + max_episode_steps = env.spec.max_episode_steps + if self.env.spec is not None: + self.env.spec.max_episode_steps = max_episode_steps + self._max_episode_steps = max_episode_steps + self._elapsed_steps = None + + def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, Dict]: + """ + Run one timestep of the environment's dynamics, and only return done=True if max_time_steps has been reached. + + Parameters: + ---------- + action : np.ndarray + The action to be executed + + Returns: + ---------- + tuple + Observation, reward, done, and info. 
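# Sketch of how these wrappers compose (similar to create_hirl_point_fall.py in this
# patch). 'Pendulum-v1' is only a placeholder id; any environment using the older
# 4-tuple gym step API works here.
import gym
import numpy as np
from envs.env_utils import NormalizedEnv, ForcedTimeLimit

env = ForcedTimeLimit(NormalizedEnv(gym.make('Pendulum-v1')), max_episode_steps=1000)
obs = env.reset()
action = np.zeros(env.action_space.shape, dtype=np.float32)  # interpreted as a normalized action in [-1, 1]
obs, reward, done, info = env.step(action)                    # done is forced to False until step 1000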
+ """ + observation, reward, done, info = self.env.step(action) + done = False + self._elapsed_steps += 1 + if self._elapsed_steps >= self._max_episode_steps: + info["TimeLimit.truncated"] = not done + done = True + return observation, reward, done, info + + def reset(self, **kwargs): + """ + Reset the environment. + + Returns: + ---------- + np.ndarray + The initial observation. + """ + self._elapsed_steps = 0 + return self.env.reset(**kwargs) diff --git a/RLLG/envs/hirl_point_fall/confidence.py b/RLLG/envs/hirl_point_fall/confidence.py index 13bfacd9..46766ff1 100644 --- a/RLLG/envs/hirl_point_fall/confidence.py +++ b/RLLG/envs/hirl_point_fall/confidence.py @@ -1,30 +1,74 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - - -class LambdaS: - - def __init__(self, pos_tol=1.): - self.pos_tol = pos_tol - - def get_use_local(self, env, observation): - if int(observation[4]) == 1: - return 0 - return 1 - - -def hirl_point_fall_lambda_s(expert, - device="cpu", - pos_tol=None, - speed_tol=None, - smoothed=False - ): - return LambdaS() +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + +from typing import Union, Any, Dict, List, Optional, Tuple, Callable + + +class LambdaS: + """ + Class representing the confidence function. + + Parameters: + ---------- + pos_tol : float or None, optional + Position tolerance (default is 1.) + """ + + def __init__(self, pos_tol: float = 1.): + self.pos_tol = pos_tol + + def get_use_local(self, env: Any, observation: List) -> float: + """ + Get the lambda s value based on the environment and observation. + + Parameters: + ---------- + env : Any + The environment + observation : list of array + The observation. + + Returns: + ---------- + float + Use_local value (0 or 1). + """ + if int(observation[4]) == 1: + return 0 + return 1 + + +def hirl_point_fall_lambda_s(expert: Any, + device: str = "cpu", + pos_tol: float = None, + speed_tol: float = None, + smoothed: bool = None) -> LambdaS: + """ + Returns the confidence LambdaS instance for the point fall environment. + + Parameters: + ---------- + expert : Any + Expert (not used, but here in case the lambda_s depends on the expert). 
+ device : str, optional + Device for computation (default is 'cpu') + pos_tol : float or None, optional + Position tolerance (default is None) + speed_tol : float or None, optional + Speed tolerance (default is None) + smoothed : bool or None, optional + Whether to use smoothed lambda_s (default is None) + + Returns: + ---------- + LambdaS + The LambdaS instance + """ + return LambdaS() diff --git a/RLLG/envs/hirl_point_fall/create_hirl_point_fall.py b/RLLG/envs/hirl_point_fall/create_hirl_point_fall.py index 0ea8bd39..1af48274 100644 --- a/RLLG/envs/hirl_point_fall/create_hirl_point_fall.py +++ b/RLLG/envs/hirl_point_fall/create_hirl_point_fall.py @@ -1,37 +1,55 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - - -from envs.hirl_point_fall.point_fall import PointFallEnv -from envs.hirl_point_fall.local_expert_policy import SACExpert -from envs.hirl_point_fall.wrapper import ForcedTimeLimit -from envs.env_utils import NormalizedEnv -import os - - -def create_hirl_point_fall_and_control(move_block_only=False, - orig_cwd='./', - device="cpu"): - - init_env = PointFallEnv(move_block_only=move_block_only, scaling_factor=4, max_steps=1000) - env = ForcedTimeLimit(NormalizedEnv(init_env), max_episode_steps=1000) - - path = os.path.join(orig_cwd, 'envs', 'hirl_point_fall', "models") - - # create controller - control_dict = { - "MediumSAC": { - "coord": None, - "local_expert": SACExpert(env, path, device) - }, - } - - return env, control_dict +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + + + +from typing import Any, Tuple, Dict +from envs.hirl_point_fall.point_fall import PointFallEnv +from envs.hirl_point_fall.local_expert_policy import SACExpert +from envs.hirl_point_fall.wrapper import ForcedTimeLimit +from envs.env_utils import NormalizedEnv +import os + + +def create_hirl_point_fall_and_control(move_block_only: bool = False, + orig_cwd: str = './', + device: str = "cpu") -> Tuple[Any, Dict[str, Any]]: + """ + Create the Point Fall environment and its associated controller. + + Parameters: + ---------- + move_block_only : bool, optional + If True, move only the block; if False, move both the block and the robot (default is False) + orig_cwd : str, optional + Original current working directory (default is './') + device : str, optional + Device to run the environment on (default is "cpu") + + Returns: + ---------- + Tuple[Any, Dict[str, Any]] + Tuple containing the environment and the controller dictionary.
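+ + Example (an illustrative sketch; assumes the pretrained 'medium_sac' checkpoint is available under envs/hirl_point_fall/models): + >>> env, control_dict = create_hirl_point_fall_and_control(orig_cwd='./', device='cpu') + >>> local_expert = control_dict['MediumSAC']['local_expert'] + >>> action = local_expert.get_action(env.reset())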
+ """ + + init_env = PointFallEnv(move_block_only=move_block_only, scaling_factor=4, max_steps=1000) + env = ForcedTimeLimit(NormalizedEnv(init_env), max_episode_steps=1000) + + path = os.path.join(orig_cwd, 'envs', 'hirl_point_fall', "models") + + # create controller + control_dict = { + "MediumSAC": { + "coord": None, + "local_expert": SACExpert(env, path, device) + }, + } + + return env, control_dict diff --git a/RLLG/envs/hirl_point_fall/local_expert_policy.py b/RLLG/envs/hirl_point_fall/local_expert_policy.py index 94cd59a3..a834b3ab 100644 --- a/RLLG/envs/hirl_point_fall/local_expert_policy.py +++ b/RLLG/envs/hirl_point_fall/local_expert_policy.py @@ -1,47 +1,75 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - - -import numpy as np -import torch -import os - -class SACExpert: - - def __init__(self, env, path, device="cpu"): - - from agents.common.model import TanhGaussianPolicy, SamplerPolicy - # hyper-params - policy_arch = '32-32' - policy_log_std_multiplier = 1.0 - policy_log_std_offset = -1.0 - - # load expert policy - expert_policy = TanhGaussianPolicy( - env.observation_space.shape[0], - env.action_space.shape[0], - policy_arch, - log_std_multiplier=policy_log_std_multiplier, - log_std_offset=policy_log_std_offset, - ) - glob_path = os.path.join(path, 'medium_sac') - expert_policy.load_state_dict(torch.load(glob_path)) - expert_policy.to(device) - self.sampling_expert_policy = SamplerPolicy(expert_policy, device=device) - - def get_action(self, observation, init_action=None, env=None): - with torch.no_grad(): - expert_action = self.sampling_expert_policy( - np.expand_dims(observation, 0), deterministic=True - )[0, :] - # to further decrease performance - expert_action[0] *= 0.2 - return np.clip(expert_action, a_min=-0.99, a_max=0.99) # expert_action +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + +from typing import Any, Optional +import numpy as np +import torch +import os + + +class SACExpert: + """ + Soft Actor-Critic (SAC) Expert. + + Parameters: + ---------- + env : Any + The environment (usually dm control env, could be gym as well or others). + path : str + The path to the model. + device : str, optional + The device to run the expert policy (default is 'cpu'). 
+ """ + + def __init__(self, env: Any, path: str, device: Optional[str] = "cpu") -> None: + + from agents.common.model import TanhGaussianPolicy, SamplerPolicy + # hyper-params + policy_arch = '32-32' + policy_log_std_multiplier = 1.0 + policy_log_std_offset = -1.0 + + # load expert policy + expert_policy = TanhGaussianPolicy( + env.observation_space.shape[0], + env.action_space.shape[0], + policy_arch, + log_std_multiplier=policy_log_std_multiplier, + log_std_offset=policy_log_std_offset, + ) + glob_path = os.path.join(path, 'medium_sac') + expert_policy.load_state_dict(torch.load(glob_path)) + expert_policy.to(device) + self.sampling_expert_policy = SamplerPolicy(expert_policy, device=device) + + def get_action(self, observation: np.ndarray, init_action: Any = None, env: Any = None) -> np.ndarray: + """ + Get an action from the SAC expert policy. + + Parameters: + ---------- + observation : numpy.ndarray + The observation from the environment. + init_action : Any, optional + Initial action (default is None). + env : gym.Env, optional + The environment (default is None). + + Returns: + ---------- + numpy.ndarray + The clipped expert action. + """ + with torch.no_grad(): + expert_action = self.sampling_expert_policy( + np.expand_dims(observation, 0), deterministic=True + )[0, :] + # to further decrease performance + expert_action[0] *= 0.2 + return np.clip(expert_action, a_min=-0.99, a_max=0.99) # expert_action diff --git a/RLLG/envs/hirl_point_fall/wrapper.py b/RLLG/envs/hirl_point_fall/wrapper.py index b89f807c..8a5a62e1 100644 --- a/RLLG/envs/hirl_point_fall/wrapper.py +++ b/RLLG/envs/hirl_point_fall/wrapper.py @@ -1,37 +1,72 @@ -# 2023.02.14-Changed for RLLG -# Huawei Technologies Co., Ltd. - -# Copyright (c) 2016 OpenAI (https://openai.com). - -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - - -import gym - - -class ForcedTimeLimit(gym.Wrapper): - - def __init__(self, env, max_episode_steps=None): - super().__init__(env) - if max_episode_steps is None and self.env.spec is not None: - max_episode_steps = env.spec.max_episode_steps - if self.env.spec is not None: - self.env.spec.max_episode_steps = max_episode_steps - self._max_episode_steps = max_episode_steps - self._elapsed_steps = None - - def step(self, action): - observation, reward, done, info = self.env.step(action) - done = False - self._elapsed_steps += 1 - if self._elapsed_steps >= self._max_episode_steps: - info["TimeLimit.truncated"] = not done - done = True - return observation, reward, done, info - - def reset(self, **kwargs): - self._elapsed_steps = 0 - return self.env.reset(**kwargs) +# 2023.02.14-Changed for RLLG +# Huawei Technologies Co., Ltd. + +# Copyright (c) 2016 OpenAI (https://openai.com). + +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + + +import gym +import numpy as np +from typing import Any, Optional, Dict, Tuple + + +class ForcedTimeLimit(gym.Wrapper): + """ + A wrapper for enforcing a maximum number of steps in an episode. + + Parameters: + ---------- + env : gym.Env + The underlying environment. + max_episode_steps : Optional[int] + The maximum number of steps in an episode. + elapsed_steps : Optional[int] + The number of steps taken in the current episode. 
+ """ + + def __init__(self, env: gym.Env, max_episode_steps: Optional[int] = None) -> None: + super().__init__(env) + if max_episode_steps is None and self.env.spec is not None: + max_episode_steps = env.spec.max_episode_steps + if self.env.spec is not None: + self.env.spec.max_episode_steps = max_episode_steps + self._max_episode_steps = max_episode_steps + self._elapsed_steps = None + + def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, Dict]: + """ + Take a step in the environment. + + Parameters: + ---------- + action : Any + The action to be taken. + + Returns: + ---------- + Tuple + The observation, reward, done, and info. + """ + observation, reward, done, info = self.env.step(action) + done = False + self._elapsed_steps += 1 + if self._elapsed_steps >= self._max_episode_steps: + info["TimeLimit.truncated"] = not done + done = True + return observation, reward, done, info + + def reset(self, **kwargs: Any) -> Any: + """ + Reset the environment. + + Returns: + ---------- + Any + The initial observation. + """ + self._elapsed_steps = 0 + return self.env.reset(**kwargs) diff --git a/RLLG/envs/point_circle/confidence.py b/RLLG/envs/point_circle/confidence.py index 82a77c39..49475314 100644 --- a/RLLG/envs/point_circle/confidence.py +++ b/RLLG/envs/point_circle/confidence.py @@ -1,30 +1,75 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - -class LambdaS: - - def __init__(self, pos_tol=0.6): - self.pos_tol = pos_tol - - def get_use_local(self, env, observation): - x_pos, y_pos, z_pos = env.env.world.robot_pos() - if abs(x_pos) > self.pos_tol: - return 1 - return 0 - - -def point_circle_lambda_s(expert, - device="cpu", - pos_tol=None, - speed_tol=None, - smoothed=False - ): - return LambdaS(pos_tol=pos_tol) +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + +from typing import Union, Any, Dict, List, Optional, Tuple, Callable + + +class LambdaS: + """ + Class representing the confidence function. + + Parameters: + ---------- + pos_tol : float or None, optional + Position tolerance (default is 0.6) + """ + + def __init__(self, pos_tol: float =0.6): + self.pos_tol = pos_tol + + def get_use_local(self, env: Any, observation: List) -> float: + """ + Get the lambda s value based on the environment and observation. + + Parameters: + ---------- + env : Any + The environment + observation : list of array + The observation. + + Returns: + ---------- + float + Use_local value (0 or 1). + """ + x_pos, y_pos, z_pos = env.env.world.robot_pos() + if abs(x_pos) > self.pos_tol: + return 1 + return 0 + + +def point_circle_lambda_s(expert: Any, + device: str = "cpu", + pos_tol: float = None, + speed_tol: float = None, + smoothed: bool = None) -> LambdaS: + """ + Returns the confidence LambdaS instance for the point circle environment. 
+ + Parameters: + ---------- + expert : Any + Expert (not used, but here in case the lambda_s depends on the expert). + device : str, optional + Device for computation (default is 'cpu') + pos_tol : float or None, optional + Position tolerance (default is None) + speed_tol : float or None, optional + Speed tolerance (default is None) + smoothed : bool or None, optional + Whether to use smoothed lambda_s (default is None) + + Returns: + ---------- + LambdaS + The LambdaS instance + """ + return LambdaS(pos_tol=pos_tol) diff --git a/RLLG/envs/point_circle/create_point_circle.py b/RLLG/envs/point_circle/create_point_circle.py index b909c6d9..d4091062 100644 --- a/RLLG/envs/point_circle/create_point_circle.py +++ b/RLLG/envs/point_circle/create_point_circle.py @@ -1,44 +1,59 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - - -try: - from safety_gym.envs.engine import Engine -except ModuleNotFoundError: - pass -from envs.point_circle.point_circle import PointCircle -from envs.point_circle.local_expert_policy import SafeScripted -import os -from types import MethodType - - -def create_point_cirlce_and_control(orig_cwd='./', - device="cpu"): - config_dict = { - 'robot_base': 'xmls/point.xml', - 'task': 'circle', - 'observe_goal_lidar': False, - 'observe_box_lidar': False, - 'observe_circle': True, - 'lidar_max_dist': 6 - } - init_env = Engine(config=config_dict) - env = PointCircle(init_env) - - # create controller - control_dict = { - "SafeScripted": { - "coord": None, - "local_expert": SafeScripted() - }, - } - - return env, control_dict +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + + + +try: + from safety_gym.envs.engine import Engine +except ModuleNotFoundError: + pass +from typing import Any, Tuple, Dict +from envs.point_circle.point_circle import PointCircle +from envs.point_circle.local_expert_policy import SafeScripted +import os + + +def create_point_cirlce_and_control(orig_cwd: str ='./', + device: str ="cpu") -> Tuple[Any, Dict]: + """ + Create the Point Circle environment and its associated controller. + + Parameters: + ---------- + orig_cwd : str, optional + Original current working directory (default is './') + device : str, optional + Device to run the environment on (default is "cpu") + + Returns: + ---------- + Tuple[Any, Dict[str, Any]] + Tuple containing the environment and the controller dictionary. 
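+ + Example (an illustrative sketch; assumes safety_gym and its MuJoCo dependencies are installed): + >>> env, control_dict = create_point_cirlce_and_control(orig_cwd='./') + >>> scripted_expert = control_dict['SafeScripted']['local_expert'] + >>> action = scripted_expert.get_action(env.reset(), env=env)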
+ """ + config_dict = { + 'robot_base': 'xmls/point.xml', + 'task': 'circle', + 'observe_goal_lidar': False, + 'observe_box_lidar': False, + 'observe_circle': True, + 'lidar_max_dist': 6 + } + init_env = Engine(config=config_dict) + env = PointCircle(init_env) + + # create controller + control_dict = { + "SafeScripted": { + "coord": None, + "local_expert": SafeScripted() + }, + } + + return env, control_dict diff --git a/RLLG/envs/point_circle/local_expert_policy.py b/RLLG/envs/point_circle/local_expert_policy.py index 0e04c3a8..bb302e81 100644 --- a/RLLG/envs/point_circle/local_expert_policy.py +++ b/RLLG/envs/point_circle/local_expert_policy.py @@ -1,50 +1,71 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - - -import numpy as np -import torch -import os - - -class SafeScripted: - - def __init__(self): - pass - - def get_action(self, observation, init_action=None, env=None): - x_pos, y_pos, z_pos = env.env.world.robot_pos() - rot_mat = env.env.world.robot_mat() - theta = np.arctan2(-rot_mat[0, 1], rot_mat[0, 0]) - if x_pos > 0: - if abs(theta) >= 3 * np.pi / 4: - if y_pos > 0 and theta > 0: - return np.array([0.999, 0.5]) - elif y_pos < 0 and theta < 0: - return np.array([0.999, -0.5]) - else: - return np.array([0.999, 0]) - elif theta < 0: - return np.array([-0.999, -0.999]) - else: - return np.array([-0.999, 0.999]) - else: - if abs(theta) <= np.pi / 4: - if y_pos > 0 and theta > 0: - return np.array([0.999, -0.5]) - elif y_pos < 0 and theta < 0: - return np.array([0.999, 0.5]) - else: - return np.array([0.999, 0]) - elif theta < 0: - return np.array([-0.999, 0.999]) - else: - return np.array([-0.999, -0.999]) +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + + +from typing import Any, Optional +import numpy as np +import torch +import os + + +class SafeScripted: + """ + SafeScripted class for scripted control. + """ + + def __init__(self) -> None: + pass + + def get_action(self, observation: np.ndarray, init_action: Optional[Any] = None, env: Optional[Any] = None)\ + -> np.ndarray: + """ + Get the action for scripted control. + + Parameters: + ---------- + observation : Any + The observation. + init_action : Any, optional + The initial action (default is None). + env : Any, optional + The environment object (default is None). + + Returns: + ---------- + np.ndarray + The scripted action. 
+ """ + x_pos, y_pos, z_pos = env.env.world.robot_pos() + rot_mat = env.env.world.robot_mat() + theta = np.arctan2(-rot_mat[0, 1], rot_mat[0, 0]) + if x_pos > 0: + if abs(theta) >= 3 * np.pi / 4: + if y_pos > 0 and theta > 0: + return np.array([0.999, 0.5]) + elif y_pos < 0 and theta < 0: + return np.array([0.999, -0.5]) + else: + return np.array([0.999, 0]) + elif theta < 0: + return np.array([-0.999, -0.999]) + else: + return np.array([-0.999, 0.999]) + else: + if abs(theta) <= np.pi / 4: + if y_pos > 0 and theta > 0: + return np.array([0.999, -0.5]) + elif y_pos < 0 and theta < 0: + return np.array([0.999, 0.5]) + else: + return np.array([0.999, 0]) + elif theta < 0: + return np.array([-0.999, 0.999]) + else: + return np.array([-0.999, -0.999]) diff --git a/RLLG/envs/point_circle/point_circle.py b/RLLG/envs/point_circle/point_circle.py index aef99a6d..90bbd8a5 100644 --- a/RLLG/envs/point_circle/point_circle.py +++ b/RLLG/envs/point_circle/point_circle.py @@ -1,31 +1,75 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - - -class PointCircle: - - def __init__(self, env): - self.env = env - self.observation_space = self.env.observation_space - self.action_space = self.env.action_space - - def step(self, action): - obs, reward, done, info = self.env.step(action) - if info['cost'] >= 0.5: - reward = -1000 - done = True - return obs, reward, done, info - - def render(self, mode="human"): - return self.env.render(mode) - - def reset(self): - return self.env.reset() +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + +from typing import Any, Tuple, Dict, Optional +import numpy as np + + +class PointCircle: + """ + Wrapper for the safe PointCircke environment to change the constraint function into a bad reward. + + Parameters: + ---------- + env : Any + The environment to wrap. + """ + + def __init__(self, env: Any) -> None: + self.env = env + self.observation_space = self.env.observation_space + self.action_space = self.env.action_space + + def step(self, action: np.ndarray) -> Tuple[Any, float, bool, Dict]: + """ + Step through the environment dynamics and change reward function. + + Parameters: + ---------- + action : Any + The action to be executed. + + Returns: + ---------- + tuple + Observation, reward, done, and info. + """ + obs, reward, done, info = self.env.step(action) + if info['cost'] >= 0.5: + reward = -1000 + done = True + return obs, reward, done, info + + def render(self, mode: Optional[str] = "human") -> Any: + """ + Render the environment. + + Parameters: + ---------- + mode : str, optional + Rendering mode (default is "human"). + + Returns: + ---------- + Any + The rendering output. + """ + return self.env.render(mode) + + def reset(self) -> np.ndarray: + """ + Reset the environment. + + Returns: + ---------- + Any + The reset observation. 
+ """ + return self.env.reset() diff --git a/RLLG/envs/point_mass/confidence.py b/RLLG/envs/point_mass/confidence.py index 72023509..16499be9 100644 --- a/RLLG/envs/point_mass/confidence.py +++ b/RLLG/envs/point_mass/confidence.py @@ -1,37 +1,83 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - -import numpy as np -from dm_control.utils import rewards - - -class LambdaS: - - def __init__(self, pos_tol=None, speed_tol=None): - self.pos_tol = pos_tol - self.speed_tol = speed_tol - - - def get_use_local(self, env, observation): - # check if inside big target or not - target_size = 0.1 # env.env.physics.named.model.geom_size['target', 0] - inside_big_goal = rewards.tolerance(env.env.physics.mass_to_target_dist(), - bounds=(0, target_size)) - if inside_big_goal: - return 0 - return 1 - -def point_mass_lambda_s(expert, - device="cpu", - pos_tol=None, - speed_tol=None, - smoothed=None): - return LambdaS() +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + +from typing import Union, Any, Dict, List, Optional, Tuple, Callable +import numpy as np +from dm_control.utils import rewards + + +class LambdaS: + """ + Class representing the confidence function. + + Parameters: + ---------- + pos_tol : float or None, optional + Position tolerance (default is None) + speed_tol : float or None, optional + Speed tolerance (default is None) + """ + + def __init__(self, pos_tol: float = None, speed_tol: float = None): + self.pos_tol = pos_tol + self.speed_tol = speed_tol + + + def get_use_local(self, env: Any, observation: List) -> float: + """ + Get the lambda s value based on the environment and observation. + + Parameters: + ---------- + env : Any + The environment + observation : list of array + The observation. + + Returns: + ---------- + float + Use_local value (0 or 1). + """ + # check if inside big target or not + target_size = 0.1 # env.env.physics.named.model.geom_size['target', 0] + inside_big_goal = rewards.tolerance(env.env.physics.mass_to_target_dist(), + bounds=(0, target_size)) + if inside_big_goal: + return 0 + return 1 + +def point_mass_lambda_s(expert: Any, + device: str = "cpu", + pos_tol: float = None, + speed_tol: float = None, + smoothed: bool = None) -> LambdaS: + """ + Returns the confidence LambdaS instance for the point mass environment. + + Parameters: + ---------- + expert : Any + Expert (not used, but here in case the lambda_s depends on the expert). 
+ device : str, optional + Device for computation (default is 'cpu') + pos_tol : float or None, optional + Position tolerance (default is None) + speed_tol : float or None, optional + Speed tolerance (default is None) + smoothed : bool or None, optional + Whether to use smoothed lambda_s (default is None) + + Returns: + ---------- + LambdaS + The LambdaS instance + """ + return LambdaS() diff --git a/RLLG/envs/point_mass/create_point_mass.py b/RLLG/envs/point_mass/create_point_mass.py index 8a98c1c1..67cfb302 100644 --- a/RLLG/envs/point_mass/create_point_mass.py +++ b/RLLG/envs/point_mass/create_point_mass.py @@ -1,55 +1,86 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - - -import dmc2gym -from envs.point_mass.local_expert_policy import SACExpert -import os -from types import MethodType -from dm_control.utils import rewards - - -# modify initialization -def new_get_reward(self, physics): - """Returns a reward to the agent.""" - target_size = physics.named.model.geom_size['target', 0] - near_target = rewards.tolerance(physics.mass_to_target_dist(), - bounds=(0, target_size)) - control_reward = rewards.tolerance(physics.control(), margin=1, - value_at_margin=0, - sigmoid='quadratic').mean() - small_control = (control_reward + 4) / 5 - return near_target * small_control - - -def create_point_mass_and_control(orig_cwd='./', - device="cpu", - sparse=False): - # create env - env = dmc2gym.make('point_mass', 'easy') - - # modify target (to create simple task) - if sparse: - env.env._env._task.get_reward = MethodType(new_get_reward, env.env._env._task) - - # env.env._env.physics.named.model.geom_size['target', 0] = 0.1 - # env.env._env._task._target_size = 0.1 - - path = os.path.join(orig_cwd, 'envs', 'point_mass', 'models') - - control_dict = { - "MediumSAC": { - "coord": None, - "local_expert": SACExpert(env, path, device) - }, - } - +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. + +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + + +from typing import Any, Tuple, Dict +import dmc2gym +from envs.point_mass.local_expert_policy import SACExpert +import os +from types import MethodType +from dm_control.utils import rewards + + +# modify initialization +def new_get_reward(self: Any, physics: Any) -> float: + """ + Returns a reward to the agent. + + Parameters: + ---------- + self : Any + Instance of the environment task + physics : Any + Physics object representing the state of the environment + + Returns: + ---------- + float + Computed reward for the agent. 
+ """ + target_size = physics.named.model.geom_size['target', 0] + near_target = rewards.tolerance(physics.mass_to_target_dist(), + bounds=(0, target_size)) + control_reward = rewards.tolerance(physics.control(), margin=1, + value_at_margin=0, + sigmoid='quadratic').mean() + small_control = (control_reward + 4) / 5 + return near_target * small_control + + +def create_point_mass_and_control(orig_cwd: str = './', + device: str = "cpu", + sparse: bool = False) -> Tuple[Any, Dict[str, Any]]: + """ + Create the Point Mass environment and its associated controller. + + Parameters: + ---------- + orig_cwd : str, optional + Original current working directory (default is './') + device : str, optional + Device to run the environment on (default is "cpu") + sparse : bool, optional + Flag indicating whether to use sparse rewards (default is False) + + Returns: + ---------- + Tuple[Any, Dict[str, Any]] + Tuple containing the environment and the controller dictionary + """ + # create env + env = dmc2gym.make('point_mass', 'easy') + + # modify target (to create simple task) + if sparse: + env.env._env._task.get_reward = MethodType(new_get_reward, env.env._env._task) + + # env.env._env.physics.named.model.geom_size['target', 0] = 0.1 + # env.env._env._task._target_size = 0.1 + + path = os.path.join(orig_cwd, 'envs', 'point_mass', 'models') + + control_dict = { + "MediumSAC": { + "coord": None, + "local_expert": SACExpert(env, path, device) + }, + } + return env, control_dict \ No newline at end of file diff --git a/RLLG/envs/point_mass/local_expert_policy.py b/RLLG/envs/point_mass/local_expert_policy.py index 9aa58a0f..71eb0dd1 100644 --- a/RLLG/envs/point_mass/local_expert_policy.py +++ b/RLLG/envs/point_mass/local_expert_policy.py @@ -1,46 +1,75 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. - - - - -import numpy as np -import torch -import os - -class SACExpert: - - def __init__(self, env, path, device="cpu"): - - from agents.common.model import TanhGaussianPolicy, SamplerPolicy - # hyper-params - policy_arch = '64-64' - policy_log_std_multiplier = 1.0 - policy_log_std_offset = -1.0 - - # load expert policy - expert_policy = TanhGaussianPolicy( - env.observation_space.shape[0], - env.action_space.shape[0], - policy_arch, - log_std_multiplier=policy_log_std_multiplier, - log_std_offset=policy_log_std_offset, - ) - glob_path = os.path.join(path, 'medium_expert_sac') - - expert_policy.load_state_dict(torch.load(glob_path)) - expert_policy.to(device) - self.sampling_expert_policy = SamplerPolicy(expert_policy, device=device) - - def get_action(self, observation, init_action=None, env=None): - with torch.no_grad(): - expert_action = self.sampling_expert_policy( - np.expand_dims(observation, 0), deterministic=True - )[0, :] - return np.clip(expert_action, a_min=-0.99, a_max=0.99) # expert_action +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. 
+ +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + + +from typing import Any, Optional +import numpy as np +import torch +import os + + +class SACExpert: + """ + Soft Actor-Critic (SAC) Expert. + + Parameters: + ---------- + env : Any + The environment (usually dm control env, could be gym as well or others). + path : str + The path to the model. + device : str, optional + The device to run the expert policy (default is 'cpu'). + """ + + def __init__(self, env: Any, path: str, device: Optional[str] = "cpu") -> None: + from agents.common.model import TanhGaussianPolicy, SamplerPolicy + # hyper-params + policy_arch = '64-64' + policy_log_std_multiplier = 1.0 + policy_log_std_offset = -1.0 + + # load expert policy + expert_policy = TanhGaussianPolicy( + env.observation_space.shape[0], + env.action_space.shape[0], + policy_arch, + log_std_multiplier=policy_log_std_multiplier, + log_std_offset=policy_log_std_offset, + ) + glob_path = os.path.join(path, 'medium_expert_sac') + + expert_policy.load_state_dict(torch.load(glob_path)) + expert_policy.to(device) + self.sampling_expert_policy = SamplerPolicy(expert_policy, device=device) + + def get_action(self, observation: np.ndarray, init_action: Any = None, env: Any = None) -> np.ndarray: + """ + Get an action from the SAC expert policy. + + Parameters: + ---------- + observation : numpy.ndarray + The observation from the environment. + init_action : Any, optional + Initial action (default is None). + env : gym.Env, optional + The environment (default is None). + + Returns: + ---------- + numpy.ndarray + The clipped expert action. + """ + with torch.no_grad(): + expert_action = self.sampling_expert_policy( + np.expand_dims(observation, 0), deterministic=True + )[0, :] + return np.clip(expert_action, a_min=-0.99, a_max=0.99) # expert_action diff --git a/RLLG/main.py b/RLLG/main.py index eb2c50fa..da6c283e 100644 --- a/RLLG/main.py +++ b/RLLG/main.py @@ -1,129 +1,131 @@ -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. - -# This program is free software; you can redistribute it and/or modify it under -# the terms of the MIT license. - -# This program is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. See the MIT License for more details. 
- - -import os -import numpy as np -import yaml -from ray import tune -from ray.tune import run - -from agents.common.utils import get_global_name, get_global_agent_name -from sac_main_fn import main as sac_main - -os.environ["Timer"] = '1' - - -def trial_name(trial, hp_to_write): - ti = 'repeat_run' - identifier = ','.join([f'{hp}={trial.config[hp]}' for hp in hp_to_write]) + \ - f',trial={trial.config[ti]},id={trial.trial_id}' - return identifier - - -if __name__ == '__main__': - - envs = ['ball_in_cup'] - agents = [ - 'SAC', - # 'SAG', - # 'PIG', - # 'PAG', - ] - nb_local_experts = 'simple' - - for env in envs: - - glob_name = get_global_name(env) - - with open(os.path.join(os.getcwd(), 'ray_config', f'{glob_name}_cfg.yaml')) as f: - config = yaml.safe_load(f) - np.random.seed(config['seed']) - del config['seed'] - - config['orig_cwd'] = os.getcwd() - config['env'] = env - config['glob_name'] = glob_name - config['device'] = 'cpu' - - # get some hyperparms and remove them from dict - expert_names = config['local_experts'] - del config['local_experts'] - dict_pos_tol = None - if 'pos_tol' in config: - dict_pos_tol = config['pos_tol'] - del config['pos_tol'] - dict_beta = config['beta'] - dict_delta = config['delta'] - dict_phi = config['phi'] - del config['beta'] - del config['delta'] - del config['phi'] - decay_parameter_list = config['decay_parameter'] - del config['decay_parameter'] - - for expert in expert_names: - - config['expert'] = expert - - for agent_name in agents: - - # agent name - glob_agent_name = get_global_agent_name(agent_name) - config['agent_name'] = agent_name - - # get hyperparameters - if dict_pos_tol is not None: - config['pos_tol'] = dict_pos_tol[agent_name] - config['beta'] = dict_beta[agent_name] - config['delta'] = dict_delta[agent_name] - config['phi'] = dict_phi[agent_name] - - # decay or not - agent_name_to_show = agent_name - if agent_name in ['SAC', 'SAG', 'NaiveSAG']: - decay_parameter_list = [ False ] - - for decay_parameter in decay_parameter_list: - - config['decay_parameter'] = decay_parameter - if decay_parameter: - agent_name_to_show = 'Decreased' + agent_name_to_show - else: - # to avoid unecessary runs - config['delta'] = [ 1 ] - - # ray preparation - hps = [k for k, v in config.items() if type(v) is list] - config_ray = config.copy() - config_ray = {k: tune.grid_search(v) if type(v) is list else v for k, v in config.items()} - config_ray['repeat_run'] = tune.grid_search(list(range(config['repeat_run']))) - metric_columns = ['epoch', 'average_return', 'mean_avg_return', 'epoch_time'] - reporter = tune.CLIReporter(parameter_columns=hps, metric_columns=metric_columns) - - env_name_folder = env - - if agent_name in ['SAC']: - save_path = f'./ray_results_test/{env_name_folder}/{agent_name_to_show}' - else: - save_path = f'./ray_results_test/{env_name_folder}/{agent_name_to_show}/{expert}' - - analysis = run( - sac_main, - config=config_ray, - metric=config_ray['metric'], - mode=config_ray['mode'], - resources_per_trial={"cpu": 1, "gpu": 1 if config_ray['device'] == 'cuda' else 0}, - max_concurrent_trials=15, - log_to_file=True, - local_dir=save_path, - trial_name_creator=lambda t: trial_name(t, hps), - trial_dirname_creator=lambda t: trial_name(t, hps), - progress_reporter=reporter, - verbose=1) # resume=True, +# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + +# This program is free software; you can redistribute it and/or modify it under +# the terms of the MIT license. 
+ +# This program is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the MIT License for more details. + + +from typing import List +import os +import numpy as np +import yaml +from ray import tune +from ray.tune import run, Experiment + +from agents.common.utils import get_global_name, get_global_agent_name +from agents.common.config import process_glob_config, process_config_per_agent +from sac_main_fn import main as sac_main + +os.environ["Timer"] = '1' + + +def trial_name(trial: Experiment, hp_to_write: List[str]) -> str: + """ + Generate a unique identifier for a trial based on specified hyperparameters and trial information. + + Parameters: + ---------- + trial : ray.tune.Experiment + The Ray Tune Experiment for which to generate the identifier. + hp_to_write : List[str] + List of hyperparameter names to include in the identifier. + + Returns: + ---------- + str + The generated trial identifier. + """ + ti = 'repeat_run' + identifier = ','.join([f'{hp}={trial.config[hp]}' for hp in hp_to_write]) + \ + f',trial={trial.config[ti]},id={trial.trial_id}' + return identifier + + +if __name__ == '__main__': + + envs = ['ball_in_cup'] + agents = [ + 'SAC', + 'SAG', + 'PIG', + 'PAG', + ] + + for env in envs: + # get global name to retrieve configs + glob_name = get_global_name(env) + + # retrieve config + with open(os.path.join(os.getcwd(), 'ray_config', f'{glob_name}_cfg.yaml')) as f: + config = yaml.safe_load(f) + np.random.seed(config['seed']) + del config['seed'] + + # add important elements to the config file + config['orig_cwd'] = os.getcwd() + config['env'] = env + config['glob_name'] = glob_name + config['device'] = 'cpu' + + # process config and retrieve elements for loops + expert_names, dict_pos_tol, dict_beta, dict_delta, dict_phi, decay_parameter_list = process_glob_config(config) + + # loop over experts (if multiple experts) + for expert in expert_names: + + config['expert'] = expert + + # loop over agents (if multiple agents) + for agent_name in agents: + + # agent name + glob_agent_name = get_global_agent_name(agent_name) + config['agent_name'] = agent_name + + # further process hyperparameters to make them dependent on agent + process_config_per_agent(config, agent_name, dict_beta, dict_delta, dict_phi, dict_pos_tol) + + # decay or not (only relevant for PAG) + agent_name_to_show = agent_name + if agent_name in ['SAC', 'SAG', 'NaiveSAG']: + decay_parameter_list = [False] + + for decay_parameter in decay_parameter_list: + + # to avoid unnecessary runs + config['decay_parameter'] = decay_parameter + if decay_parameter: + agent_name_to_show = 'Decreased' + agent_name_to_show + else: + # to avoid unnecessary runs + config['delta'] = [1] + + # ray preparation + hps = [k for k, v in config.items() if type(v) is list] + config_ray = config.copy() + config_ray = {k: tune.grid_search(v) if type(v) is list else v for k, v in config.items()} + config_ray['repeat_run'] = tune.grid_search(list(range(config['repeat_run']))) + metric_columns = ['epoch', 'average_return', 'mean_avg_return', 'epoch_time'] + reporter = tune.CLIReporter(parameter_columns=hps, metric_columns=metric_columns) + env_name_folder = env + if agent_name in ['SAC']: + save_path = f'./ray_results_test/{env_name_folder}/{agent_name_to_show}' + else: + save_path = f'./ray_results_test/{env_name_folder}/{agent_name_to_show}/{expert}' + + analysis = run( + sac_main, + config=config_ray, +
metric=config_ray['metric'], + mode=config_ray['mode'], + resources_per_trial={"cpu": 1, "gpu": 1 if config_ray['device'] == 'cuda' else 0}, + max_concurrent_trials=15, + log_to_file=True, + local_dir=save_path, + trial_name_creator=lambda t: trial_name(t, hps), + trial_dirname_creator=lambda t: trial_name(t, hps), + progress_reporter=reporter, + verbose=1) # resume=True, diff --git a/RLLG/notebooks/Visualization.ipynb b/RLLG/notebooks/Visualization.ipynb index b124a92c..1c4fce5f 100644 --- a/RLLG/notebooks/Visualization.ipynb +++ b/RLLG/notebooks/Visualization.ipynb @@ -1,155 +1,155 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "from IPython.core.display import display, HTML\n", - "display(HTML(\"\"))" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "from ray.tune import ExperimentAnalysis\n", - "\n", - "import sys\n", - "sys.path.append('../')\n", - "\n", - "from helpers import plot_all\n", - "\n", - "import warnings\n", - "warnings.filterwarnings('ignore')" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Import results" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "env = \"ball_in_cup\"\n", - "\n", - "agents = [\n", - " \"SAC\",\n", - " \"SAG\",\n", - " \"PIG\",\n", - " \"PAG\"\n", - "]\n", - "\n", - "experts = [\n", - " \"MediumSAC\",\n", - "]\n", - "\n", - "hps = [\n", - " \"activation_fn\",\n", - " \"betas\",\n", - " \"decay_rate\"\n", - "]\n", - "\n", - "metric = \"mean_avg_return\"\n", - "mode = \"max\"\n", - "n_epochs = 2000" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "plt.figure(figsize=(8, 6))\n", - "\n", - "plot_all(env, \n", - " agents, \n", - " experts, \n", - " rolling_mean=0.05,\n", - " init_path=\"..\", \n", - " hps=hps, \n", - " set_hyperparam={'pos_tol': 1.7},\n", - " chosen_max=2000,\n", - " n_epochs=2000,\n", - " metric=\"mean_avg_return\", \n", - " mode=\"max\", \n", - " to_plot=\"final\")\n", - "\n", - "plt.xlabel(\"epochs (one epoch = one episode = 1000 steps)\")\n", - "plt.ylabel(\"Average return over 5 seeds\")\n", - "plt.title(\"Ball in Cup\")\n", - "\n", - "plt.show()" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - } - }, - "nbformat": 4, - "nbformat_minor": 1 +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "from IPython.core.display import display, HTML\n", + "display(HTML(\"\"))" + ], + "metadata": { + "collapsed": false, + "pycharm": { + 
"name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from ray.tune import ExperimentAnalysis\n", + "\n", + "import sys\n", + "sys.path.append('../')\n", + "\n", + "from helpers import plot_all\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Import results" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "env = \"ball_in_cup\"\n", + "\n", + "agents = [\n", + " \"SAC\",\n", + " \"SAG\",\n", + " \"PIG\",\n", + " \"PAG\"\n", + "]\n", + "\n", + "experts = [\n", + " \"MediumSAC\",\n", + "]\n", + "\n", + "hps = [\n", + " \"activation_fn\",\n", + " \"betas\",\n", + " \"decay_rate\"\n", + "]\n", + "\n", + "metric = \"mean_avg_return\"\n", + "mode = \"max\"\n", + "n_epochs = 2000" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "plt.figure(figsize=(8, 6))\n", + "\n", + "plot_all(env, \n", + " agents, \n", + " experts, \n", + " rolling_mean=0.05,\n", + " init_path=\"..\", \n", + " hps=hps, \n", + " set_hyperparam={'pos_tol': 1.7},\n", + " chosen_max=2000,\n", + " n_epochs=2000,\n", + " metric=\"mean_avg_return\", \n", + " mode=\"max\", \n", + " to_plot=\"final\")\n", + "\n", + "plt.xlabel(\"epochs (one epoch = one episode = 1000 steps)\")\n", + "plt.ylabel(\"Average return over 5 seeds\")\n", + "plt.title(\"Ball in Cup\")\n", + "\n", + "plt.show()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 } \ No newline at end of file diff --git a/RLLG/notebooks/helpers.py b/RLLG/notebooks/helpers.py index efbbff31..fad973a6 100644 --- a/RLLG/notebooks/helpers.py +++ b/RLLG/notebooks/helpers.py @@ -4,28 +4,53 @@ import matplotlib.pyplot as plt from ray.tune import ExperimentAnalysis from scipy.integrate import simps +from typing import Union, Any, Dict, List, Optional, Tuple -def plot_curves(analysis, - hps, - metric, - rolling_mean=0.6, - set_hyperparam={}, - hyperparam_comparison=None, - to_plot="final", - label="SAC", - chosen_max=1000, - n_epochs=2000, - retrieve_auc=False): +def plot_curves(analysis: tune.ray.ExperimentAnalysis, + hps: List[str], + metric: str, + rolling_mean: float = 0.6, + set_hyperparam: Dict[str, Any] = {}, + hyperparam_comparison: Optional[str] = None, + to_plot: Optional[str] = "final", + label: Optional[str] = "SAC", + chosen_max: Optional[int] = 1000, + n_epochs: Optional[int] = 2000, + retrieve_auc: Optional[bool] = False) -> Optional[Dict[str, Union[float, float]]]: + """ - analysis: - tune.ray.ExperimentAnalysis - hps: 
hyperparams to choose - list - metric: - str - to_plot: to plot best final mean or best overall - str: choose between final and overall + Plot learning curves based on the specified analysis, hyperparameters, and metric. + + Parameters: + ---------- + analysis : tune.ray.ExperimentAnalysis + The ray tune analysis object containing information about the experiment + hps : List[str] + List of hyperparameters to choose for plotting + metric : str + The metric to be plotted + rolling_mean : float, optional + The alpha value for exponential weighted moving average (default is 0.6). + set_hyperparam : Dict[str, Any], optional + Dictionary specifying hyperparameters and their values to set during plotting (default is an empty dictionary). + hyperparam_comparison : str, optional + String specifying the hyperparameters to compare during plotting (default is None). + to_plot : str, optional + String specifying whether to plot the best final mean or the best overall (default is "final"). + label : str, optional + Label for the plot (default is "SAC"). + chosen_max : int, optional + The chosen maximum value for the metric (default is 1000). + n_epochs : int, optional + The number of epochs for plotting (default is 2000). + retrieve_auc : bool, optional + Boolean indicating whether to retrieve the area under the curve (AUC) and final performance (default is False). + + Returns: + ---------- + Optional[Dict[str, Union[float, float]]] + A dictionary containing AUC and final performance if retrieve_auc is True, otherwise None. """ group_by = [f'config/{hp}' for hp in hps if hp != 'repeat_run'] + ['epoch'] dfs = analysis.trial_dataframes @@ -154,37 +179,59 @@ def plot_curves(analysis, } -def plot_all(env, - agents, - experts, - rolling_mean=0.6, - set_hyperparam={}, - hyperparam_comparison=None, - init_path="..", - hps=['betas'], - metric="mean_avg_return", - mode="max", - to_plot="final", - chosen_max=1000, - n_epochs=2000, - retrieve_auc=False): +def plot_all(env: str, + agents: List[str], + experts: List[str], + rolling_mean: float = 0.6, + set_hyperparam: Dict[str, Any] = {}, + hyperparam_comparison: Optional[str] = None, + init_path: Optional[str] = "..", + hps: Optional[List[str]] = ['betas'], + metric: Optional[str] = "mean_avg_return", + mode: Optional[str] = "max", + to_plot: Optional[str] = "final", + chosen_max: Optional[int] = 1000, + n_epochs: Optional[int] = 2000, + retrieve_auc: Optional[bool] = False) -> Optional[Dict[str, Union[float, float]]]: """ - env: - str - agents: - list of str - init_path: - str - hps: hyperparams to choose - list - metric: - str - mode: - str - to_plot: to plot best final mean or best overall - str: choose between final and overall - n_epochs: - int + Plot learning curves for multiple agents and experts based on the specified environment. 
+ + Parameters: + ---------- + env : str + The environment for which learning curves will be plotted + agents : List[str] + List of agent names to be included in the plot + experts : List[str] + List of expert names to be included in the plot + rolling_mean : float, optional + The alpha value for exponential weighted moving average (default is 0.6) + set_hyperparam : Dict[str, Any], optional + Dictionary specifying hyperparameters and their values to set during plotting (default is an empty dictionary) + hyperparam_comparison : str, optional + String specifying the hyperparameters to compare during plotting (default is None) + init_path : str, optional + The initial path where ray_results are stored (default is "..") + hps : List[str], optional + List of hyperparameters to choose for plotting (default is ['betas']) + metric : str, optional + The metric to be plotted (default is "mean_avg_return") + mode : str, optional + The mode for selecting the best values (default is "max") + to_plot : str, optional + String specifying whether to plot the best final mean or the best overall (default is "final") + chosen_max : int, optional + The chosen maximum value for the metric (default is 1000) + n_epochs : int, optional + The number of epochs for plotting (default is 2000) + retrieve_auc : bool, optional + Boolean indicating whether to retrieve the area under the curve (AUC) and final performance (default is False) + + Returns: + ---------- + Optional[Dict[str, Union[float, float]]] + A dictionary containing AUC and final performance for each agent-expert combination if retrieve_auc is True, + otherwise None. """ assert to_plot in ["overall", "final"] diff --git a/RLLG/notebooks/video_fn.py b/RLLG/notebooks/video_fn.py index 764c661a..1e307adb 100644 --- a/RLLG/notebooks/video_fn.py +++ b/RLLG/notebooks/video_fn.py @@ -6,9 +6,25 @@ from IPython import display from dm_control.utils import rewards as rewards_fn from dmc2gym.wrappers import _flatten_obs +from typing import Union, Any, Dict, List, Optional, Tuple -def grabFrame(env): +def grabFrame(env: Any) -> np.ndarray: + """ + Capture and return a frame from the dm_control environment rendering. + + Parameters: + ---------- + env : dm_control suite env + The dm control suite environment + + Returns: + ---------- + np.ndarray + A NumPy array representing the RGB frame captured from the environment rendering + + """ # Get RGB rendering of env rgbArr = env.physics.render(480, 600, camera_id=0) # Convert to BGR for use with OpenCV @@ -16,7 +32,23 @@ def grabFrame(env): # Use 'jpeg' instead of 'png' (~5 times faster) -def array_to_image(a, fmt='jpeg'): +def array_to_image(a: np.ndarray, fmt: Optional[str] = 'jpeg') -> display.Image: + """ + Convert a NumPy array to an image and display it using IPython's display module. + + Parameters: + ---------- + a : numpy.ndarray + The input NumPy array representing an image + fmt : str, optional + The image format to use (default is 'jpeg') + + Returns: + ---------- + IPython.display.Image + An IPython Image object representing the displayed image + + """ # Create binary stream object f = BytesIO() @@ -25,8 +57,32 @@ def array_to_image(a, fmt='jpeg'): return display.Image(data=f.getvalue()) +def create_dm_video(env: Any, + policy: str = "random", + verbose: int = 0, + video_name: str = "video.mp4", + not_plot: bool = False) -> None: + """ + Create a video of an episode in a dm_control environment.
-def create_dm_video(env, policy="random", verbose=0, video_name="video.mp4", not_plot=False):
+
+    Parameters:
+    ----------
+    env : Any
+        The dm_control environment
+    policy : str or callable, optional
+        The policy used to generate actions. If "random", random actions are used.
+        If a callable, it should take an observation and return an action.
+    verbose : int, optional
+        Verbosity level. If greater than 0, print additional information during video creation.
+    video_name : str, optional
+        The name of the output video file (default is "video.mp4").
+    not_plot : bool, optional
+        If True, do not plot the video while it is being created (default is False).
+
+    Returns:
+    ----------
+    None
+    """
     frame = grabFrame(env)
     height, width, layers = frame.shape
     if not not_plot:
@@ -77,7 +133,23 @@ def create_dm_video(env, policy="random", verbose=0, video_name="video.mp4", not
     video.release()
 
 
-def plot_video(d, d2, video_name="video.mp4"):
+def plot_video(d: Dict, d2: Dict, video_name: str = "video.mp4") -> None:
+    """
+    Plot a video using d and d2 for display.
+
+    Parameters:
+    ----------
+    d : Dict
+        The dictionary used to update the video frames.
+    d2 : Dict
+        The dictionary used to update the display with additional information (e.g., FPS).
+    video_name : str, optional
+        The name of the input video file (default is "video.mp4").
+
+    Returns:
+    ----------
+    None
+    """
     cap = cv2.VideoCapture(video_name)
     while (cap.isOpened()):
         t1 = time.time()
@@ -93,13 +165,38 @@ def plot_video(d, d2, video_name="video.mp4"):
     cap.release()
 
 
-def plot_total_video(env, d, d2, policy="random", verbose=0, video_name="video.mp4", not_plot=False):
-    """
-    Note this function requires d and d2. They must be created with the following in the Jupyter Notebook:
-    >>> d = display.display("", display_id=1)
-    >>> d2 = display.display("", display_id=2)
+def plot_total_video(env: Any,
+                     d: dict,
+                     d2: dict,
+                     policy: str = "random",
+                     verbose: int = 0,
+                     video_name: str = "video.mp4",
+                     not_plot: bool = False) -> None:
     """
+    Plot a total video using d and d2 for display.
+
+    Note this function requires d and d2. They must be created in the Jupyter Notebook with:
+    >>> d = display.display("", display_id=1)
+    >>> d2 = display.display("", display_id=2)
+
+    Parameters:
+    ----------
+    env : Any
+        The dm_control environment.
+    d : dict
+        The dictionary used to update the video frames.
+    d2 : dict
+        The dictionary used to update the display with additional information (e.g., FPS)
+    policy : str or callable, optional
+        The policy used to generate actions, forwarded to create_dm_video (default is "random")
+    verbose : int, optional
+        Verbosity level (default is 0).
+ video_name : str, optional + The name of the output video file (default is "video.mp4") + not_plot : bool, optional + If True, do not plot the video (default is False) + Returns: + ---------- + None + + """ create_dm_video(env, policy=policy, verbose=verbose, video_name=video_name, not_plot=not_plot) if not not_plot: - plot_video(d, d2, video_name=video_name) \ No newline at end of file + plot_video(d, d2, video_name=video_name) diff --git a/RLLG/ray_config/ball_in_cup_cfg.yaml b/RLLG/ray_config/ball_in_cup_cfg.yaml index 0b6407b3..b21eddca 100644 --- a/RLLG/ray_config/ball_in_cup_cfg.yaml +++ b/RLLG/ray_config/ball_in_cup_cfg.yaml @@ -1,65 +1,65 @@ -# RL env and common variables -repeat_run: 5 -max_traj_length: 1000 -replay_buffer_size: 100000 -seed: 42 -network_arch: '64-64' -policy_arch: '64-64' -qf_arch: '64-64' -policy_log_std_multiplier: 1.0 -policy_log_std_offset: -1.0 -n_epochs: 1000 -n_initial_env_steps: 1000 -n_env_steps_per_epoch: 1000 -n_train_step_per_epoch: 1000 -eval_period: 1 -eval_n_trajs: 5 -batch_size: 256 -discount: 0.99 -use_automatic_entropy_tuning: True -alpha_multiplier: 1.0 -backup_entropy: True -target_entropy: 0.0 -policy_lr: 3.0e-4 -qf_lr: 3.0e-4 -optimizer_type: 'adam' -soft_target_update_rate: 0.005 # 5e-3 -target_update_period: 1 - -# hyperparams for stabilization -activation_fn: 'relu' - -# for improved switched sac -use_automatic_entropy_tuning_parametrized_perturbation: True -expert_alpha_multiplier: 1.0 - -# Local experts variables -local_experts: - - 'MediumSAC' - -beta: - SAC: [ 0.0 ] - SAG: [ 0.0 ] - PIG: [ 1.0 ] # [ 0.5, 1, 2, 5 ] - PAG: [ 0.0 ] - -decay_parameter: - - False -delta: - SAC: [ 1.0 ] - SAG: [ 1.0 ] - PIG: [ 0.9 ] # [ 0.5, 0.8, 0.9 ] - PAG: [ 1.0 ] - -phi: - SAC: [ 0.0 ] - SAG: [ 0.0 ] - PIG: [ 0.0 ] - PAG: [ 0.8 ] # [ 0.5, 0.8, 1.0, 1.5 ] - -# Ray variables -metric: 'mean_avg_return' -mode: 'max' - -# Save policy -num_epoch_save: 50 +# RL env and common variables +repeat_run: 5 +max_traj_length: 1000 +replay_buffer_size: 100000 +seed: 42 +network_arch: '64-64' +policy_arch: '64-64' +qf_arch: '64-64' +policy_log_std_multiplier: 1.0 +policy_log_std_offset: -1.0 +n_epochs: 1 # 1000 +n_initial_env_steps: 10 +n_env_steps_per_epoch: 10 # 1000 +n_train_step_per_epoch: 10 # 1000 +eval_period: 1 +eval_n_trajs: 5 +batch_size: 256 +discount: 0.99 +use_automatic_entropy_tuning: True +alpha_multiplier: 1.0 +backup_entropy: True +target_entropy: 0.0 +policy_lr: 3.0e-4 +qf_lr: 3.0e-4 +optimizer_type: 'adam' +soft_target_update_rate: 0.005 # 5e-3 +target_update_period: 1 + +# hyperparams for stabilization +activation_fn: 'relu' + +# for improved switched sac +use_automatic_entropy_tuning_parametrized_perturbation: True +expert_alpha_multiplier: 1.0 + +# Local experts variables +local_experts: + - 'MediumSAC' + +beta: + SAC: [ 0.0 ] + SAG: [ 0.0 ] + PIG: [ 1.0 ] # [ 0.5, 1, 2, 5 ] + PAG: [ 0.0 ] + +decay_parameter: + - False +delta: + SAC: [ 1.0 ] + SAG: [ 1.0 ] + PIG: [ 0.9 ] # [ 0.5, 0.8, 0.9 ] + PAG: [ 1.0 ] + +phi: + SAC: [ 0.0 ] + SAG: [ 0.0 ] + PIG: [ 0.0 ] + PAG: [ 0.8 ] # [ 0.5, 0.8, 1.0, 1.5 ] + +# Ray variables +metric: 'mean_avg_return' +mode: 'max' + +# Save policy +num_epoch_save: 50 diff --git a/RLLG/ray_config/bullet_small_reach_cfg.yaml b/RLLG/ray_config/bullet_small_reach_cfg.yaml index d61282e5..20b4c76f 100644 --- a/RLLG/ray_config/bullet_small_reach_cfg.yaml +++ b/RLLG/ray_config/bullet_small_reach_cfg.yaml @@ -1,71 +1,71 @@ -# RL env and common variables -repeat_run: 5 -max_traj_length: 1000 -replay_buffer_size: 1000000 -seed: 42 
-network_arch: '64-64' -policy_arch: '64-64' -qf_arch: '64-64' -policy_log_std_multiplier: 1.0 -policy_log_std_offset: -1.0 -n_epochs: 2001 -n_initial_env_steps: 1000 -n_env_steps_per_epoch: 1000 -n_train_step_per_epoch: 1000 -eval_period: 1 -eval_n_trajs: 5 -batch_size: 256 -discount: 0.99 -use_automatic_entropy_tuning: True -alpha_multiplier: 1.0 -backup_entropy: True -target_entropy: 0.0 -policy_lr: 3.0e-4 -qf_lr: 3.0e-4 -optimizer_type: 'adam' -soft_target_update_rate: 0.005 # 5e-3 -target_update_period: 1 - -# hyperparams for stabilization -activation_fn: 'relu' - -# for improved switched sac -use_automatic_entropy_tuning_parametrized_perturbation: True -expert_alpha_multiplier: 1.0 - -# Local experts variables -local_experts: - - 'SafeScripted' - -pos_tol_choices: - SAC: [ 2.0 ] - SAG: [ 2.0 ] - PIG: [ 2.0 ] - PAG: [ 2.0 ] - -beta: - SAC: [ 1.0 ] - SAG: [ 1.0 ] - PIG: [ 0.5, 1, 2, 5 ] - PAG: [ 1.0 ] - -decay_parameter: - - False -delta: - SAC: [ 1.0 ] - SAG: [ 1.0 ] - PIG: [ 1.0 ] - PAG: [ 0.5, 0.9 ] - -phi: - SAC: [ 1.0 ] - SAG: [ 1.0 ] - PIG: [ 1.0 ] - PAG: [ 0.2, 0.6, 0.8 ] - -# Ray variables -metric: 'mean_avg_return' -mode: 'max' - -# Save policy -num_epoch_save: 10000 +# RL env and common variables +repeat_run: 5 +max_traj_length: 1000 +replay_buffer_size: 1000000 +seed: 42 +network_arch: '64-64' +policy_arch: '64-64' +qf_arch: '64-64' +policy_log_std_multiplier: 1.0 +policy_log_std_offset: -1.0 +n_epochs: 2001 +n_initial_env_steps: 1000 +n_env_steps_per_epoch: 1000 +n_train_step_per_epoch: 1000 +eval_period: 1 +eval_n_trajs: 5 +batch_size: 256 +discount: 0.99 +use_automatic_entropy_tuning: True +alpha_multiplier: 1.0 +backup_entropy: True +target_entropy: 0.0 +policy_lr: 3.0e-4 +qf_lr: 3.0e-4 +optimizer_type: 'adam' +soft_target_update_rate: 0.005 # 5e-3 +target_update_period: 1 + +# hyperparams for stabilization +activation_fn: 'relu' + +# for improved switched sac +use_automatic_entropy_tuning_parametrized_perturbation: True +expert_alpha_multiplier: 1.0 + +# Local experts variables +local_experts: + - 'SafeScripted' + +pos_tol_choices: + SAC: [ 2.0 ] + SAG: [ 2.0 ] + PIG: [ 2.0 ] + PAG: [ 2.0 ] + +beta: + SAC: [ 1.0 ] + SAG: [ 1.0 ] + PIG: [ 0.5, 1, 2, 5 ] + PAG: [ 1.0 ] + +decay_parameter: + - False +delta: + SAC: [ 1.0 ] + SAG: [ 1.0 ] + PIG: [ 1.0 ] + PAG: [ 0.5, 0.9 ] + +phi: + SAC: [ 1.0 ] + SAG: [ 1.0 ] + PIG: [ 1.0 ] + PAG: [ 0.2, 0.6, 0.8 ] + +# Ray variables +metric: 'mean_avg_return' +mode: 'max' + +# Save policy +num_epoch_save: 10000 diff --git a/RLLG/ray_config/cartpole_cfg.yaml b/RLLG/ray_config/cartpole_cfg.yaml index 5c52434a..6e87228c 100644 --- a/RLLG/ray_config/cartpole_cfg.yaml +++ b/RLLG/ray_config/cartpole_cfg.yaml @@ -1,81 +1,81 @@ -# RL env and common variables -repeat_run: 5 -max_traj_length: 1000 -replay_buffer_size: 1000000 -seed: 42 -network_arch: '32-32' -policy_arch: '32-32' -qf_arch: '32-32' -policy_log_std_multiplier: 1.0 -policy_log_std_offset: -1.0 -n_epochs: 2001 -n_initial_env_steps: 1000 -n_env_steps_per_epoch: 1000 -n_train_step_per_epoch: 1000 -eval_period: 1 -eval_n_trajs: 5 -batch_size: 256 -discount: 0.99 -use_automatic_entropy_tuning: True -alpha_multiplier: 1.0 -backup_entropy: True -target_entropy: 0.0 -policy_lr: 3.0e-4 -qf_lr: 3.0e-4 -optimizer_type: 'adam' -soft_target_update_rate: 0.005 # 5e-3 -target_update_period: 1 - -# hyperparams for stabilization -activation_fn: 'tanh' - -# for improved switched sac -use_automatic_entropy_tuning_parametrized_perturbation: True -expert_alpha_multiplier: 1.0 - -# safe cartpole hyperparms 
-limit_cart: 1.9 -reward_end: - - 1000 - -# SAC changes -use_success_buffer: False -ratio_success: - - 0.0 - -# Local experts variables -local_experts: - - 'SafeScripted' - -pos_tol_choices: - SAC: [ 1.7 ] - SAG: [ 1.7 ] - PIG: [ 1.7 ] - PAG: [ 1.7 ] - -beta: - SAC: [ 1.0 ] - SAG: [ 1.0 ] - PIG: [ 0.5, 1, 2, 5 ] - PAG: [ 1.0 ] - -decay_parameter: - - True -decay_rate: - SAC: [ 1.0 ] - SAG: [ 1.0 ] - PIG: [ 1.0 ] - PAG: [ 0.7, 0.8, 0.9 ] - -phi: - SAC: [ 1.0 ] - SAG: [ 1.0 ] - PIG: [ 1.0 ] - PAG: [ 0.5, 0.7, 0.9 ] - -# Ray variables -metric: 'mean_avg_return' -mode: 'max' - -# Save policy -num_epoch_save: 1000 +# RL env and common variables +repeat_run: 5 +max_traj_length: 1000 +replay_buffer_size: 1000000 +seed: 42 +network_arch: '32-32' +policy_arch: '32-32' +qf_arch: '32-32' +policy_log_std_multiplier: 1.0 +policy_log_std_offset: -1.0 +n_epochs: 2001 +n_initial_env_steps: 1000 +n_env_steps_per_epoch: 1000 +n_train_step_per_epoch: 1000 +eval_period: 1 +eval_n_trajs: 5 +batch_size: 256 +discount: 0.99 +use_automatic_entropy_tuning: True +alpha_multiplier: 1.0 +backup_entropy: True +target_entropy: 0.0 +policy_lr: 3.0e-4 +qf_lr: 3.0e-4 +optimizer_type: 'adam' +soft_target_update_rate: 0.005 # 5e-3 +target_update_period: 1 + +# hyperparams for stabilization +activation_fn: 'tanh' + +# for improved switched sac +use_automatic_entropy_tuning_parametrized_perturbation: True +expert_alpha_multiplier: 1.0 + +# safe cartpole hyperparms +limit_cart: 1.9 +reward_end: + - 1000 + +# SAC changes +use_success_buffer: False +ratio_success: + - 0.0 + +# Local experts variables +local_experts: + - 'SafeScripted' + +pos_tol_choices: + SAC: [ 1.7 ] + SAG: [ 1.7 ] + PIG: [ 1.7 ] + PAG: [ 1.7 ] + +beta: + SAC: [ 1.0 ] + SAG: [ 1.0 ] + PIG: [ 0.5, 1, 2, 5 ] + PAG: [ 1.0 ] + +decay_parameter: + - True +decay_rate: + SAC: [ 1.0 ] + SAG: [ 1.0 ] + PIG: [ 1.0 ] + PAG: [ 0.7, 0.8, 0.9 ] + +phi: + SAC: [ 1.0 ] + SAG: [ 1.0 ] + PIG: [ 1.0 ] + PAG: [ 0.5, 0.7, 0.9 ] + +# Ray variables +metric: 'mean_avg_return' +mode: 'max' + +# Save policy +num_epoch_save: 1000 diff --git a/RLLG/ray_config/hirl_point_fall_cfg.yaml b/RLLG/ray_config/hirl_point_fall_cfg.yaml index 808c6cc7..61075770 100644 --- a/RLLG/ray_config/hirl_point_fall_cfg.yaml +++ b/RLLG/ray_config/hirl_point_fall_cfg.yaml @@ -1,66 +1,66 @@ -# RL env and common variables -repeat_run: 5 -max_traj_length: 1000 -replay_buffer_size: 1000000 -seed: 42 -network_arch: '32-32' -policy_arch: '32-32' -qf_arch: '32-32' -policy_log_std_multiplier: 1.0 -policy_log_std_offset: -1.0 -n_epochs: 2001 -n_initial_env_steps: 1000 -n_env_steps_per_epoch: 1000 -n_train_step_per_epoch: 1000 -eval_period: 1 -eval_n_trajs: 5 -batch_size: 256 -discount: 0.99 -use_automatic_entropy_tuning: True -alpha_multiplier: 1.0 -backup_entropy: True -target_entropy: 0.0 -policy_lr: 3.0e-4 -qf_lr: 3.0e-4 -optimizer_type: 'adam' -soft_target_update_rate: 5.0e-3 -target_update_period: 1 - -# hyperparams for stabilization -activation_fn: 'relu' - -# for improved switched sac -use_automatic_entropy_tuning_parametrized_perturbation: True -expert_alpha_multiplier: 1.0 - -# Local experts variables -local_experts: - - 'MediumSAC' - -beta: - SAC: [ 0.0 ] - SAG: [ 0.0 ] - PIG: [ 0.5, 1, 2, 5 ] - PAG: [ 0.0 ] - -decay_parameter: - - False -delta: - SAC: [ 1.0 ] - SAG: [ 1.0 ] - PIG: [ 0.5, 0.8, 0.9 ] - PAG: [ 1.0 ] - -phi: - SAC: [ 0.0 ] - SAG: [ 0.0 ] - PIG: [ 0.0 ] - PAG: [ 0.5, 0.8, 1.0, 1.5 ] - -# Ray variables -metric: 'mean_avg_return' -mode: 'max' - -# Save policy -num_epoch_save: 50 - +# RL env and 
common variables +repeat_run: 5 +max_traj_length: 1000 +replay_buffer_size: 1000000 +seed: 42 +network_arch: '32-32' +policy_arch: '32-32' +qf_arch: '32-32' +policy_log_std_multiplier: 1.0 +policy_log_std_offset: -1.0 +n_epochs: 2001 +n_initial_env_steps: 1000 +n_env_steps_per_epoch: 1000 +n_train_step_per_epoch: 1000 +eval_period: 1 +eval_n_trajs: 5 +batch_size: 256 +discount: 0.99 +use_automatic_entropy_tuning: True +alpha_multiplier: 1.0 +backup_entropy: True +target_entropy: 0.0 +policy_lr: 3.0e-4 +qf_lr: 3.0e-4 +optimizer_type: 'adam' +soft_target_update_rate: 5.0e-3 +target_update_period: 1 + +# hyperparams for stabilization +activation_fn: 'relu' + +# for improved switched sac +use_automatic_entropy_tuning_parametrized_perturbation: True +expert_alpha_multiplier: 1.0 + +# Local experts variables +local_experts: + - 'MediumSAC' + +beta: + SAC: [ 0.0 ] + SAG: [ 0.0 ] + PIG: [ 0.5, 1, 2, 5 ] + PAG: [ 0.0 ] + +decay_parameter: + - False +delta: + SAC: [ 1.0 ] + SAG: [ 1.0 ] + PIG: [ 0.5, 0.8, 0.9 ] + PAG: [ 1.0 ] + +phi: + SAC: [ 0.0 ] + SAG: [ 0.0 ] + PIG: [ 0.0 ] + PAG: [ 0.5, 0.8, 1.0, 1.5 ] + +# Ray variables +metric: 'mean_avg_return' +mode: 'max' + +# Save policy +num_epoch_save: 50 + diff --git a/RLLG/ray_config/point_circle_cfg.yaml b/RLLG/ray_config/point_circle_cfg.yaml index 1ff41c11..c8768b9c 100644 --- a/RLLG/ray_config/point_circle_cfg.yaml +++ b/RLLG/ray_config/point_circle_cfg.yaml @@ -1,71 +1,71 @@ -# RL env and common variables -repeat_run: 5 -max_traj_length: 1000 -replay_buffer_size: 100000 -seed: 42 -network_arch: '64-64' -policy_arch: '64-64' -qf_arch: '64-64' -policy_log_std_multiplier: 1.0 -policy_log_std_offset: -1.0 -n_epochs: 2001 -n_initial_env_steps: 1000 -n_env_steps_per_epoch: 1000 -n_train_step_per_epoch: 1000 -eval_period: 1 -eval_n_trajs: 5 -batch_size: 256 -discount: 0.99 -use_automatic_entropy_tuning: True -alpha_multiplier: 1.0 -backup_entropy: True -target_entropy: 0.0 -policy_lr: 3.0e-4 -qf_lr: 3.0e-4 -optimizer_type: 'adam' -soft_target_update_rate: 0.005 # 5e-3 -target_update_period: 1 - -# hyperparams for stabilization -activation_fn: 'relu' - -# for improved switched sac -use_automatic_entropy_tuning_parametrized_perturbation: True -expert_alpha_multiplier: 1.0 - -# Local experts variables -local_experts: - - 'SafeScripted' - -pos_tol_choices: - SAC: [ 0.5 ] - SAG: [ 0.5 ] - PIG: [ 0.5 ] - PAG: [ 0.5 ] - -beta: - SAC: [ 1.0 ] - SAG: [ 1.0 ] - PIG: [ 0.5, 1, 2, 5 ] - PAG: [ 1.0 ] - -decay_parameter: - - True -delta: - SAC: [ 1.0 ] - SAG: [ 1.0 ] - PIG: [ 1.0 ] - PAG: [ 0.5, 0.8, 0.9 ] - -phi: - SAC: [ 0.0 ] - SAG: [ 0.0 ] - PIG: [ 0.0 ] - PAG: [ 0.1, 0.3, 0.5, 1.0 ] - -# Ray variables -metric: 'mean_avg_return' -mode: 'max' - -# Save policy -num_epoch_save: 5000 +# RL env and common variables +repeat_run: 5 +max_traj_length: 1000 +replay_buffer_size: 100000 +seed: 42 +network_arch: '64-64' +policy_arch: '64-64' +qf_arch: '64-64' +policy_log_std_multiplier: 1.0 +policy_log_std_offset: -1.0 +n_epochs: 2001 +n_initial_env_steps: 1000 +n_env_steps_per_epoch: 1000 +n_train_step_per_epoch: 1000 +eval_period: 1 +eval_n_trajs: 5 +batch_size: 256 +discount: 0.99 +use_automatic_entropy_tuning: True +alpha_multiplier: 1.0 +backup_entropy: True +target_entropy: 0.0 +policy_lr: 3.0e-4 +qf_lr: 3.0e-4 +optimizer_type: 'adam' +soft_target_update_rate: 0.005 # 5e-3 +target_update_period: 1 + +# hyperparams for stabilization +activation_fn: 'relu' + +# for improved switched sac +use_automatic_entropy_tuning_parametrized_perturbation: True 
+expert_alpha_multiplier: 1.0 + +# Local experts variables +local_experts: + - 'SafeScripted' + +pos_tol_choices: + SAC: [ 0.5 ] + SAG: [ 0.5 ] + PIG: [ 0.5 ] + PAG: [ 0.5 ] + +beta: + SAC: [ 1.0 ] + SAG: [ 1.0 ] + PIG: [ 0.5, 1, 2, 5 ] + PAG: [ 1.0 ] + +decay_parameter: + - True +delta: + SAC: [ 1.0 ] + SAG: [ 1.0 ] + PIG: [ 1.0 ] + PAG: [ 0.5, 0.8, 0.9 ] + +phi: + SAC: [ 0.0 ] + SAG: [ 0.0 ] + PIG: [ 0.0 ] + PAG: [ 0.1, 0.3, 0.5, 1.0 ] + +# Ray variables +metric: 'mean_avg_return' +mode: 'max' + +# Save policy +num_epoch_save: 5000 diff --git a/RLLG/ray_config/point_mass_cfg.yaml b/RLLG/ray_config/point_mass_cfg.yaml index a1144843..c6d8833c 100644 --- a/RLLG/ray_config/point_mass_cfg.yaml +++ b/RLLG/ray_config/point_mass_cfg.yaml @@ -1,66 +1,66 @@ -# RL env and common variables -repeat_run: 5 -max_traj_length: 1000 -replay_buffer_size: 100000 -seed: 42 -network_arch: '64-64' -policy_arch: '64-64' -qf_arch: '64-64' -policy_log_std_multiplier: 1.0 -policy_log_std_offset: -1.0 -n_epochs: 1000 -n_initial_env_steps: 1000 -n_env_steps_per_epoch: 1000 -n_train_step_per_epoch: 1000 -eval_period: 1 -eval_n_trajs: 5 -batch_size: 256 -discount: 0.99 -use_automatic_entropy_tuning: True -alpha_multiplier: 1.0 -backup_entropy: True -target_entropy: 0.0 -policy_lr: 3.0e-4 -qf_lr: 3.0e-4 -optimizer_type: 'adam' -soft_target_update_rate: 0.005 # 5e-3 -target_update_period: 1 - -# hyperparams for stabilization -activation_fn: 'relu' - -# for improved switched sac -use_automatic_entropy_tuning_parametrized_perturbation: True -expert_alpha_multiplier: 1.0 - -# Local experts variables -local_experts: - - 'MediumSAC' - -beta: - SAC: [ 0.0 ] - SAG: [ 0.0 ] - PIG: [ 0.5, 1, 2, 5 ] - PAG: [ 0.0 ] - - -decay_parameter: - - False -delta: - SAC: [ 1.0 ] - SAG: [ 1.0 ] - PIG: [ 0.5, 0.8, 0.9 ] - PAG: [ 1.0 ] - -phi: - SAC: [ 0.0 ] - SAG: [ 0.0 ] - PIG: [ 0.0 ] - PAG: [ 0.5, 0.8, 1.0, 1.5 ] - -# Ray variables -metric: 'mean_avg_return' -mode: 'max' - -# Save policy -num_epoch_save: 10000 +# RL env and common variables +repeat_run: 5 +max_traj_length: 1000 +replay_buffer_size: 100000 +seed: 42 +network_arch: '64-64' +policy_arch: '64-64' +qf_arch: '64-64' +policy_log_std_multiplier: 1.0 +policy_log_std_offset: -1.0 +n_epochs: 1000 +n_initial_env_steps: 1000 +n_env_steps_per_epoch: 1000 +n_train_step_per_epoch: 1000 +eval_period: 1 +eval_n_trajs: 5 +batch_size: 256 +discount: 0.99 +use_automatic_entropy_tuning: True +alpha_multiplier: 1.0 +backup_entropy: True +target_entropy: 0.0 +policy_lr: 3.0e-4 +qf_lr: 3.0e-4 +optimizer_type: 'adam' +soft_target_update_rate: 0.005 # 5e-3 +target_update_period: 1 + +# hyperparams for stabilization +activation_fn: 'relu' + +# for improved switched sac +use_automatic_entropy_tuning_parametrized_perturbation: True +expert_alpha_multiplier: 1.0 + +# Local experts variables +local_experts: + - 'MediumSAC' + +beta: + SAC: [ 0.0 ] + SAG: [ 0.0 ] + PIG: [ 0.5, 1, 2, 5 ] + PAG: [ 0.0 ] + + +decay_parameter: + - False +delta: + SAC: [ 1.0 ] + SAG: [ 1.0 ] + PIG: [ 0.5, 0.8, 0.9 ] + PAG: [ 1.0 ] + +phi: + SAC: [ 0.0 ] + SAG: [ 0.0 ] + PIG: [ 0.0 ] + PAG: [ 0.5, 0.8, 1.0, 1.5 ] + +# Ray variables +metric: 'mean_avg_return' +mode: 'max' + +# Save policy +num_epoch_save: 10000 diff --git a/RLLG/requirements.txt b/RLLG/requirements.txt new file mode 100644 index 00000000..4725dc4d --- /dev/null +++ b/RLLG/requirements.txt @@ -0,0 +1,23 @@ +setuptools==65.5.0 +wheel==0.38.0 +numpy==1.23.1 +torch==1.10.2 +tensorboardX==2.4.1 +mujoco-py==2.1.2.14 +omegaconf==2.1.1 +protobuf==3.20.0 +# Install gym 
by hand. Works with traditional pip install -e . inside a conda env,
+# but causes problems with the docker build otherwise.
+gym==0.21.0
+ray[tune]==1.9.2
+pyyaml
+matplotlib
+ipython
+pandas
+jupyter
+ml-collections
+scipy
+# Install dmc2gym by hand, with git clone git+https://github.com/denisyarats/dmc2gym.git and
+# pip install -e .
+# 'dmc2gym @ git+https://github.com/denisyarats/dmc2gym.git'
\ No newline at end of file
diff --git a/RLLG/sac_main_fn.py b/RLLG/sac_main_fn.py
index 1bd8d790..03c8c29c 100644
--- a/RLLG/sac_main_fn.py
+++ b/RLLG/sac_main_fn.py
@@ -8,7 +8,7 @@
 # PARTICULAR PURPOSE. See the MIT License for more details.
 
 
-
+from typing import Union, Any, Dict, List, Optional, Tuple
 from copy import deepcopy
 import torch
 from omegaconf import DictConfig
@@ -17,27 +17,48 @@
 import numpy as np
 
 # agents
-from agents.algos.sac import SAC
-from agents.algos.sag import SAG
-from agents.algos.pag import PAG
-from agents.algos.pig import PIG
 from agents.common.model import TanhGaussianPolicy, ParametrizedPerturbationTanhGaussianPolicy, FullyConnectedQFunction, \
     SamplerPolicy, ExpertSamplerPolicy
 from agents.common.replay_buffer import ReplayBuffer, batch_to_torch
 from agents.common.sampler import StepSampler, TrajSampler
 from agents.common.utils import Timer, set_random_seed, prefix_metrics
+from agents.common.creation_utils import create_envs, create_agent
 from envs.creation import get_env_and_control
 from envs.confidence import global_lambda_s
 
-dict_agents = {
-    'SAC': SAC,
-    'SAG': SAG,
-    'PIG': PIG,
-    'PAG': PAG,
-}
 
-def save_all_models(qf1, qf2, target_qf1, target_qf2, policy, path):
+def save_all_models(qf1: torch.nn.Module,
+                    qf2: torch.nn.Module,
+                    target_qf1: torch.nn.Module,
+                    target_qf2: torch.nn.Module,
+                    policy: torch.nn.Module,
+                    path: Union[str, os.PathLike]) -> None:
+    """
+    Save the state dictionaries of the different networks the agent uses to a specific path.
+
+    Parameters:
+    ----------
+    qf1 : torch.nn.Module
+        Critic 1
+    qf2 : torch.nn.Module
+        Critic 2
+    target_qf1 : torch.nn.Module
+        Target Critic 1
+    target_qf2 : torch.nn.Module
+        Target Critic 2
+    policy : torch.nn.Module
+        Policy
+    path : Union[str, os.PathLike]
+        The path where the model state dictionaries will be saved
+
+    Returns:
+    ----------
+    None
+        The function does not return anything.
+    """
     torch.save(qf1.state_dict(), os.path.join(path, 'qf1'))
     torch.save(qf2.state_dict(), os.path.join(path, 'qf2'))
     torch.save(target_qf1.state_dict(), os.path.join(path, 'target_qf1'))
@@ -45,7 +66,36 @@ def save_all_models(qf1, qf2, target_qf1, target_qf2, policy, path):
     torch.save(policy.state_dict(), os.path.join(path, 'policy'))
 
 
-def load_all_models(qf1, qf2, target_qf1, target_qf2, policy, path):
+def load_all_models(qf1: torch.nn.Module,
+                    qf2: torch.nn.Module,
+                    target_qf1: torch.nn.Module,
+                    target_qf2: torch.nn.Module,
+                    policy: torch.nn.Module,
+                    path: Union[str, os.PathLike]) -> None:
+    """
+    Load the state dictionaries of the different networks the agent uses from a specific path.
+
+    Parameters:
+    ----------
+    qf1 : torch.nn.Module
+        Critic 1
+    qf2 : torch.nn.Module
+        Critic 2
+    target_qf1 : torch.nn.Module
+        Target Critic 1
+    target_qf2 : torch.nn.Module
+        Target Critic 2
+    policy : torch.nn.Module
+        Policy
+    path : Union[str, os.PathLike]
+        The path from which the model state dictionaries will be loaded.
+
+    Returns:
+    ----------
+    None
+        The function does not return anything.
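+
+    Examples:
+    ----------
+    A hypothetical round trip (the checkpoint directory name is illustrative only):
+
+    >>> save_all_models(qf1, qf2, target_qf1, target_qf2, policy, path='./checkpoints/run_0')
+    >>> load_all_models(qf1, qf2, target_qf1, target_qf2, policy, path='./checkpoints/run_0')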
+ """ qf1.load_state_dict(torch.load(os.path.join(path, 'qf1'))) qf2.load_state_dict(torch.load(os.path.join(path, 'qf2'))) target_qf1.load_state_dict(torch.load(os.path.join(path, 'target_qf1'))) @@ -53,7 +103,20 @@ def load_all_models(qf1, qf2, target_qf1, target_qf2, policy, path): policy.load_state_dict(torch.load(os.path.join(path, 'policy'))) -def main(cfg): +def main(cfg: Dict) -> None: + """ + Main function to train an RL agent using Ray Tune. + + Parameters: + ---------- + cfg : Dict + The configuration dictionary + + Returns: + ---------- + None + The function runs the training process and reports metrics to Ray Tune. + """ cfg = DictConfig(cfg) # global hyperparameters @@ -61,47 +124,30 @@ def main(cfg): glob_name = cfg['glob_name'] num_run = cfg['repeat_run'] - # environment parameters - limit_cart = None - reward_end = None + # create envs and retrieve local controls + env_train, local_control_dict_train, env_test, local_control_dict_test = create_envs(cfg) + + # retrieve local experts and their confidence function + expert = cfg['expert'] pos_tol = None - speed_tol = None - if 'limit_cart' in cfg: - limit_cart = cfg['limit_cart'] - if 'reward_end' in cfg: - reward_end = cfg['reward_end'] if 'pos_tol' in cfg: pos_tol = cfg['pos_tol'] - env_train, local_control_dict_train = get_env_and_control(name=cfg['env'], - orig_cwd=cfg['orig_cwd'], - device=cfg['device'], - limit_cart=limit_cart, - reward_end=reward_end, - pos_tol=pos_tol - ) - env_test, local_control_dict_test = get_env_and_control(name=cfg['env'], - orig_cwd=cfg['orig_cwd'], - device=cfg['device'], - limit_cart=limit_cart, - reward_end=reward_end, - pos_tol=pos_tol - ) - - # experts - expert = cfg['expert'] lambda_s = global_lambda_s(cfg['glob_name'], expert, device=cfg['device'], - pos_tol=pos_tol, - speed_tol=speed_tol + pos_tol=pos_tol ) local_expert = local_control_dict_train[expert]['local_expert'] + # Create samplers train_sampler = StepSampler(env_train, cfg['max_traj_length']) # .unwrapped eval_sampler = TrajSampler(env_test, cfg['max_traj_length']) # .unwrapped + + # Create replay buffer replay_buffer = ReplayBuffer(cfg['replay_buffer_size']) set_random_seed(cfg["repeat_run"]) + # Create relevant networks (Critics, Target Critics, Perturbations, Policies) policy = TanhGaussianPolicy( eval_sampler.env.observation_space.shape[0], eval_sampler.env.action_space.shape[0], @@ -143,47 +189,18 @@ def main(cfg): cfg['target_entropy'] = -np.prod(eval_sampler.env.action_space.shape).item() # Get agent - if cfg['agent_name'] == 'SAC': - agent = dict_agents[agent_name](cfg, - policy, - sampler_policy, - qf1, - qf2, - target_qf1, - target_qf2) - elif cfg['agent_name'] == 'SAG': - agent = dict_agents[agent_name](cfg, - policy, - sampler_policy, - qf1, - qf2, - target_qf1, - target_qf2, - use_local=lambda_s, - local_expert=local_expert) - elif cfg['agent_name'] == 'PIG': - agent = dict_agents[agent_name](cfg, - policy, - sampler_policy, - qf1, - qf2, - target_qf1, - target_qf2, - use_local=lambda_s, - local_expert=local_expert, - beta=cfg['beta']) - else: - agent = dict_agents[agent_name](cfg, - policy, - sampler_policy, - qf1, - qf2, - target_qf1, - target_qf2, - use_local=lambda_s, - local_expert=local_expert, - parametrized_perturbation=parametrized_perturbation, - sampler_parametrized_perturbation=sampler_parametrized_perturbation) + agent = create_agent(cfg, + agent_name, + policy, + sampler_policy, + qf1, + qf2, + target_qf1, + target_qf2, + lambda_s, + local_expert, + parametrized_perturbation, + 
sampler_parametrized_perturbation)
 
     agent.torch_to_device(cfg['device'])
 
     # put beta right if PAG without decay parameter
@@ -253,6 +270,7 @@ def main(cfg):
 
         if agent_name in ['PIG', 'PAG']:
             metrics[f'beta'] = agent.beta
 
+        # Report metrics to ray tune
         if epoch == 0 or (epoch + 1) % cfg['eval_period'] == 0 or epoch == cfg['n_epochs'] - 1:
             metrics['epoch'] = epoch
             metrics['rollout_time'] = rollout_timer()
@@ -263,6 +281,7 @@ def main(cfg):
             # Report metrics
             tune.report(**metrics)
 
+        # Save agent policy if required
         if epoch % cfg['num_epoch_save'] == 0 and cfg['agent_name'] == 'SAC' and epoch > 0:
             act_fn = cfg['activation_fn']
             save_path_init = os.path.join(cfg['orig_cwd'],
diff --git a/RLLG/setup.py b/RLLG/setup.py
index aeb2f3e5..7369c5b9 100644
--- a/RLLG/setup.py
+++ b/RLLG/setup.py
@@ -1,27 +1,35 @@
-# Created by Paul Daoudi
-# Date: 11/02/2023
-
-from setuptools import setup
-
-setup(author='Paul Daoudi',
-      name='rllg',
-      version='0.1.0',
-      install_requires=[
-          'setuptools',
-          'numpy==1.23.1',
-          'torch==1.10.2',
-          'tensorboardX==2.4.1',
-          'mujoco-py==2.1.2.14',
-          'omegaconf==2.1.1',
-          'gym==0.21.0',
-          'ray[tune]',
-          'pyyaml',
-          'matplotlib',
-          'ipython',
-          'pandas',
-          'matplotlib',
-          'jupyter',
-          'ml-collections',
-          'scipy'
-      ]
-)
\ No newline at end of file
+# Created by Paul Daoudi
+# Date: 11/02/2023
+
+from setuptools import setup, find_packages
+
+setup(author='Paul Daoudi',
+      name='rllg',
+      version='0.1.0',
+      packages=find_packages(),
+      install_requires=[
+          'setuptools==65.5.0',
+          'wheel==0.38.0',
+          'numpy==1.23.1',
+          'torch==1.10.2',
+          'tensorboardX==2.4.1',
+          'mujoco-py==2.1.2.14',
+          'omegaconf==2.1.1',
+          'protobuf==3.20.0',
+          # Install gym by hand. Works with traditional pip install -e . inside a conda env,
+          # but causes problems with the docker build otherwise.
+          # 'gym==0.21.0',
+          'ray[tune]==1.9.2',
+          'pyyaml',
+          'matplotlib',
+          'ipython',
+          'pandas',
+          'jupyter',
+          'ml-collections',
+          'scipy',
+          # Install dmc2gym by hand, with git clone git+https://github.com/denisyarats/dmc2gym.git and
+          # pip install -e .
+          'dmc2gym @ git+https://github.com/denisyarats/dmc2gym.git'
+      ]
+)
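For reference, sac_main_fn.main() above expects a flat config dictionary and reports its metrics through ray.tune. The snippet below is a hedged sketch of how such a config could be assembled and launched; it is not the repository's own launcher. The keys mirror the ray_config YAML files and the cfg[...] accesses in main(), but details such as the glob_name value and the flattening of the per-agent hyperparameter grids are assumptions.

# Hedged sketch only: illustrates the shape of the config main() reads and how it plugs into ray.tune.
import os
import yaml
from ray import tune
from sac_main_fn import main

with open(os.path.join('ray_config', 'ball_in_cup_cfg.yaml')) as f:
    cfg = yaml.safe_load(f)              # common hyperparameters from the YAML config

cfg.update({
    'env': 'ball_in_cup',                # environment name (assumed to match the config file)
    'glob_name': 'ball_in_cup',          # assumption: key used for the confidence-function lookup
    'agent_name': 'SAC',                 # one of SAC, SAG, PIG, PAG
    'expert': 'MediumSAC',               # taken from local_experts in the YAML
    'device': 'cpu',
    'orig_cwd': os.getcwd(),
})

# Per-agent hyperparameters (beta, delta, phi) are grid dictionaries in the YAML; they would
# normally be flattened to scalars or wrapped in tune.grid_search before launching.
for key in ('beta', 'delta', 'phi'):
    cfg[key] = cfg[key]['SAC'][0]

tune.run(main, config=cfg, metric=cfg['metric'], mode=cfg['mode'], num_samples=1)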