This is the source code for the paper
Fast Adaptation to New Environments via Policy-Dynamics Value Functions
by Roberta Raileanu, Max Goldstein, Arthur Szlam, and Rob Fergus,
accepted at ICML 2020.
If you use this code in your own work, please cite our paper:
@inproceedings{icml2020_3993,
  abstract = {Standard RL algorithms assume fixed environment dynamics and require a significant amount of interaction to adapt to new environments. We introduce Policy-Dynamics Value Functions (PD-VF), a novel approach for rapidly adapting to dynamics different from those previously seen in training. PD-VF explicitly estimates the cumulative reward in a space of policies and environments. An ensemble of conventional RL policies is used to gather experience on training environments, from which embeddings of both policies and environments can be learned. Then, a value function conditioned on both embeddings is trained. At test time, a few actions are sufficient to infer the environment embedding, enabling a policy to be selected by maximizing the learned value function (which requires no additional environment interaction). We show that our method can rapidly adapt to new dynamics on a set of MuJoCo domains.},
  author = {Raileanu, Roberta and Goldstein, Max and Szlam, Arthur and Fergus, Rob},
  booktitle = {Proceedings of the 37th International Conference on Machine Learning},
  pages = {7078--7089},
  title = {Fast Adaptation to New Environments via Policy-Dynamics Value Functions},
  year = {2020}
}
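As the abstract describes, PD-VF learns a value function conditioned on both a policy embedding and an environment embedding, and at test time selects a policy by maximizing that value function once the environment embedding has been inferred from a few actions. Below is a minimal sketch of that selection step, assuming a simple MLP value network and a discrete set of candidate policy embeddings; `ValueNet`, `select_policy`, and all dimensions are illustrative, not the repo's API.

```python
import torch
import torch.nn as nn

class ValueNet(nn.Module):
    """Value function conditioned on a policy embedding and an environment embedding."""
    def __init__(self, policy_dim=8, env_dim=8, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(policy_dim + env_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, policy_emb, env_emb):
        # Concatenate the two embeddings and predict the cumulative reward.
        return self.net(torch.cat([policy_emb, env_emb], dim=-1))

def select_policy(value_net, env_emb, candidate_policy_embs):
    # Score every candidate policy under the inferred dynamics; this step
    # requires no additional environment interaction.
    with torch.no_grad():
        n = candidate_policy_embs.shape[0]
        values = value_net(candidate_policy_embs,
                           env_emb.unsqueeze(0).expand(n, -1))
    return candidate_policy_embs[values.argmax()]

# Example: pick among 5 candidate policies for a freshly inferred embedding.
value_net = ValueNet()
z_env = torch.randn(8)
candidates = torch.randn(5, 8)
best_policy_emb = select_policy(value_net, z_env, candidates)
```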
To set up the conda environment, clone the repo, and install the custom environments:

conda create -n pdvf python=3.7
conda activate pdvf

git clone [email protected]:rraileanu/policy-dynamics-value-functions.git
cd policy-dynamics-value-functions
pip install -r requirements.txt

cd myant
pip install -e .

cd ../myswimmer
pip install -e .

cd ../myspaceship
pip install -e .
Train PPO policies on each environment, one run per (seed, default-ind) pair. Each of the commands below needs to be run for every seed in [0, ..., 4] and every default-ind in [0, ..., 19]; a hypothetical sweep script is sketched after these commands.
python ppo/ppo_main.py \
--env-name spaceship-v0 --default-ind 0 --seed 0
python ppo/ppo_main.py \
--env-name myswimmer-v0 --default-ind 0 --seed 0
python ppo/ppo_main.py \
--env-name myant-v0 --default-ind 0 --seed 0
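The following hypothetical helper (not part of the repo) performs the full sweep for one environment; swap spaceship-v0 for myswimmer-v0 or myant-v0 as needed.

```python
import itertools
import subprocess

# Run ppo/ppo_main.py for every (seed, default-ind) pair required above.
for seed, ind in itertools.product(range(5), range(20)):
    subprocess.run(
        ["python", "ppo/ppo_main.py",
         "--env-name", "spaceship-v0",
         "--default-ind", str(ind),
         "--seed", str(seed)],
        check=True,
    )
```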
Next, train a dynamics embedding for each environment:

python train_dynamics_embedding.py \
--env-name spaceship-v0 \
--dynamics-embedding-dim 8 --dynamics-batch-size 8 \
--inf-num-steps 1 --num-dec-traj 10 \
--save-dir-dynamics-embedding ./models/dynamics-embeddings
python train_dynamics_embedding.py \
--env-name myswimmer-v0 \
--dynamics-embedding-dim 2 --dynamics-batch-size 32 \
--inf-num-steps 1 --num-dec-traj 10 \
--save-dir-dynamics-embedding ./models/dynamics-embeddings
python train_dynamics_embedding.py \
--env-name myant-v0 \
--dynamics-embedding-dim 8 --dynamics-batch-size 32 \
--inf-num-steps 2 --num-dec-traj 10 \
--save-dir-dynamics-embedding ./models/dynamics-embeddings
Then, train a policy embedding for each environment:

python train_policy_embedding.py \
--env-name spaceship-v0 --num-dec-traj 1 \
--save-dir-policy-embedding ./models/policy-embeddings
python train_policy_embedding.py \
--env-name myswimmer-v0 --num-dec-traj 1 \
--save-dir-policy-embedding ./models/policy-embeddings
python train_policy_embedding.py \
--env-name myant-v0 --num-dec-traj 1 \
--save-dir-policy-embedding ./models/policy-embeddings
Train the PD-VF using the learned policy and dynamics embeddings:

python train_pdvf.py \
--env-name spaceship-v0 \
--dynamics-batch-size 8 --policy-batch-size 2048 \
--dynamics-embedding-dim 8 --policy-embedding-dim 8 \
--num-dec-traj 10 --inf-num-steps 1 --log-interval 10 \
--save-dir-dynamics-embedding ./models/dynamics-embeddings \
--save-dir-policy-embedding ./models/policy-embeddings \
--save-dir-pdvf ./models/pdvf-models
python train_pdvf.py \
--env-name myswimmer-v0 \
--dynamics-batch-size 8 --policy-batch-size 2048 \
--dynamics-embedding-dim 2 --policy-embedding-dim 8 \
--num-dec-traj 10 --inf-num-steps 1 --log-interval 10 \
--norm-reward --min-reward -60 --max-reward 200 \
--save-dir-dynamics-embedding ./models/dynamics-embeddings \
--save-dir-policy-embedding ./models/policy-embeddings \
--save-dir-pdvf ./models/pdvf-models
python train_pdvf.py \
--env-name myant-v0 \
--dynamics-batch-size 32 --policy-batch-size 2048 \
--dynamics-embedding-dim 8 --policy-embedding-dim 8 \
--num-dec-traj 10 --inf-num-steps 2 --log-interval 10 \
--norm-reward --min-reward -400 --max-reward 1000 \
--save-dir-dynamics-embedding ./models/dynamics-embeddings \
--save-dir-policy-embedding ./models/policy-embeddings \
--save-dir-pdvf ./models/pdvf-models
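For myswimmer-v0 and myant-v0, the --norm-reward, --min-reward, and --max-reward flags scale rewards into a fixed range. A plausible reading of these flags (assumed here, not verified against the training code) is simple min-max normalization:

```python
def normalize_reward(r, min_reward, max_reward):
    """Hypothetical min-max scaling implied by --norm-reward:
    maps a raw reward into [0, 1] given per-environment bounds."""
    return (r - min_reward) / (max_reward - min_reward)

# e.g. with --min-reward -60 --max-reward 200 (myswimmer-v0):
print(normalize_reward(70.0, -60.0, 200.0))  # -> 0.5
```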
Finally, evaluate the PD-VF on each environment:

python eval_pdvf.py \
--env-name spaceship-v0 --stage 20 \
--dynamics-batch-size 8 --policy-batch-size 2048 \
--dynamics-embedding-dim 8 --policy-embedding-dim 8 \
--num-dec-traj 10 --inf-num-steps 1 --log-interval 10 \
--save-dir-dynamics-embedding ./models/dynamics-embeddings \
--save-dir-policy-embedding ./models/policy-embeddings \
--save-dir-pdvf ./models/pdvf-models
python eval_pdvf.py \
--env-name myswimmer-v0 --stage 20 \
--dynamics-batch-size 8 --policy-batch-size 2048 \
--dynamics-embedding-dim 2 --policy-embedding-dim 8 \
--num-dec-traj 10 --inf-num-steps 1 --log-interval 10 \
--norm-reward --min-reward -60 --max-reward 200 \
--save-dir-dynamics-embedding ./models/dynamics-embeddings \
--save-dir-policy-embedding ./models/policy-embeddings \
--save-dir-pdvf ./models/pdvf-models
python eval_pdvf.py \
--env-name myant-v0 --stage 20 \
--dynamics-batch-size 32 --policy-batch-size 2048 \
--dynamics-embedding-dim 8 --policy-embedding-dim 8 \
--num-dec-traj 10 --inf-num-steps 2 --log-interval 10 \
--norm-reward --min-reward -400 --max-reward 1000 \
--save-dir-dynamics-embedding ./models/dynamics-embeddings \
--save-dir-policy-embedding ./models/policy-embeddings \
--save-dir-pdvf ./models/pdvf-models
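The four stages above must run in order (dynamics embedding, policy embedding, PD-VF training, evaluation). The hypothetical driver below (illustrative only, not part of the repo) chains them for spaceship-v0 with the flags listed in this README, assuming the PPO policies have already been trained for every seed and default-ind.

```python
import subprocess

def run(script, *flags):
    # Invoke one stage of the pipeline as a subprocess; abort on failure.
    subprocess.run(["python", script, "--env-name", "spaceship-v0", *flags],
                   check=True)

run("train_dynamics_embedding.py",
    "--dynamics-embedding-dim", "8", "--dynamics-batch-size", "8",
    "--inf-num-steps", "1", "--num-dec-traj", "10",
    "--save-dir-dynamics-embedding", "./models/dynamics-embeddings")

run("train_policy_embedding.py",
    "--num-dec-traj", "1",
    "--save-dir-policy-embedding", "./models/policy-embeddings")

# train_pdvf.py and eval_pdvf.py share most flags; eval adds --stage 20.
for script, extra in [("train_pdvf.py", []), ("eval_pdvf.py", ["--stage", "20"])]:
    run(script, *extra,
        "--dynamics-batch-size", "8", "--policy-batch-size", "2048",
        "--dynamics-embedding-dim", "8", "--policy-embedding-dim", "8",
        "--num-dec-traj", "10", "--inf-num-steps", "1", "--log-interval", "10",
        "--save-dir-dynamics-embedding", "./models/dynamics-embeddings",
        "--save-dir-policy-embedding", "./models/policy-embeddings",
        "--save-dir-pdvf", "./models/pdvf-models")
```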