Commit 0d25425 (1 parent: db52529)
Showing 12 changed files with 755 additions and 4 deletions.
@@ -1,4 +1 @@
-# target-distribution-learning
-source code for the paper "Policy Search by Target Distribution Learning for Continuous Control"
-
-The source code will be released upon the acceptance of the paper.
+## Source code for Target Distribution Learning (TDL)
5 binary files not shown.
@@ -0,0 +1,64 @@
import torch
import torch.nn as nn


class ActorCritic(nn.Module):
    def __init__(self, dim_obs, dim_act, layer_norm=True, append_time=True, init_std=1.0):
        super(ActorCritic, self).__init__()

        # When append_time is True, the last entry of each state vector is the normalized
        # remaining episode time; the actor ignores it, while the critic consumes it.
        self.append_time = append_time
        self.action_dim = dim_act

        self.actor_fc1 = nn.Linear(dim_obs, 64)
        self.actor_fc2 = nn.Linear(64, 64)
        self.actor_fc3 = nn.Linear(64, dim_act)
        self.action_std = nn.Parameter(init_std * torch.ones(1, dim_act))

        if self.append_time:
            self.critic_fc1 = nn.Linear(dim_obs + 1, 64)
        else:
            self.critic_fc1 = nn.Linear(dim_obs, 64)
        self.critic_fc2 = nn.Linear(64, 64)
        self.critic_fc3 = nn.Linear(64, 1)

        if layer_norm:
            self.layer_norm(self.actor_fc1, std=1.0)
            self.layer_norm(self.actor_fc2, std=1.0)
            self.layer_norm(self.actor_fc3, std=0.01)

            self.layer_norm(self.critic_fc1, std=1.0)
            self.layer_norm(self.critic_fc2, std=1.0)
            self.layer_norm(self.critic_fc3, std=1.0)

    @staticmethod
    def layer_norm(layer, std=1.0, bias_const=0.0):
        # Despite its name, this applies orthogonal weight initialization with gain ``std``
        # and a constant bias; it does not add a LayerNorm module.
        torch.nn.init.orthogonal_(layer.weight, std)
        torch.nn.init.constant_(layer.bias, bias_const)

    def forward(self, states):
        """
        Run the policy network (actor) as well as the value network (critic).
        """
        if self.append_time:
            action_mean = self._forward_actor(states[:, :-1])
        else:
            action_mean = self._forward_actor(states)
        critic_value = self._forward_critic(states)
        return action_mean, self.action_std, critic_value

    def _forward_actor(self, states):
        x = torch.tanh(self.actor_fc1(states))
        x = torch.tanh(self.actor_fc2(x))
        action_mean = self.actor_fc3(x)
        return action_mean

    def _forward_critic(self, states):
        x = torch.tanh(self.critic_fc1(states))
        x = torch.tanh(self.critic_fc2(x))
        critic_value = self.critic_fc3(x)
        return critic_value

    def select_action(self, action_mean, action_std):
        # Reparameterized Gaussian sample: action = mean + y * std with y ~ N(0, I);
        # the noise vector y is returned alongside the action.
        y = torch.normal(torch.zeros(self.action_dim), torch.ones(self.action_dim))
        action = action_mean + y * action_std
        return action, y
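
For orientation, a minimal usage sketch of the class above (not part of the commit; the observation and action dimensions are made up, and ``append_time`` is left at its default of True):

import torch

net = ActorCritic(dim_obs=11, dim_act=3, init_std=0.3)   # hypothetical dimensions
states = torch.randn(4, 12)                               # 11 observation dims + 1 appended time feature
mean, std, value = net(states)                             # mean: (4, 3), std: (1, 3), value: (4, 1)
action, y = net.select_action(mean[0], std[0])             # one stochastic action and its noise vector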
@@ -0,0 +1,130 @@
from utils import Memory
import torch
from torch import Tensor
from torch.distributions.normal import Normal
from os.path import join as joindir
import numpy as np


class Rollout(object):
    def __init__(self):
        self.memory = None

    def rollout_render(self, env, network, args, running_state, render, video_folder):
        # Run the deterministic policy (action = mean) for 5 episodes and save a video.
        counter = 0
        state = env.reset()
        if args.state_norm:
            state = running_state(state)
        render.render()
        while counter < 5:
            action_mean = network._forward_actor(Tensor(state).unsqueeze(0))
            action = action_mean.data.numpy()[0]
            next_state, _, done, _ = env.step(action)
            if args.state_norm:
                next_state = running_state(next_state)
            render.render()
            if done:
                counter += 1
                state = env.reset()
                if args.state_norm:
                    state = running_state(state)
                render.render()
            else:
                state = next_state
        render.to_mp4(joindir(video_folder, '{}-{}.mp4'.format(args.label, args.seed)))

    def rollout_train(self, env, network, args, running_state, max_episode_steps):
        return self._rollout_with_memory(env, network, args, running_state, max_episode_steps, keep_memory=False)

    def rollout_validate_KL(self, env, network, args, running_state, max_episode_steps):
        return self._rollout_with_memory(env, network, args, running_state, max_episode_steps, keep_memory=True)

    def rollout_validate(self, env, network, args, running_state, max_episode_steps):
        return self._rollout_no_memory(env, network, args, running_state, max_episode_steps)

    def _rollout_with_memory(self, env, network, args, running_state, max_episode_steps, keep_memory=False):
        # Collect at least args.batch_size steps with the stochastic policy and
        # store every transition in a Memory buffer for the policy update.
        memory = Memory()
        num_steps = 0
        reward_list = []
        len_list = []
        while num_steps < args.batch_size:
            state = env.reset()
            if args.state_norm:
                state = running_state(state)
            if args.append_time:
                state = np.append(state, 1.0)
            reward_sum = 0
            for t in range(max_episode_steps):
                action_mean, action_std, value = network(Tensor(state).unsqueeze(0))
                action_mean = action_mean[0]
                action_std = action_std[0]
                action, y = network.select_action(action_mean, action_std)
                action_mean = action_mean.data.numpy()
                action = action.data.numpy()
                y = y.data.numpy()
                next_state, reward, done, info = env.step(action)
                reward_sum += reward
                if args.state_norm:
                    next_state = running_state(next_state)
                if args.append_time:
                    # Remaining-time feature: 1.0 at reset, decreasing linearly to 0 at the horizon.
                    next_state = np.append(next_state, 1 - (t + 1) / max_episode_steps)
                mask = 0 if (done or ((t + 1) == max_episode_steps)) else 1
                memory.push(state, value, action_mean, action, y, mask, next_state, reward)

                if done:
                    break

                state = next_state

            num_steps += (t + 1)
            reward_list.append(reward_sum)
            len_list.append(t + 1)

        meanepreward = np.mean(reward_list)
        meaneplen = np.mean(len_list)

        if keep_memory:
            self.memory = memory
            self.old_std = network.action_std.data
            return meanepreward, meaneplen
        else:
            return memory, meanepreward, meaneplen, num_steps

    def _rollout_no_memory(self, env, network, args, running_state, max_episode_steps):
        # Evaluate the deterministic policy (action = mean) without storing transitions.
        num_steps = 0
        reward_list = []
        len_list = []
        while num_steps < args.batch_size:
            state = env.reset()
            if args.state_norm:
                state = running_state(state)
            reward_sum = 0
            for t in range(max_episode_steps):
                action_mean = network._forward_actor(Tensor(state).unsqueeze(0))
                action = action_mean.data.numpy()[0]
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward
                if args.state_norm:
                    next_state = running_state(next_state)
                if done:
                    break
                state = next_state
            num_steps += (t + 1)
            reward_list.append(reward_sum)
            len_list.append(t + 1)
        meanepreward_val = np.mean(reward_list)
        meaneplen_val = np.mean(len_list)
        return meanepreward_val, meaneplen_val

    def calculate_KL(self, network):
        # Per-state KL divergence between the policy that filled self.memory and the
        # current policy; the per-dimension KLs are reduced with an L2 norm over actions.
        old_std = self.old_std
        new_std = network.action_std.data

        # Stored states include the appended time feature (this path assumes append_time),
        # which the actor does not take as input.
        states, _, old_mu, _, _, _, _, _ = self.memory.tsample()
        new_mu = network._forward_actor(states[:, :-1])

        d1 = Normal(old_mu, old_std)
        d2 = Normal(new_mu, new_std)
        kl = torch.distributions.kl.kl_divergence(d1, d2)
        kls = np.linalg.norm(kl.data.numpy(), axis=1)

        return kls.mean(), kls.max()
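
As a reference for calculate_KL above, the per-dimension Gaussian KL that torch.distributions evaluates has a simple closed form; a tiny self-contained check with made-up numbers (illustrative only, not part of the commit):

import torch
from torch.distributions.normal import Normal
from torch.distributions.kl import kl_divergence

# KL( N(mu1, s1) || N(mu2, s2) ) = log(s2 / s1) + (s1^2 + (mu1 - mu2)^2) / (2 * s2^2) - 1/2,
# computed per action dimension; calculate_KL then takes the L2 norm over dimensions.
mu1, s1 = torch.tensor([0.0, 1.0]), torch.tensor([0.3, 0.3])
mu2, s2 = torch.tensor([0.1, 0.9]), torch.tensor([0.4, 0.2])

kl_lib = kl_divergence(Normal(mu1, s1), Normal(mu2, s2))
kl_manual = torch.log(s2 / s1) + (s1 ** 2 + (mu1 - mu2) ** 2) / (2 * s2 ** 2) - 0.5
print(torch.allclose(kl_lib, kl_manual))  # True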
@@ -0,0 +1,101 @@
from tdl import runner
from utils import dict_to_object
from itertools import product
import argparse
import pdb


args = {
    'env_name': 'Hopper-v2',
    'seed': 6666,
    'num_episode': 2000,
    'batch_size': 2048,
    'gamma': 0.995,
    'lamda': 0.97,
    'log_num_episode': 1,
    'val_num_episode': 10,
    'num_epoch': 60,
    'minibatch_size': 256,
    'loss_coeff_value': 0.5,
    'loss_coeff_entropy': 0.0,
    'lr': 1e-4,
    'num_parallel_run': 5,
    'use_cuda': True,
    'record_KL': True,
    # tricks
    'layer_norm': True,
    'state_norm': True,
    'lossvalue_norm': True,
    'advantage_norm': False,
    'append_time': True,
    'schedule_adam': 'linear',
    # experiments
    'label': 'myalgo',
    'method': 'direct',
    'init_std': 0.3,  # hyperparameter for all methods
    'step_size': 1.0,  # hyperparameter for ``method=ES, ES-MA1, ES-MA2``
    'schedule_stepsize': 'constant',
    'y2_max': 0.05,  # hyperparameter for ``method=direct``
    'schedule_y2max': 'constant',
    'n_points': 2,  # hyperparameter for ``method=ES-MA1, ES-MA2``
    'mean_ratio': 0.1,  # hyperparameter for ``method=ES-MA1``
    'schedule_meanratio': 'constant',
    'beta': 0.1,  # hyperparameter for ``method=ES-MA2``
}


def test(args, label):
    # Repeat the same configuration num_parallel_run times with consecutive seeds.
    record_dfs = []  # placeholder for collected results (not used below)
    for i in range(args.num_parallel_run):
        model = runner(args)
        args.seed += 1


def train(term_args):
    # Grid search for ``method=direct`` over the initial std and the y^2 cap.
    method = 'direct'
    for init_std, y2_max in product([0.3, 1.0], [0.025, 0.050, 0.100]):
        myalgo_args = args.copy()  # fresh copy of the base config for each run
        myalgo_args.update({
            'label': term_args.label,
            'env_name': term_args.env,
            'method': method,
            'init_std': init_std,
            'y2_max': y2_max,
        })
        myalgo_args = dict_to_object(myalgo_args)
        test(myalgo_args, term_args.label)

    # Grid search for ``method=ES`` over the initial std and the step size.
    method = 'ES'
    for init_std, step_size in product([0.3, 1.0], [0.05, 0.10, 0.50, 1.00]):
        myalgo_args = args.copy()
        myalgo_args.update({
            'label': term_args.label,
            'env_name': term_args.env,
            'method': method,
            'init_std': init_std,
            'step_size': step_size,
        })
        myalgo_args = dict_to_object(myalgo_args)
        test(myalgo_args, term_args.label)

    # Grid search for ``method=ES-MA1``; also sweeps the mean ratio and the number of points.
    method = 'ES-MA1'
    for init_std, step_size, mean_ratio, n_points in product([0.3, 1.0], [0.05, 0.10, 0.50, 1.00], [0.1, 1.0], [2, 5]):
        myalgo_args = args.copy()
        myalgo_args.update({
            'label': term_args.label,
            'env_name': term_args.env,
            'method': method,
            'init_std': init_std,
            'step_size': step_size,
            'mean_ratio': mean_ratio,
            'n_points': n_points,
        })
        myalgo_args = dict_to_object(myalgo_args)
        test(myalgo_args, term_args.label)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='InvertedPendulum-v2')
    parser.add_argument('--label', type=str, default='default')
    term_args = parser.parse_args()
    train(term_args)


if __name__ == '__main__':
    main()
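
Assuming this script is saved as, say, experiments.py (the file name is not visible in this view), a sweep for one environment would be launched along these lines:

python experiments.py --env Hopper-v2 --label tdl-hopper    # file name and label are hypothetical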