Add files via upload
targetdistributionlearning authored Jun 17, 2019
1 parent db52529 commit 0d25425
Showing 12 changed files with 755 additions and 4 deletions.
5 changes: 1 addition & 4 deletions README.md
@@ -1,4 +1 @@
-# target-distribution-learning
-source code for the paper "Policy Search by Target Distribution Learning for Continuous Control"
-
-The source code will be released upon the acceptance of the paper.
+## Source code for Target Distribution Learning (TDL)
Binary file added TDL/__pycache__/policies.cpython-36.pyc
Binary file not shown.
Binary file added TDL/__pycache__/rollout.cpython-36.pyc
Binary file not shown.
Binary file added TDL/__pycache__/tdl.cpython-36.pyc
Binary file not shown.
Binary file added TDL/__pycache__/transition.cpython-36.pyc
Binary file not shown.
Binary file added TDL/__pycache__/utils.cpython-36.pyc
Binary file not shown.
64 changes: 64 additions & 0 deletions TDL/policies.py
@@ -0,0 +1,64 @@
import torch
import torch.nn as nn

class ActorCritic(nn.Module):
    def __init__(self, dim_obs, dim_act, layer_norm=True, append_time=True, init_std=1.0):
        super(ActorCritic, self).__init__()

        self.append_time = append_time
        self.action_dim = dim_act

        self.actor_fc1 = nn.Linear(dim_obs, 64)
        self.actor_fc2 = nn.Linear(64, 64)
        self.actor_fc3 = nn.Linear(64, dim_act)
        # state-independent action standard deviation, learned as a free parameter
        self.action_std = nn.Parameter(init_std * torch.ones(1, dim_act))

        if self.append_time:
            self.critic_fc1 = nn.Linear(dim_obs + 1, 64)
        else:
            self.critic_fc1 = nn.Linear(dim_obs, 64)
        self.critic_fc2 = nn.Linear(64, 64)
        self.critic_fc3 = nn.Linear(64, 1)

        if layer_norm:
            self.layer_norm(self.actor_fc1, std=1.0)
            self.layer_norm(self.actor_fc2, std=1.0)
            self.layer_norm(self.actor_fc3, std=0.01)

            self.layer_norm(self.critic_fc1, std=1.0)
            self.layer_norm(self.critic_fc2, std=1.0)
            self.layer_norm(self.critic_fc3, std=1.0)

    @staticmethod
    def layer_norm(layer, std=1.0, bias_const=0.0):
        # despite the name, this applies orthogonal weight initialization and a
        # constant bias, not layer normalization
        torch.nn.init.orthogonal_(layer.weight, std)
        torch.nn.init.constant_(layer.bias, bias_const)

    def forward(self, states):
        """
        Run the policy network (actor) and the value network (critic).
        """
        if self.append_time:
            # the critic sees the appended time feature; the actor does not
            action_mean = self._forward_actor(states[:, :-1])
        else:
            action_mean = self._forward_actor(states)
        critic_value = self._forward_critic(states)
        return action_mean, self.action_std, critic_value

    def _forward_actor(self, states):
        x = torch.tanh(self.actor_fc1(states))
        x = torch.tanh(self.actor_fc2(x))
        action_mean = self.actor_fc3(x)
        return action_mean

    def _forward_critic(self, states):
        x = torch.tanh(self.critic_fc1(states))
        x = torch.tanh(self.critic_fc2(x))
        critic_value = self.critic_fc3(x)
        return critic_value

    def select_action(self, action_mean, action_std):
        # reparameterized sampling: action = mean + std * y, with y ~ N(0, I)
        y = torch.normal(torch.zeros(self.action_dim), torch.ones(self.action_dim))
        action = action_mean + y * action_std
        return action, y

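A minimal usage sketch for the ActorCritic module above. The observation and action sizes (11 and 3, matching Hopper-v2) are illustrative assumptions rather than values taken from this commit:

# Minimal usage sketch for ActorCritic; dimensions are illustrative.
import torch
from policies import ActorCritic

dim_obs, dim_act = 11, 3
net = ActorCritic(dim_obs, dim_act, append_time=True, init_std=0.3)

# with append_time=True, inputs carry one extra "remaining time" feature
states = torch.randn(2, dim_obs + 1)
action_mean, action_std, value = net(states)     # shapes [2, 3], [1, 3], [2, 1]

# stochastic action for the first state in the batch
action, y = net.select_action(action_mean[0], action_std[0])
print(action.shape, y.shape)                     # torch.Size([3]) torch.Size([3])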
130 changes: 130 additions & 0 deletions TDL/rollout.py
@@ -0,0 +1,130 @@
from utils import Memory
import torch
from torch import Tensor
from torch.distributions.normal import Normal
from os.path import join as joindir
import numpy as np


class Rollout(object):
    def __init__(self):
        self.memory = None

    def rollout_render(self, env, network, args, running_state, render, video_folder):
        counter = 0
        state = env.reset()
        if args.state_norm:
            state = running_state(state)
        render.render()
        while counter < 5:
            action_mean = network._forward_actor(Tensor(state).unsqueeze(0))
            action = action_mean.data.numpy()[0]
            next_state, _, done, _ = env.step(action)
            if args.state_norm:
                next_state = running_state(next_state)
            render.render()
            if done:
                counter += 1
                state = env.reset()
                if args.state_norm:
                    state = running_state(state)
                render.render()
            else:
                state = next_state
        render.to_mp4(joindir(video_folder, '{}-{}.mp4'.format(args.label, args.seed)))

    def rollout_train(self, env, network, args, running_state, max_episode_steps):
        return self._rollout_with_memory(env, network, args, running_state, max_episode_steps, keep_memory=False)

    def rollout_validate_KL(self, env, network, args, running_state, max_episode_steps):
        return self._rollout_with_memory(env, network, args, running_state, max_episode_steps, keep_memory=True)

    def rollout_validate(self, env, network, args, running_state, max_episode_steps):
        return self._rollout_no_memory(env, network, args, running_state, max_episode_steps)

    def _rollout_with_memory(self, env, network, args, running_state, max_episode_steps, keep_memory=False):
        memory = Memory()
        num_steps = 0
        reward_list = []
        len_list = []
        while num_steps < args.batch_size:
            state = env.reset()
            if args.state_norm:
                state = running_state(state)
            if args.append_time:
                state = np.append(state, 1.0)
            reward_sum = 0
            for t in range(max_episode_steps):
                action_mean, action_std, value = network(Tensor(state).unsqueeze(0))
                action_mean = action_mean[0]
                action_std = action_std[0]
                action, y = network.select_action(action_mean, action_std)
                action_mean = action_mean.data.numpy()
                action = action.data.numpy()
                y = y.data.numpy()
                next_state, reward, done, info = env.step(action)
                reward_sum += reward
                if args.state_norm:
                    next_state = running_state(next_state)
                if args.append_time:
                    # remaining-time feature: 1 at the start of an episode, 0 at the horizon
                    next_state = np.append(next_state, 1 - (t + 1) / max_episode_steps)
                # mask = 0 marks the last transition of an episode (terminal or horizon)
                mask = 0 if (done or ((t + 1) == max_episode_steps)) else 1
                memory.push(state, value, action_mean, action, y, mask, next_state, reward)

                if done:
                    break

                state = next_state

            num_steps += (t + 1)
            reward_list.append(reward_sum)
            len_list.append(t + 1)

        meanepreward = np.mean(reward_list)
        meaneplen = np.mean(len_list)

        if keep_memory:
            self.memory = memory
            self.old_std = network.action_std.data
            return meanepreward, meaneplen
        else:
            return memory, meanepreward, meaneplen, num_steps

    def _rollout_no_memory(self, env, network, args, running_state, max_episode_steps):
        num_steps = 0
        reward_list = []
        len_list = []
        while num_steps < args.batch_size:
            state = env.reset()
            if args.state_norm:
                state = running_state(state)
            reward_sum = 0
            for t in range(max_episode_steps):
                action_mean = network._forward_actor(Tensor(state).unsqueeze(0))
                action = action_mean.data.numpy()[0]
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward
                if args.state_norm:
                    next_state = running_state(next_state)
                if done:
                    break
                state = next_state
            num_steps += (t + 1)
            reward_list.append(reward_sum)
            len_list.append(t + 1)
        meanepreward_val = np.mean(reward_list)
        meaneplen_val = np.mean(len_list)
        return meanepreward_val, meaneplen_val

    def calculate_KL(self, network):
        old_std = self.old_std
        new_std = network.action_std.data

        states, _, old_mu, _, _, _, _, _ = self.memory.tsample()
        # drop the appended time feature before feeding the actor
        new_mu = network._forward_actor(states[:, :-1])

        d1 = Normal(old_mu, old_std)
        d2 = Normal(new_mu, new_std)
        kl = torch.distributions.kl.kl_divergence(d1, d2)
        kls = np.linalg.norm(kl.data.numpy(), axis=1)

        return kls.mean(), kls.max()

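For context, a sketch of how the KL-tracking pair above (rollout_validate_KL followed by calculate_KL) might be wired up. The gym environment, the Args stand-in, and the identity running_state are assumptions for illustration; Memory (imported in rollout.py from the repository's utils module, not shown here) is assumed to behave as the committed code expects:

# Sketch: snapshot trajectories, then measure policy drift as mean/max KL.
import gym
from policies import ActorCritic
from rollout import Rollout

env = gym.make('Hopper-v2')                      # old gym API (4-tuple step), as used above
dim_obs = env.observation_space.shape[0]         # 11
dim_act = env.action_space.shape[0]              # 3

class Args:                                      # minimal stand-in for the parsed config
    state_norm = False                           # skip observation normalization in this sketch
    append_time = True
    batch_size = 2048

network = ActorCritic(dim_obs, dim_act, append_time=True, init_std=0.3)
rollout = Rollout()
running_state = lambda s: s                      # unused because state_norm is False
max_episode_steps = env.spec.max_episode_steps   # 1000 for Hopper-v2

# collect trajectories (and the current action_std) under the current policy
mean_rew, mean_len = rollout.rollout_validate_KL(env, network, Args(), running_state, max_episode_steps)

# after a policy update this reports how far the policy moved; here it is ~0
mean_kl, max_kl = rollout.calculate_KL(network)
print(mean_rew, mean_len, mean_kl, max_kl)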
101 changes: 101 additions & 0 deletions TDL/run.py
@@ -0,0 +1,101 @@
from tdl import runner
from utils import dict_to_object
from itertools import product
import argparse
import pdb

args = {
    'env_name': 'Hopper-v2',
    'seed': 6666,
    'num_episode': 2000,
    'batch_size': 2048,
    'gamma': 0.995,
    'lamda': 0.97,
    'log_num_episode': 1,
    'val_num_episode': 10,
    'num_epoch': 60,
    'minibatch_size': 256,
    'loss_coeff_value': 0.5,
    'loss_coeff_entropy': 0.0,
    'lr': 1e-4,
    'num_parallel_run': 5,
    'use_cuda': True,
    'record_KL': True,
    # tricks
    'layer_norm': True,
    'state_norm': True,
    'lossvalue_norm': True,
    'advantage_norm': False,
    'append_time': True,
    'schedule_adam': 'linear',
    # experiments
    'label': 'myalgo',
    'method': 'direct',
    'init_std': 0.3,       # hyperparameter for all methods
    'step_size': 1.0,      # hyperparameter for ``method=ES, ES-MA1, ES-MA2``
    'schedule_stepsize': 'constant',
    'y2_max': 0.05,        # hyperparameter for ``method=direct``
    'schedule_y2max': 'constant',
    'n_points': 2,         # hyperparameter for ``method=ES-MA1, ES-MA2``
    'mean_ratio': 0.1,     # hyperparameter for ``method=ES-MA1``
    'schedule_meanratio': 'constant',
    'beta': 0.1,           # hyperparameter for ``method=ES-MA2``
}

def test(args, label):
    record_dfs = []
    for i in range(args.num_parallel_run):
        model = runner(args)
        args.seed += 1

def train(term_args):
    myalgo_args = args.copy()
    method = 'direct'
    for init_std, y2_max in product([0.3, 1.0], [0.025, 0.050, 0.100]):
        myalgo_args.update({
            'label': term_args.label,
            'env_name': term_args.env,
            'method': method,
            'init_std': init_std,
            'y2_max': y2_max,
        })
        # convert on the fly so myalgo_args stays a dict and update() keeps working
        test(dict_to_object(myalgo_args), term_args.label)

    myalgo_args = args.copy()
    method = 'ES'
    for init_std, step_size in product([0.3, 1.0], [0.05, 0.10, 0.50, 1.00]):
        myalgo_args.update({
            'label': term_args.label,
            'env_name': term_args.env,
            'method': method,
            'init_std': init_std,
            'step_size': step_size,
        })
        test(dict_to_object(myalgo_args), term_args.label)

    myalgo_args = args.copy()
    method = 'ES-MA1'
    for init_std, step_size, mean_ratio, n_points in product([0.3, 1.0], [0.05, 0.10, 0.50, 1.00], [0.1, 1.0], [2, 5]):
        myalgo_args.update({
            'label': term_args.label,
            'env_name': term_args.env,
            'method': method,
            'init_std': init_std,
            'step_size': step_size,
            'mean_ratio': mean_ratio,
            'n_points': n_points,
        })
        test(dict_to_object(myalgo_args), term_args.label)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='InvertedPendulum-v2')
    parser.add_argument('--label', type=str, default='default')
    term_args = parser.parse_args()
    train(term_args)

if __name__ == '__main__':
    main()
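The full sweep is launched from the command line, e.g. python run.py --env Hopper-v2 --label myrun, which grid-searches the hyperparameters enumerated in train(). A single configuration can also be run directly; this sketch assumes tdl.runner accepts the same dict_to_object config that test() passes it, and the variable names are illustrative:

# Run one configuration, bypassing the grid search in train().
from tdl import runner
from utils import dict_to_object
from run import args                  # reuse the default config dict defined above

single = args.copy()
single.update({'env_name': 'InvertedPendulum-v2', 'method': 'direct',
               'init_std': 0.3, 'y2_max': 0.05, 'label': 'single-run'})
runner(dict_to_object(single))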