Add files via upload
targetdistributionlearning authored Jun 17, 2019
1 parent db52529 commit 0d25425
Showing 12 changed files with 755 additions and 4 deletions.
5 changes: 1 addition & 4 deletions README.md
@@ -1,4 +1 @@
-# target-distribution-learning
-source code for the paper "Policy Search by Target Distribution Learning for Continuous Control"
-
-The source code will be released upon the acceptance of the paper.
+## Source code for Target Distribution Learning (TDL)
Binary file added TDL/__pycache__/policies.cpython-36.pyc
Binary file not shown.
Binary file added TDL/__pycache__/rollout.cpython-36.pyc
Binary file not shown.
Binary file added TDL/__pycache__/tdl.cpython-36.pyc
Binary file not shown.
Binary file added TDL/__pycache__/transition.cpython-36.pyc
Binary file not shown.
Binary file added TDL/__pycache__/utils.cpython-36.pyc
Binary file not shown.
64 changes: 64 additions & 0 deletions TDL/policies.py
@@ -0,0 +1,64 @@
import torch
import torch.nn as nn

class ActorCritic(nn.Module):
    def __init__(self, dim_obs, dim_act, layer_norm=True, append_time=True, init_std=1.0):
        super(ActorCritic, self).__init__()

        self.append_time = append_time
        self.action_dim = dim_act

        self.actor_fc1 = nn.Linear(dim_obs, 64)
        self.actor_fc2 = nn.Linear(64, 64)
        self.actor_fc3 = nn.Linear(64, dim_act)
        # state-independent action standard deviation, learned as a free parameter
        self.action_std = nn.Parameter(init_std * torch.ones(1, dim_act))

        if self.append_time:
            self.critic_fc1 = nn.Linear(dim_obs + 1, 64)
        else:
            self.critic_fc1 = nn.Linear(dim_obs, 64)
        self.critic_fc2 = nn.Linear(64, 64)
        self.critic_fc3 = nn.Linear(64, 1)

        if layer_norm:
            self.layer_norm(self.actor_fc1, std=1.0)
            self.layer_norm(self.actor_fc2, std=1.0)
            self.layer_norm(self.actor_fc3, std=0.01)

            self.layer_norm(self.critic_fc1, std=1.0)
            self.layer_norm(self.critic_fc2, std=1.0)
            self.layer_norm(self.critic_fc3, std=1.0)

    @staticmethod
    def layer_norm(layer, std=1.0, bias_const=0.0):
        # despite the name, this applies orthogonal weight initialization and a
        # constant bias, not layer normalization
        torch.nn.init.orthogonal_(layer.weight, std)
        torch.nn.init.constant_(layer.bias, bias_const)

    def forward(self, states):
        """
        Run the policy network (actor) and the value network (critic).
        """
        if self.append_time:
            # the critic sees the appended time feature; the actor does not
            action_mean = self._forward_actor(states[:, :-1])
        else:
            action_mean = self._forward_actor(states)
        critic_value = self._forward_critic(states)
        return action_mean, self.action_std, critic_value

    def _forward_actor(self, states):
        x = torch.tanh(self.actor_fc1(states))
        x = torch.tanh(self.actor_fc2(x))
        action_mean = self.actor_fc3(x)
        return action_mean

    def _forward_critic(self, states):
        x = torch.tanh(self.critic_fc1(states))
        x = torch.tanh(self.critic_fc2(x))
        critic_value = self.critic_fc3(x)
        return critic_value

    def select_action(self, action_mean, action_std):
        # reparameterized sampling: action = mean + std * y, with y ~ N(0, I)
        y = torch.normal(torch.zeros(self.action_dim), torch.ones(self.action_dim))
        action = action_mean + y * action_std
        return action, y

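A minimal usage sketch for the ActorCritic module above. The observation and action sizes (11 and 3, matching Hopper-v2) are illustrative assumptions rather than values taken from this commit:

# Minimal usage sketch for ActorCritic; dimensions are illustrative.
import torch
from policies import ActorCritic

dim_obs, dim_act = 11, 3
net = ActorCritic(dim_obs, dim_act, append_time=True, init_std=0.3)

# with append_time=True, inputs carry one extra "remaining time" feature
states = torch.randn(2, dim_obs + 1)
action_mean, action_std, value = net(states)     # shapes [2, 3], [1, 3], [2, 1]

# stochastic action for the first state in the batch
action, y = net.select_action(action_mean[0], action_std[0])
print(action.shape, y.shape)                     # torch.Size([3]) torch.Size([3])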
130 changes: 130 additions & 0 deletions TDL/rollout.py
@@ -0,0 +1,130 @@
from utils import Memory
import torch
from torch import Tensor
from torch.distributions.normal import Normal
from os.path import join as joindir
import numpy as np


class Rollout(object):
    def __init__(self):
        self.memory = None

    def rollout_render(self, env, network, args, running_state, render, video_folder):
        counter = 0
        state = env.reset()
        if args.state_norm:
            state = running_state(state)
        render.render()
        while counter < 5:
            action_mean = network._forward_actor(Tensor(state).unsqueeze(0))
            action = action_mean.data.numpy()[0]
            next_state, _, done, _ = env.step(action)
            if args.state_norm:
                next_state = running_state(next_state)
            render.render()
            if done:
                counter += 1
                state = env.reset()
                if args.state_norm:
                    state = running_state(state)
                render.render()
            else:
                state = next_state
        render.to_mp4(joindir(video_folder, '{}-{}.mp4'.format(args.label, args.seed)))

    def rollout_train(self, env, network, args, running_state, max_episode_steps):
        return self._rollout_with_memory(env, network, args, running_state, max_episode_steps, keep_memory=False)

    def rollout_validate_KL(self, env, network, args, running_state, max_episode_steps):
        return self._rollout_with_memory(env, network, args, running_state, max_episode_steps, keep_memory=True)

    def rollout_validate(self, env, network, args, running_state, max_episode_steps):
        return self._rollout_no_memory(env, network, args, running_state, max_episode_steps)

    def _rollout_with_memory(self, env, network, args, running_state, max_episode_steps, keep_memory=False):
        memory = Memory()
        num_steps = 0
        reward_list = []
        len_list = []
        while num_steps < args.batch_size:
            state = env.reset()
            if args.state_norm:
                state = running_state(state)
            if args.append_time:
                state = np.append(state, 1.0)
            reward_sum = 0
            for t in range(max_episode_steps):
                action_mean, action_std, value = network(Tensor(state).unsqueeze(0))
                action_mean = action_mean[0]
                action_std = action_std[0]
                action, y = network.select_action(action_mean, action_std)
                action_mean = action_mean.data.numpy()
                action = action.data.numpy()
                y = y.data.numpy()
                next_state, reward, done, info = env.step(action)
                reward_sum += reward
                if args.state_norm:
                    next_state = running_state(next_state)
                if args.append_time:
                    # remaining-time feature: 1 at the start of an episode, 0 at the horizon
                    next_state = np.append(next_state, 1 - (t + 1) / max_episode_steps)
                # mask = 0 marks the last transition of an episode (terminal or horizon)
                mask = 0 if (done or ((t + 1) == max_episode_steps)) else 1
                memory.push(state, value, action_mean, action, y, mask, next_state, reward)

                if done:
                    break

                state = next_state

            num_steps += (t + 1)
            reward_list.append(reward_sum)
            len_list.append(t + 1)

        meanepreward = np.mean(reward_list)
        meaneplen = np.mean(len_list)

        if keep_memory:
            self.memory = memory
            self.old_std = network.action_std.data
            return meanepreward, meaneplen
        else:
            return memory, meanepreward, meaneplen, num_steps

    def _rollout_no_memory(self, env, network, args, running_state, max_episode_steps):
        num_steps = 0
        reward_list = []
        len_list = []
        while num_steps < args.batch_size:
            state = env.reset()
            if args.state_norm:
                state = running_state(state)
            reward_sum = 0
            for t in range(max_episode_steps):
                action_mean = network._forward_actor(Tensor(state).unsqueeze(0))
                action = action_mean.data.numpy()[0]
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward
                if args.state_norm:
                    next_state = running_state(next_state)
                if done:
                    break
                state = next_state
            num_steps += (t + 1)
            reward_list.append(reward_sum)
            len_list.append(t + 1)
        meanepreward_val = np.mean(reward_list)
        meaneplen_val = np.mean(len_list)
        return meanepreward_val, meaneplen_val

    def calculate_KL(self, network):
        old_std = self.old_std
        new_std = network.action_std.data

        states, _, old_mu, _, _, _, _, _ = self.memory.tsample()
        # drop the appended time feature before feeding the actor
        new_mu = network._forward_actor(states[:, :-1])

        d1 = Normal(old_mu, old_std)
        d2 = Normal(new_mu, new_std)
        kl = torch.distributions.kl.kl_divergence(d1, d2)
        kls = np.linalg.norm(kl.data.numpy(), axis=1)

        return kls.mean(), kls.max()

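For context, a sketch of how the KL-tracking pair above (rollout_validate_KL followed by calculate_KL) might be wired up. The gym environment, the Args stand-in, and the identity running_state are assumptions for illustration; Memory (imported in rollout.py from the repository's utils module, not shown here) is assumed to behave as the committed code expects:

# Sketch: snapshot trajectories, then measure policy drift as mean/max KL.
import gym
from policies import ActorCritic
from rollout import Rollout

env = gym.make('Hopper-v2')                      # old gym API (4-tuple step), as used above
dim_obs = env.observation_space.shape[0]         # 11
dim_act = env.action_space.shape[0]              # 3

class Args:                                      # minimal stand-in for the parsed config
    state_norm = False                           # skip observation normalization in this sketch
    append_time = True
    batch_size = 2048

network = ActorCritic(dim_obs, dim_act, append_time=True, init_std=0.3)
rollout = Rollout()
running_state = lambda s: s                      # unused because state_norm is False
max_episode_steps = env.spec.max_episode_steps   # 1000 for Hopper-v2

# collect trajectories (and the current action_std) under the current policy
mean_rew, mean_len = rollout.rollout_validate_KL(env, network, Args(), running_state, max_episode_steps)

# after a policy update this reports how far the policy moved; here it is ~0
mean_kl, max_kl = rollout.calculate_KL(network)
print(mean_rew, mean_len, mean_kl, max_kl)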
101 changes: 101 additions & 0 deletions TDL/run.py
@@ -0,0 +1,101 @@
from tdl import runner
from utils import dict_to_object
from itertools import product
import argparse
import pdb

args = {
    'env_name': 'Hopper-v2',
    'seed': 6666,
    'num_episode': 2000,
    'batch_size': 2048,
    'gamma': 0.995,
    'lamda': 0.97,
    'log_num_episode': 1,
    'val_num_episode': 10,
    'num_epoch': 60,
    'minibatch_size': 256,
    'loss_coeff_value': 0.5,
    'loss_coeff_entropy': 0.0,
    'lr': 1e-4,
    'num_parallel_run': 5,
    'use_cuda': True,
    'record_KL': True,
    # tricks
    'layer_norm': True,
    'state_norm': True,
    'lossvalue_norm': True,
    'advantage_norm': False,
    'append_time': True,
    'schedule_adam': 'linear',
    # experiments
    'label': 'myalgo',
    'method': 'direct',
    'init_std': 0.3,       # hyperparameter for all methods
    'step_size': 1.0,      # hyperparameter for ``method=ES, ES-MA1, ES-MA2``
    'schedule_stepsize': 'constant',
    'y2_max': 0.05,        # hyperparameter for ``method=direct``
    'schedule_y2max': 'constant',
    'n_points': 2,         # hyperparameter for ``method=ES-MA1, ES-MA2``
    'mean_ratio': 0.1,     # hyperparameter for ``method=ES-MA1``
    'schedule_meanratio': 'constant',
    'beta': 0.1,           # hyperparameter for ``method=ES-MA2``
}

def test(args, label):
    record_dfs = []
    for i in range(args.num_parallel_run):
        model = runner(args)
        args.seed += 1

def train(term_args):
    myalgo_args = args.copy()
    method = 'direct'
    for init_std, y2_max in product([0.3, 1.0], [0.025, 0.050, 0.100]):
        myalgo_args.update({
            'label': term_args.label,
            'env_name': term_args.env,
            'method': method,
            'init_std': init_std,
            'y2_max': y2_max,
        })
        # convert on the fly so myalgo_args stays a dict and update() keeps working
        test(dict_to_object(myalgo_args), term_args.label)

    myalgo_args = args.copy()
    method = 'ES'
    for init_std, step_size in product([0.3, 1.0], [0.05, 0.10, 0.50, 1.00]):
        myalgo_args.update({
            'label': term_args.label,
            'env_name': term_args.env,
            'method': method,
            'init_std': init_std,
            'step_size': step_size,
        })
        test(dict_to_object(myalgo_args), term_args.label)

    myalgo_args = args.copy()
    method = 'ES-MA1'
    for init_std, step_size, mean_ratio, n_points in product([0.3, 1.0], [0.05, 0.10, 0.50, 1.00], [0.1, 1.0], [2, 5]):
        myalgo_args.update({
            'label': term_args.label,
            'env_name': term_args.env,
            'method': method,
            'init_std': init_std,
            'step_size': step_size,
            'mean_ratio': mean_ratio,
            'n_points': n_points,
        })
        test(dict_to_object(myalgo_args), term_args.label)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='InvertedPendulum-v2')
    parser.add_argument('--label', type=str, default='default')
    term_args = parser.parse_args()
    train(term_args)

if __name__ == '__main__':
    main()
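The full sweep is launched from the command line, e.g. python run.py --env Hopper-v2 --label myrun, which grid-searches the hyperparameters enumerated in train(). A single configuration can also be run directly; this sketch assumes tdl.runner accepts the same dict_to_object config that test() passes it, and the variable names are illustrative:

# Run one configuration, bypassing the grid search in train().
from tdl import runner
from utils import dict_to_object
from run import args                  # reuse the default config dict defined above

single = args.copy()
single.update({'env_name': 'InvertedPendulum-v2', 'method': 'direct',
               'init_std': 0.3, 'y2_max': 0.05, 'label': 'single-run'})
runner(dict_to_object(single))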