Commit

* bias correction & 1st order trpo
jingweiz committed Aug 20, 2017
1 parent 4e6abab commit ebc040f
Showing 6 changed files with 115 additions and 42 deletions.
7 changes: 6 additions & 1 deletion core/agent.py
@@ -81,14 +81,19 @@ def __init__(self, args, env_prototype, model_prototype, memory_prototype=None):
self.tau = args.tau
self.beta = args.beta
elif args.agent_type == "acer":
self.enable_bias_correction = args.enable_bias_correction
self.enable_1st_order_trpo = args.enable_1st_order_trpo

self.enable_lstm = args.enable_lstm
self.enable_continuous = args.enable_continuous
self.num_processes = args.num_processes

self.replay_ratio = args.replay_ratio
self.learn_start = args.learn_start
self.replay_start = args.replay_start
self.batch_size = args.batch_size
self.valid_size = args.valid_size
self.clip_trace = args.clip_trace
self.clip_1st_order_trpo = args.clip_1st_order_trpo

self.rollout_steps = args.rollout_steps
self.tau = args.tau
5 changes: 1 addition & 4 deletions core/agents/a3c_single_process.py
@@ -84,9 +84,6 @@ def __init__(self, master, process_id=0):
master.logger.warning("<===================================> A3C-Learner #" + str(process_id) + " {Env & Model}")
super(A3CLearner, self).__init__(master, process_id)

# learning algorithm # TODO: adjust learning to each process maybe ???
self.optimizer = self.master.optim(self.model.parameters(), lr = self.master.lr, weight_decay=self.master.weight_decay)

self._reset_rollout()

self.training = True # choose actions by multinomial sampling
@@ -251,7 +248,7 @@ def run(self):
while self.master.train_step.value < self.master.steps:
# sync in every step
self._sync_local_with_global()
self.optimizer.zero_grad()
self.model.zero_grad()

# start of a new episode
if should_start_new:
123 changes: 92 additions & 31 deletions core/agents/acer_single_process.py
@@ -6,7 +6,7 @@
import time
import math
import torch
from torch.autograd import Variable
from torch.autograd import Variable, grad, backward
import torch.nn.functional as F

from utils.helpers import ACER_Experience
@@ -111,9 +111,6 @@ def __init__(self, master, process_id=0):
self.memory = self.master.memory_prototype(capacity = self.master.memory_params.memory_size // self.master.num_processes,
max_episode_length = self.master.early_stop)

# learning algorithm # TODO: adjust learning to each process maybe ???
self.optimizer = self.master.optim(self.model.parameters(), lr = self.master.lr)

self._reset_rollout()

self.training = True # choose actions by multinomial sampling
@@ -148,38 +145,102 @@ def _reset_rollout(self): # for storing the experiences collected through
policy_vb = [],
q0_vb = [],
value0_vb = [],
avg_policy_vb = [])
detached_avg_policy_vb = [],
detached_old_policy_vb = [])

def _get_QretT_vb(self, on_policy=True):
if self.rollout.terminal1[-1]: # for terminal sT: Q_ret = 0
if on_policy:
QretT_vb = Variable(torch.zeros(1, 1))
else:
# TODO: check here again: for off-policy should be batch_size
QretT_vb = Variable(torch.zeros(self.master.batch_size, 1))
else: # for non-terminal s_T: Q_ret = V(s_T; /theta)
sT_vb = self._preprocessState(self.rollout.state1[-1], True) # bootstrap from last state
if self.master.enable_continuous:
pass
else:
# NOTE: use the output v (instead of q) as Q_ret here
if self.master.enable_lstm:
if on_policy:
_, _, QretT_vb, _ = self.model(sT_vb, self.on_policy_lstm_hidden_vb) # NOTE: only doing inference here
else:
_, _, QretT_vb, _ = self.model(sT_vb, self.off_policy_lstm_hidden_vb) # NOTE: only doing inference here
else:
_, _, QretT_vb = self.model(sT_vb) # NOTE: only doing inference here
QretT_vb = Variable(QretT_vb.data)

return QretT_vb
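
A minimal sketch of the bootstrap above, using plain tensors instead of Variables; the batch size and the value of V(s_T) are illustrative:

import torch

terminal = False
batch_size = 1                                    # 1 on-policy, self.master.batch_size off-policy
v_sT = torch.tensor([[0.37]])                     # V(s_T; theta) from a final forward pass
# terminal s_T: Q_ret = 0; non-terminal s_T: Q_ret = V(s_T; theta), detached so no gradient flows through the target
q_ret_T = torch.zeros(batch_size, 1) if terminal else v_sT.detach()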

def _backward(self):
def _1st_order_trpo(self, detached_policy_loss_vb, policy_vb, detached_policy_vb, detached_avg_policy_vb):
# KL divergence: k = \delta_{\phi_\theta} DKL[ \pi(.|\phi_{\theta_avg}) || \pi(.|\phi_\theta) ]
kl_div_vb = F.kl_div(detached_policy_vb.log(), detached_avg_policy_vb, size_average=False)
k_vb = grad(outputs=kl_div_vb, inputs=detached_policy_vb, retain_graph=False, only_inputs=True)[0]
g_vb = grad(outputs=detached_policy_loss_vb, inputs=detached_policy_vb, retain_graph=False, only_inputs=True)[0]

kg_dot_vb = torch.mm(k_vb, torch.t(g_vb))
kk_dot_vb = torch.mm(k_vb, torch.t(k_vb))

z_star_vb = g_vb - ((kg_dot_vb - self.master.clip_1st_order_trpo) / kk_dot_vb).clamp(min=0) * k_vb
backward(variables=policy_vb, grad_variables=z_star_vb, retain_graph=True)
print("_1st_order_trpo done ===================")

def _backward(self, on_policy=True):
# preparation
rollout_steps = len(self.rollout.reward)
policy_vb = self.rollout.policy_vb
if self.master.enable_continuous:
pass
else:
action_batch_vb = Variable(torch.from_numpy(np.array(self.rollout.action)).long())
detached_policy_vb = [Variable(self.rollout.policy_vb[i].data, requires_grad=True) for i in range(rollout_steps)] # [rollout_steps x batch_size x action_dim]
action_batch_vb = Variable(torch.from_numpy(np.array(self.rollout.action)).view(rollout_steps, -1, 1).long()) # [rollout_steps x batch_size x 1]
if self.master.use_cuda:
action_batch_vb = action_batch_vb.cuda()
detached_policy_log_vb = [torch.log(detached_policy_vb[i]) for i in range(rollout_steps)]
# detached_entropy_vb = [- (detached_policy_log_vb[i] * detached_policy_vb[i]).sum(1) for i in range(rollout_steps)] # TODO: check if should keepdim
detached_policy_log_vb = [detached_policy_log_vb[i].gather(1, action_batch_vb[i]) for i in range(rollout_steps) ]
QretT_vb = self._get_QretT_vb(on_policy)

# compute loss
policy_loss_vb = Variable(torch.zeros(1, 1))
value_loss_vb = Variable(torch.zeros(1, 1))
for i in reversed(range(rollout_steps)):
pass
# 1. importance sampling weights: /rho = /pi(.|s_i) / /mu(.|s_i)
if on_policy: # 1 for on-policy
rho_vb = Variable(torch.ones(1, self.master.action_dim))
rho_vb[0,0] = 50#0.5#Variable(torch.ones(1, self.master.action_dim))
else:
pass

# # Break graph for last values calculated (used for targets, not directly as model outputs)
# if done:
# # Qret = 0 for terminal s
# Qret = Variable(torch.zeros(1, 1))
#
# if not args.on_policy:
# # Save terminal state for offline training
# memory.append(state, None, None, None)
# else:
# # Qret = V(s_i; theta) for non-terminal s
# _, _, Qret, _ = model(Variable(state), (hx, cx))
# Qret = Qret.detach()
#
# # Train the network on-policy
# _train(args, T, model, shared_model, shared_average_model, optimiser, policies, Qs, Vs, actions, rewards, Qret, average_policies)
# Q_ret = r_i + /gamma * Q_ret
QretT_vb = self.master.gamma * QretT_vb + self.rollout.reward[i]
# A = Q_ret - V(s_i; /theta)
advantage_vb = QretT_vb - self.rollout.value0_vb[i]
# g = min(c, /rho_a_i) * /delta_theta * log(/pi(a_i|s_i; /theta)) * A
detached_policy_loss_vb = - (rho_vb.gather(1, action_batch_vb[i]).clamp(max=self.master.clip_trace) * detached_policy_log_vb[i] * advantage_vb.detach()).mean(0)

if self.master.enable_bias_correction:# and not on_policy: # NOTE: have to perform bias correction when off-policy
# g = g + /sum_a [1 - c / /rho_a]_+ /pi(a|s_i; /theta) * /delta_theta * log(/pi(a|s_i; /theta)) * (Q(s_i, a; /theta) - V(s_i; /theta))
bias_correction_coefficient_vb = (1 - self.master.clip_trace / rho_vb).clamp(min=0) * detached_policy_vb[i]
detached_policy_loss_vb -= (bias_correction_coefficient_vb * detached_policy_vb[i].log() * (self.rollout.q0_vb[i].detach() - self.rollout.value0_vb[i].detach())).sum(1, keepdim=True).mean(0)

if self.master.enable_1st_order_trpo:
# policy update: d_/theta = d_/theta + /partial/phi_/theta / /partial/theta * z*
policy_loss_vb += self._1st_order_trpo(detached_policy_loss_vb, self.rollout.policy_vb[i], detached_policy_vb[i], self.rollout.detached_avg_policy_vb[i])

single_step_policy_loss = -(rho.gather(1, actions[i]).clamp(max=args.trace_max) * log_prob * A.detach()).mean(0) # Average over batch
# Off-policy bias correction
if off_policy:
# g = g + /sum_a [1 - c / /rho_a]_+ /pi(a|s_i; /theta) * /delta_theta * log(/pi(a|s_i; /theta)) * (Q(s_i, a; theta) - V(s_i; theta))
bias_weight = (1 - args.trace_max / rho).clamp(min=0) * policies[i]
single_step_policy_loss -= (bias_weight * policies[i].log() * (Qs[i].detach() - Vs[i].expand_as(Qs[i]).detach())).sum(1).mean(0)
if args.trust_region:
# Policy update d_/theta = d_/theta + /partial/phi_/theta / /partial/theta * z*
policy_loss += _trust_region_loss(model, policies[i], average_policies[i], single_step_policy_loss, args.trust_region_threshold)
else:
# Policy update d_/theta = d_/theta + /partial/phi_/theta / /partial/theta * g
policy_loss += single_step_policy_loss
# Entropy regularisation d_/theta = d_/theta + /beta * /delta H(/pi(s_i; /theta))
policy_loss -= args.entropy_weight * -(policies[i].log() * policies[i]).sum(1).mean(0) # Sum over probabilities, average over batch

# compute loss
# loss_vb = Variable(torch.zeros(1))
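
As a sanity check on the per-step quantities in the loop above, here is a toy-number sketch with plain tensors (3 actions, batch of 1). The Retrace recursion at the end is included for context and follows the ACER paper, not necessarily this file verbatim:

import torch

pi  = torch.tensor([[0.2, 0.5, 0.3]])     # current policy pi(.|s_i)
mu  = torch.tensor([[0.6, 0.3, 0.1]])     # behaviour policy stored in the replay memory
q   = torch.tensor([[1.0, 2.0, 0.5]])     # Q(s_i, .)
v   = (q * pi).sum(1, keepdim=True)       # V(s_i) = E_pi[Q(s_i, .)]
a   = torch.tensor([[2]])                 # action actually taken
r, gamma, c = 1.0, 0.99, 10.0             # reward, discount, clip_trace
q_ret = torch.tensor([[1.5]])             # Q_ret already computed for s_{i+1}

rho = pi / mu                             # importance weights rho_a = pi(a|s_i) / mu(a|s_i)
q_ret = r + gamma * q_ret                 # Q_ret = r_i + gamma * Q_ret
adv = q_ret - v                           # A = Q_ret - V(s_i)

# truncated importance-sampling term: min(c, rho_a) * log pi(a_i|s_i) * A
loss = -(rho.gather(1, a).clamp(max=c) * pi.gather(1, a).log() * adv)

# bias correction over all actions: [1 - c/rho_a]_+ * pi(a|s_i) * log pi(a|s_i) * (Q - V)
bias_coeff = (1 - c / rho).clamp(min=0) * pi
loss -= (bias_coeff * pi.log() * (q - v)).sum(1, keepdim=True)

# Retrace update handed to the next (earlier) step: Q_ret <- min(1, rho_a) * (Q_ret - Q(s_i, a_i)) + V(s_i)
q_ret = rho.gather(1, a).clamp(max=1.0) * (q_ret - q.gather(1, a)) + v
print(loss, q_ret)
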
@@ -240,7 +301,7 @@ def _on_policy_rollout(self, episode_steps, episode_reward):
self.rollout.policy_vb.append(p_vb)
self.rollout.q0_vb.append(q_vb)
self.rollout.value0_vb.append(v_vb)
self.rollout.avg_policy_vb.append(avg_p_vb)
self.rollout.detached_avg_policy_vb.append(avg_p_vb.detach()) # NOTE
# also push into replay buffer if off-policy learning is enabled
if self.master.replay_ratio > 0:
if self.rollout.terminal1[-1]:
@@ -252,7 +313,7 @@ def _on_policy_rollout(self, episode_steps, episode_reward):
self.memory.append(self.rollout.state0[-1],
self.rollout.action[-1],
self.rollout.reward[-1],
self.rollout.policy_vb[-1].data) # NOTE: no graphs needed
self.rollout.policy_vb[-1].detach()) # NOTE: no graphs needed

episode_steps += 1
episode_reward += self.experience.reward
@@ -290,7 +351,7 @@ def run(self):
# NOTE: on-policy learning # NOTE: procedure same as a3c, outs differ a bit
# sync in every step
self._sync_local_with_global()
self.optimizer.zero_grad()
self.model.zero_grad()

# start of a new episode
if should_start_new:
Expand Down Expand Up @@ -320,23 +381,23 @@ def run(self):
nepisodes_solved += 1

# calculate loss
self._backward() # NOTE: only train_step will increment inside _backward
self._backward(on_policy=True) # NOTE: only train_step will increment inside _backward
self.on_policy_train_step += 1
self.master.on_policy_train_step.value += 1

# NOTE: off-policy learning
# perform some off-policy training once enough experience has been collected
if self.master.replay_ratio > 0 and len(self.memory) >= self.master.learn_start:
if self.master.replay_ratio > 0 and len(self.memory) >= self.master.replay_start:
# sample a number of off-policy episodes based on the replay ratio
for _ in range(sample_poisson(self.master.replay_ratio)):
# sync in every step
self._sync_local_with_global() # TODO: don't know if this is necessary here
self.optimizer.zero_grad()
self.model.zero_grad()

self._reset_off_policy_lstm_hidden_vb()
self._off_policy_rollout() # fake rollout, just to collect net outs from sampled trajectories
# calculate loss
self._backward() # NOTE: only train_step will increment inside _backward
self._backward(on_policy=False) # NOTE: only train_step will increment inside _backward
self.off_policy_train_step += 1
self.master.off_policy_train_step.value += 1
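
The off-policy block above draws the number of replay trajectories per on-policy update from a Poisson distribution with mean replay_ratio, as in the ACER paper. A minimal sketch with a numpy stand-in for the repo's sample_poisson helper, which may be implemented differently:

import numpy as np

replay_ratio = 4                               # illustrative; the default in options.py below is 0 (purely on-policy)
n_off_policy_updates = np.random.poisson(replay_ratio)
for _ in range(n_off_policy_updates):
    pass                                       # one sampled trajectory + _backward(on_policy=False) each, as in run() above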

4 changes: 2 additions & 2 deletions core/models/acer_mlp_dis.py
@@ -48,8 +48,8 @@ def forward(self, x, lstm_hidden_vb=None):
x, c = self.lstm(x, lstm_hidden_vb)
policy = self.actor_3(self.actor_2(x)).clamp(max=1-1e-20)
q = self.critic_2(x)
v = (q * policy).sum(1) # expectation of Q under /pi
v = (q * policy).sum(1, keepdim=True) # expectation of Q under /pi
if self.enable_lstm:
return policy, q, v, (x, c)
else:
return policy, q, v
return policy, q, v
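
A quick illustration of why the keepdim=True change matters: without it the value estimate loses its column dimension and silently broadcasts against Q_ret-shaped tensors later on. The shapes below are illustrative:

import torch

q      = torch.randn(16, 6)                       # Q(s, .) for a batch of 16 and 6 actions
policy = torch.softmax(torch.randn(16, 6), dim=1)

v_flat = (q * policy).sum(1)                      # shape [16]
v_col  = (q * policy).sum(1, keepdim=True)        # shape [16, 1]

q_ret = torch.randn(16, 1)
print((q_ret - v_flat).shape)                     # torch.Size([16, 16])  -- silent broadcast bug
print((q_ret - v_col).shape)                      # torch.Size([16, 1])   -- intended advantage shape
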
2 changes: 1 addition & 1 deletion utils/helpers.py
@@ -29,7 +29,7 @@ def loggerConfig(log_file, verbose=2):
# NOTE: used for on-policy methods to collect experiences over a rollout of an episode
# NOTE: policy_vb & value0_vb for storing output Variables along a rollout # NOTE: they should not be detached from the graph!
A3C_Experience = namedtuple('A3C_Experience', 'state0, action, reward, state1, terminal1, policy_vb, sigmoid_vb, value0_vb')
ACER_Experience = namedtuple('ACER_Experience', 'state0, action, reward, state1, terminal1, policy_vb, q0_vb, value0_vb, avg_policy_vb')
ACER_Experience = namedtuple('ACER_Experience', 'state0, action, reward, state1, terminal1, policy_vb, q0_vb, value0_vb, detached_avg_policy_vb, detached_old_policy_vb')

# NOTE: used for on-policy methods to collect experiences over a rollout of an episode
# NOTE: policy_vb & value0_vb for storing output Variables along a rollout # NOTE: they should not be detached from the graph!
16 changes: 13 additions & 3 deletions utils/options.py
@@ -30,8 +30,8 @@ def __init__(self):
self.verbose = 0 # 0(warning) | 1(info) | 2(debug)

# training signature
self.machine = "daim" # "machine_id"
self.timestamp = "17081300" # "yymmdd##"
self.machine = "aisdaim" # "machine_id"
self.timestamp = "17082000" # "yymmdd##"
# training configuration
self.mode = 1 # 1(train) | 2(test model_file)
self.config = 7
@@ -71,6 +71,9 @@ def __init__(self):
self.use_cuda = False
self.dtype = torch.FloatTensor
elif self.agent_type == "acer":
self.enable_bias_correction = True
self.enable_1st_order_trpo = True

self.enable_lstm = True
if "-con" in self.model_type:
self.enable_continuous = True
@@ -179,6 +182,7 @@ def __init__(self):
self.gamma = 0.99
self.clip_grad = 1.#np.inf
self.lr = 0.0001
self.lr_decay = False
self.weight_decay = 0.
self.eval_freq = 2500 # NOTE: here means every this many steps
self.eval_steps = 1000
@@ -203,6 +207,7 @@ def __init__(self):
self.gamma = 0.99
self.clip_grad = 40.#np.inf
self.lr = 0.00025
self.lr_decay = False
self.weight_decay = 0.
self.eval_freq = 250000#12500 # NOTE: here means every this many steps
self.eval_steps = 125000#2500
@@ -226,6 +231,7 @@ def __init__(self):
self.gamma = 0.99
self.clip_grad = 40.
self.lr = 0.0001
self.lr_decay = False
self.weight_decay = 1e-4 if self.enable_continuous else 0.
self.eval_freq = 60 # NOTE: here means every this many seconds
self.eval_steps = 3000
@@ -241,15 +247,19 @@ def __init__(self):
self.gamma = 0.99
self.clip_grad = 40.
self.lr = 0.0001
self.lr_decay = True
self.weight_decay = 0.
self.eval_freq = 60 # NOTE: here means every this many seconds
self.eval_steps = 3000
self.prog_freq = self.eval_freq
self.test_nepisodes = 10

self.learn_start = 20000 # start off-policy learning after this many steps
self.replay_ratio = 0 # NOTE: 0: purely on-policy; otherwise mix with off-policy
self.replay_start = 20000 # start off-policy learning after this many steps
self.batch_size = 16
self.valid_size = 500
self.clip_trace = 10#np.inf# c in retrace
self.clip_1st_order_trpo = 1

self.rollout_steps = 20 # max look-ahead steps in a single rollout
self.tau = 1.
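
For orientation, a rough mapping from the new options to the ACER paper's symbols; values are copied from the defaults above, and the mapping is my reading of the code, not something documented in the repo:

clip_trace          = 10     # c:     truncation level for the importance weights, min(c, rho)
clip_1st_order_trpo = 1      # delta: radius of the linearised KL trust region in _1st_order_trpo
replay_ratio        = 0      # r:     off-policy updates per on-policy update ~ Poisson(r); 0 = purely on-policy
replay_start        = 20000  # minimum replay-memory size before off-policy learning begins
batch_size          = 16     # trajectories per off-policy update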
