
Commit

call it a day
1. Removed WandB (Weights & Biases) integration
2. Optimized the Unity environment wrappers
StepNeverStop committed Dec 4, 2020
1 parent 9f273ce commit 80707ad
Showing 10 changed files with 489 additions and 471 deletions.
2 changes: 0 additions & 2 deletions README.md
@@ -226,8 +226,6 @@ Options:
specify when to render the graphic interface of gym environment [default: None]
--info=<str> 抒写该训练的描述,用双引号包裹
write another information that describe this training task [default: None]
--use-wandb 是否上传数据到W&B
whether upload training log to WandB [default: False]
--hostname 是否在训练名称后附加上主机名称
whether concatenate hostname with the training name [default: False]
--no-save 指定是否在训练中保存模型、日志及训练数据

1 change: 0 additions & 1 deletion config.yaml
@@ -15,7 +15,6 @@ train:
add_noise2buffer_episode_interval: 10 # episode interval when adding noise into replay buffer while training
add_noise2buffer_steps: 1000 # how many steps should be added into replay buffer
info: None
wandb_project: RLs
# off-policy
off_policy_train_interval: 1 # train policy every interval times


53 changes: 22 additions & 31 deletions rls/common/make_env.py
@@ -8,60 +8,51 @@
logger = get_logger(__name__)


def make_env(env_args: Dict):
def make_env(env_kargs: Dict):
logger.info('Initialize environment begin...')
if env_args['type'] == 'gym':
env = make_gym_env(env_args)
elif env_args['type'] == 'unity':
env = make_unity_env(env_args)
if env_kargs['type'] == 'gym':
env = make_gym_env(env_kargs)
elif env_kargs['type'] == 'unity':
env = make_unity_env(env_kargs)
else:
raise Exception('Unknown environment type.')
logger.info('Initialize environment successful.')
return env


def make_gym_env(env_args: Dict):
def make_gym_env(env_kargs: Dict):
from rls.envs.gym_env import gym_envs

env_kargs = deepcopy(env_args)
env = gym_envs(env_kargs)
copied_env_kargs = deepcopy(env_kargs)
env = gym_envs(copied_env_kargs)
return env


def make_unity_env(env_args: Dict):
from rls.envs.unity_wrapper import (UnityWrapper,
InfoWrapper,
UnityReturnWrapper,
def make_unity_env(env_kargs: Dict):
from rls.envs.unity_wrapper import (BasicUnityEnvironment,
GrayVisualWrapper,
ResizeVisualWrapper,
ScaleVisualWrapper,
ActionWrapper,
BasicActionWrapper,
StackVisualWrapper)

env_kargs = deepcopy(env_args)
env = UnityWrapper(env_kargs)
logger.debug('Unity UnityWrapper success.')
copied_env_kargs = deepcopy(env_kargs)
env = BasicUnityEnvironment(copied_env_kargs)
logger.debug('Unity BasicUnityEnvironment success.')

env = InfoWrapper(env, env_args)
logger.debug('Unity InfoWrapper success.')

if env_kargs['obs_grayscale']:
if copied_env_kargs['obs_grayscale']:
env = GrayVisualWrapper(env)

if env_kargs['obs_resize']:
env = ResizeVisualWrapper(env, resize=env_kargs['resize'])
if copied_env_kargs['obs_resize']:
env = ResizeVisualWrapper(env, resize=copied_env_kargs['resize'])

if env_kargs['obs_scale']:
if copied_env_kargs['obs_scale']:
env = ScaleVisualWrapper(env)

env = UnityReturnWrapper(env)

if env_kargs['obs_stack']:
env = StackVisualWrapper(env, stack_nums=env_args['stack_visual_nums'])

env = ActionWrapper(env)
logger.debug('Unity ActionWrapper success.')
if copied_env_kargs['obs_stack']:
env = StackVisualWrapper(env, stack_nums=env_kargs['stack_visual_nums'])

env.initialize()
env = BasicActionWrapper(env)
logger.debug('Unity BasicActionWrapper success.')

return env
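
For context on the wrapper refactor above: the rewritten make_unity_env composes a small set of optional visual wrappers around BasicUnityEnvironment and always finishes with BasicActionWrapper, while InfoWrapper, UnityReturnWrapper, and the separate env.initialize() call disappear. The sketch below reproduces that composition pattern with hypothetical stand-in classes (DummyUnityEnv and the tiny Wrapper subclasses) so it runs without the rls package or a Unity build; only the config keys visible in the diff are consulted.

```python
# Dependency-free sketch of the wrapper chain built by the new make_unity_env.
# DummyUnityEnv and the Wrapper subclasses below are hypothetical stand-ins for
# BasicUnityEnvironment and the rls visual/action wrappers.
from copy import deepcopy
from typing import Dict


class DummyUnityEnv:
    def reset(self):
        return {'group_0': None}   # the real env returns per-group observations


class Wrapper:
    """Pass-through base class: delegates everything to the wrapped env."""

    def __init__(self, env):
        self.env = env

    def reset(self):
        return self.env.reset()


class Gray(Wrapper): pass      # stands in for GrayVisualWrapper
class Resize(Wrapper): pass    # stands in for ResizeVisualWrapper
class Scale(Wrapper): pass     # stands in for ScaleVisualWrapper
class Stack(Wrapper): pass     # stands in for StackVisualWrapper
class Action(Wrapper): pass    # stands in for BasicActionWrapper


def build_unity_env(env_kargs: Dict):
    kargs = deepcopy(env_kargs)    # defensive copy, like copied_env_kargs
    env = DummyUnityEnv()          # BasicUnityEnvironment(copied_env_kargs) in the real code
    if kargs.get('obs_grayscale'):
        env = Gray(env)
    if kargs.get('obs_resize'):
        env = Resize(env)          # the real wrapper also takes resize=kargs['resize']
    if kargs.get('obs_scale'):
        env = Scale(env)
    if kargs.get('obs_stack'):
        env = Stack(env)           # the real wrapper also takes stack_nums=...
    return Action(env)             # the action wrapper is always the outermost layer


env = build_unity_env({'obs_grayscale': True, 'obs_resize': False,
                       'obs_scale': False, 'obs_stack': True})
print(type(env).__name__)  # Action
```

The design keeps every visual transform opt-in via the config flags, with the action wrapper applied unconditionally as the outermost layer.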
194 changes: 90 additions & 104 deletions rls/common/train/unity.py
@@ -9,7 +9,6 @@

from rls.utils.np_utils import (SMA,
arrprint)
from rls.utils.list_utils import zeros_initializer
from rls.utils.mlagents_utils import (multi_agents_data_preprocess,
multi_agents_action_reshape)
from rls.utils.logging_utils import get_logger
@@ -18,7 +17,7 @@
bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'


def unity_train(env, models,
def unity_train(env, model,
print_func: Callable[[str], None],
begin_train_step: int,
begin_frame_step: int,
@@ -40,7 +39,7 @@ def unity_train(env, models,
Train loop. Execute until episode reaches its maximum or press 'ctrl+c' artificially.
Inputs:
env: Environment for interaction.
models: all models for this training task.
model: the model for this training task.
save_frequency: how often to save checkpoints.
max_step_per_episode: maximum number of steps for an episode.
resampling_interval: how often to resample parameters for env reset.
@@ -53,59 +52,55 @@
rewards: use to record rewards of agents for each group.
"""

state, visual_state, action, dones_flag, rewards = zeros_initializer(env.group_num, 5)
sma = [SMA(moving_average_episode) for i in range(env.group_num)]
sma = SMA(moving_average_episode)
frame_step = begin_frame_step
min_of_all_agents = min(env.group_agents)
train_step = [begin_train_step for _ in range(env.group_num)]
train_step = begin_train_step
n = env.group_agents[env.first_gn]

for episode in range(begin_episode, max_train_episode):
[model.reset() for model in models]
ObsRewDone = zip(*env.reset())
for i, (_v, _vs, _r, _d, _info, _corrected_v, _correcred_vs) in enumerate(ObsRewDone):
dones_flag[i] = np.zeros(env.group_agents[i])
rewards[i] = np.zeros(env.group_agents[i])
state[i] = _corrected_v
visual_state[i] = _correcred_vs
model.reset()
ret = env.reset()[env.first_gn]
s = ret.corrected_vector
visual_s = ret.corrected_visual
dones_flag = np.zeros(n, dtype=float)
rewards = np.zeros(n, dtype=float)
step = 0
last_done_step = -1

while True:
step += 1
for i in range(env.group_num):
action[i] = models[i].choose_action(s=state[i], visual_s=visual_state[i])
actions = {f'{brain_name}': action[i] for i, brain_name in enumerate(env.group_names)}
ObsRewDone = zip(*env.step(actions))

for i, (_v, _vs, _r, _d, _info, _corrected_v, _correcred_vs) in enumerate(ObsRewDone):
models[i].store_data(
s=state[i],
visual_s=visual_state[i],
a=action[i],
r=_r,
s_=_v,
visual_s_=_vs,
done=_info['real_done'] if real_done else _d
)
models[i].partial_reset(_d)
rewards[i] += (1 - dones_flag[i]) * _r
dones_flag[i] = np.sign(dones_flag[i] + _d)
state[i] = _corrected_v
visual_state[i] = _correcred_vs
if policy_mode == 'off-policy':
if train_step[i] % off_policy_train_interval == 0:
models[i].learn(episode=episode, train_step=train_step[i])
train_step[i] += 1
if train_step[i] % save_frequency == 0:
models[i].save_checkpoint(train_step=train_step[i], episode=episode, frame_step=frame_step)

frame_step += min_of_all_agents
if 0 < max_train_step < min(train_step) or 0 < max_frame_step < frame_step:
for i in range(env.group_num):
models[i].save_checkpoint(train_step=train_step[i], episode=episode, frame_step=frame_step)
action = model.choose_action(s=s, visual_s=visual_s)
ret = env.step({env.first_gn: action})[env.first_gn]

model.no_op_store(
s=s,
visual_s=visual_s,
a=action,
r=ret.reward,
s_=ret.vector,
visual_s_=ret.visual,
done=ret.info['real_done'] if real_done else ret.done
)
model.partial_reset(ret.done)
rewards += (1 - dones_flag) * ret.reward
dones_flag = np.sign(dones_flag + ret.done)
s = ret.corrected_vector
visual_s = ret.corrected_visual

if policy_mode == 'off-policy':
if train_step % off_policy_train_interval == 0:
model.learn(episode=episode, train_step=train_step)
train_step += 1
if train_step % save_frequency == 0:
model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)

frame_step += n
if 0 < max_train_step <= train_step or 0 < max_frame_step <= frame_step:
model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
logger.info(f'End Training, learn step: {train_step}, frame_step: {frame_step}')
return

if all([all(dones_flag[i]) for i in range(env.group_num)]):
if all(dones_flag):
if last_done_step == -1:
last_done_step = step
if policy_mode == 'off-policy':
@@ -114,31 +109,29 @@
if step >= max_step_per_episode:
break

for i in range(env.group_num):
sma[i].update(rewards[i])
if policy_mode == 'on-policy':
models[i].learn(episode=episode, train_step=train_step[i])
train_step[i] += 1
if train_step[i] % save_frequency == 0:
models[i].save_checkpoint(train_step=train_step[i], episode=episode, frame_step=frame_step)
models[i].writer_summary(
episode,
reward_mean=rewards[i].mean(),
reward_min=rewards[i].min(),
reward_max=rewards[i].max(),
step=last_done_step,
**sma[i].rs
)
sma.update(rewards)
if policy_mode == 'on-policy':
model.learn(episode=episode, train_step=train_step)
train_step += 1
if train_step % save_frequency == 0:
model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
model.writer_summary(
episode,
reward_mean=rewards.mean(),
reward_min=rewards.min(),
reward_max=rewards.max(),
step=last_done_step,
**sma.rs
)
print_func(f'Eps {episode:3d} | S {step:4d} | LDS {last_done_step:4d}', out_time=True)
for i, gn in enumerate(env.group_names):
print_func(f'{gn} R: {arrprint(rewards[i], 2)}')
print_func(f'{env.first_gn} R: {arrprint(rewards, 2)}')

if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
unity_no_op(env, models, pre_fill_steps=add_noise2buffer_steps, prefill_choose=False, real_done=real_done,
unity_no_op(env, model, pre_fill_steps=add_noise2buffer_steps, prefill_choose=False, real_done=real_done,
desc='adding noise')


def unity_no_op(env, models,
def unity_no_op(env, model,
pre_fill_steps: int,
prefill_choose: bool,
real_done: bool,
@@ -148,58 +141,50 @@ def unity_no_op(env, models,
Make sure steps is greater than n-step if using any n-step ReplayBuffer.
'''
assert isinstance(pre_fill_steps, int) and pre_fill_steps >= 0, 'no_op.steps must have type of int and larger than/equal 0'
min_of_all_agents = min(env.group_agents)
n = env.group_agents[env.first_gn]

if pre_fill_steps == 0:
return
model.reset()
ret = env.reset()[env.first_gn]
s = ret.corrected_vector
visual_s = ret.corrected_visual

state, visual_state, action = zeros_initializer(env.group_num, 3)

[model.reset() for model in models]
ObsRewDone = zip(*env.reset())
for i, (_v, _vs, _r, _d, _info, _corrected_v, _correcred_vs) in enumerate(ObsRewDone):
state[i] = _corrected_v
visual_state[i] = _correcred_vs

for _ in trange(0, pre_fill_steps, min_of_all_agents, unit_scale=min_of_all_agents, ncols=80, desc=desc, bar_format=bar_format):
for _ in trange(0, pre_fill_steps, n, unit_scale=n, ncols=80, desc=desc, bar_format=bar_format):
if prefill_choose:
for i in range(env.group_num):
action[i] = models[i].choose_action(s=state[i], visual_s=visual_state[i])
action = model.choose_action(s=s, visual_s=visual_s)
else:
action = env.random_action()
actions = {f'{brain_name}': action[i] for i, brain_name in enumerate(env.group_names)}
ObsRewDone = zip(*env.step(actions))
for i, (_v, _vs, _r, _d, _info, _corrected_v, _correcred_vs) in enumerate(ObsRewDone):
models[i].no_op_store(
s=state[i],
visual_s=visual_state[i],
a=action[i],
r=_r,
s_=_v,
visual_s_=_vs,
done=_info['real_done'] if real_done else _d
)
models[i].partial_reset(_d)
state[i] = _corrected_v
visual_state[i] = _correcred_vs
action = env.random_action()[env.first_gn]
ret = env.step({env.first_gn: action})[env.first_gn]
model.no_op_store(
s=s,
visual_s=visual_s,
a=action,
r=ret.reward,
s_=ret.vector,
visual_s_=ret.visual,
done=ret.info['real_done'] if real_done else ret.done
)
model.partial_reset(ret.done)
s = ret.corrected_vector
visual_s = ret.corrected_visual


def unity_inference(env, models,
def unity_inference(env, model,
episodes: int) -> NoReturn:
"""
inference mode. algorithm model will not be train, only used to show agents' behavior
"""
action = zeros_initializer(env.group_num, 1)

for episode in range(episodes):
[model.reset() for model in models]
ObsRewDone = zip(*env.reset())
model.reset()
ret = env.reset()[env.first_gn]
while True:
for i, (_v, _vs, _r, _d, _info, _corrected_v, _correcred_vs) in enumerate(ObsRewDone):
action[i] = models[i].choose_action(s=_corrected_v, visual_s=_correcred_vs, evaluation=True)
models[i].partial_reset(_d)
actions = {f'{brain_name}': action[i] for i, brain_name in enumerate(env.group_names)}
ObsRewDone = zip(*env.step(actions))
action = model.choose_action(s=ret.corrected_vector,
visual_s=ret.corrected_visual,
evaluation=True)
model.partial_reset(ret.done)
ret = env.step({env.first_gn: action})[env.first_gn]


def ma_unity_no_op(env, model,
@@ -225,9 +210,10 @@ def ma_unity_no_op(env, model,
if prefill_choose:
action = model.choose_action(s=s, visual_s=visual_s) # [total_agents, batch, dimension]
action = action_reshape_func(action)
actions = {f'{brain_name}': action[i] for i, brain_name in enumerate(env.group_names)}
else:
action = env.random_action()
actions = {f'{brain_name}': action[i] for i, brain_name in enumerate(env.group_names)}
actions = env.random_action()
action = list(actions.values())
s_, visual_s_, r, done, info, corrected_s_, corrected_visual_s_ = env.step(actions)
if real_done:
done = [g['real_done'] for g in info]
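
The main structural change in unity_train and unity_no_op above is that the per-group lists (state[i], action[i], rewards[i], dones_flag[i], train_step[i]) collapse into single values for the first group, env.first_gn, and env.reset()/env.step() now return a dict keyed by group name. The per-agent reward and done bookkeeping is unchanged; the following self-contained numpy sketch (with made-up step data, not code from the commit) shows how it masks out agents that have already finished.

```python
# Self-contained numpy illustration of the per-agent bookkeeping kept in the
# rewritten unity_train loop. The rewards/dones per step are invented data.
import numpy as np

n = 3                                  # env.group_agents[env.first_gn]
rewards = np.zeros(n, dtype=float)     # episodic return per agent
dones_flag = np.zeros(n, dtype=float)  # 1.0 once an agent has seen its first done

# (reward, done) for three environment steps
steps = [
    (np.array([1.0, 1.0, 1.0]), np.array([0.0, 0.0, 0.0])),
    (np.array([1.0, 2.0, 1.0]), np.array([0.0, 1.0, 0.0])),  # agent 1 finishes here
    (np.array([5.0, 5.0, 1.0]), np.array([1.0, 0.0, 1.0])),  # agent 1's later reward is ignored
]

for r, done in steps:
    rewards += (1 - dones_flag) * r          # only still-running agents accumulate reward
    dones_flag = np.sign(dones_flag + done)  # sticky flag: stays 1 after the first done

print(rewards)          # [7. 3. 3.]
print(all(dones_flag))  # True -> the loop would record last_done_step
```

Once all(dones_flag) is true the episode's last_done_step is recorded, and the summary written at the end of the episode (reward_mean/min/max plus the SMA statistics) is computed from this masked rewards array.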