
Merge branch 'master' into global-variables
anupambhatnagar committed Aug 12, 2020
2 parents 0e716c5 + cb44d9b commit 75e14ba
Showing 5 changed files with 28 additions and 48 deletions.
ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (2 additions, 1 deletion)

@@ -140,8 +140,9 @@ def create_reward_signals(
                 self.reward_signals[reward_signal.value].update_dict
             )
 
+    @classmethod
     def create_optimizer_op(
-        self, learning_rate: tf.Tensor, name: str = "Adam"
+        cls, learning_rate: tf.Tensor, name: str = "Adam"
     ) -> tf.train.Optimizer:
         return tf.train.AdamOptimizer(learning_rate=learning_rate, name=name)
 
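Making create_optimizer_op a classmethod that depends only on its arguments means a subclass can swap in a different tf.train.Optimizer without touching the graph-building code that calls the factory. A minimal sketch, not part of this commit: the subclass name is hypothetical, and it assumes ml-agents' usual `from mlagents.tf_utils import tf` alias plus the PPOOptimizer touched later in this diff.

```python
# Hypothetical subclass, illustrative only; it relies solely on the classmethod
# signature introduced above.
from mlagents.tf_utils import tf  # assumed TF1-compatible alias used across ml-agents
from mlagents.trainers.ppo.optimizer import PPOOptimizer


class RMSPropPPOOptimizer(PPOOptimizer):
    @classmethod
    def create_optimizer_op(
        cls, learning_rate: tf.Tensor, name: str = "RMSProp"
    ) -> tf.train.Optimizer:
        # The calling code only needs the tf.train.Optimizer interface
        # (compute_gradients / minimize), so any concrete optimizer works here.
        return tf.train.RMSPropOptimizer(learning_rate=learning_rate, name=name)
```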
ml-agents/mlagents/trainers/ppo/optimizer.py (4 additions, 4 deletions)

@@ -40,7 +40,7 @@ def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
 
         self.stream_names = list(self.reward_signals.keys())
 
-        self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None
+        self.tf_optimizer_op: Optional[tf.train.Optimizer] = None
         self.grads = None
         self.update_batch: Optional[tf.Operation] = None
 
@@ -291,9 +291,9 @@ def _create_losses(
         )
 
     def _create_ppo_optimizer_ops(self):
-        self.tf_optimizer = self.create_optimizer_op(self.learning_rate)
-        self.grads = self.tf_optimizer.compute_gradients(self.loss)
-        self.update_batch = self.tf_optimizer.minimize(self.loss)
+        self.tf_optimizer_op = self.create_optimizer_op(self.learning_rate)
+        self.grads = self.tf_optimizer_op.compute_gradients(self.loss)
+        self.update_batch = self.tf_optimizer_op.minimize(self.loss)
 
     @timed
     def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
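The attribute is renamed and its annotation widened to Optional[tf.train.Optimizer], since _create_ppo_optimizer_ops only needs the base-class interface: compute_gradients returns a list of (gradient, variable) pairs and minimize returns the op that applies one update step. A standalone illustration of those two calls; the variable, loss, and session below are placeholders, not ml-agents attributes.

```python
# Illustration of the TF1 tf.train.Optimizer calls used in _create_ppo_optimizer_ops.
from mlagents.tf_utils import tf  # assumed TF1-compatible alias, as elsewhere in ml-agents

x = tf.Variable(1.0, name="x")
loss = tf.square(x - 3.0)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)

grads = optimizer.compute_gradients(loss)  # list of (gradient_tensor, variable) pairs
update_batch = optimizer.minimize(loss)    # op that applies a single update step

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(update_batch)                 # one optimization step on x
```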
ml-agents/mlagents/trainers/ppo/trainer.py (4 additions, 3 deletions)

@@ -208,6 +208,9 @@ def create_policy(
 
         return policy
 
+    def create_ppo_optimizer(self) -> PPOOptimizer:
+        return PPOOptimizer(cast(TFPolicy, self.policy), self.trainer_settings)
+
     def add_policy(
         self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
     ) -> None:

@@ -225,9 +228,7 @@ def add_policy(
             )
         self.policy = policy
         self.policies[parsed_behavior_id.behavior_id] = policy
-        self.optimizer = PPOOptimizer(
-            cast(TFPolicy, self.policy), self.trainer_settings
-        )
+        self.optimizer = self.create_ppo_optimizer()
         for _reward_signal in self.optimizer.reward_signals.keys():
             self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
         # Needed to resume loads properly
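With the construction moved into create_ppo_optimizer, add_policy no longer hard-codes how the optimizer is built, so a subclass can override just that factory. A hypothetical sketch; the subclass is illustrative only, and it assumes the trainer class defined in ppo/trainer.py is named PPOTrainer.

```python
# Hypothetical trainer subclass, not part of ml-agents.
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.ppo.trainer import PPOTrainer  # assumed class name


class CustomPPOTrainer(PPOTrainer):
    def create_ppo_optimizer(self) -> PPOOptimizer:
        # The base class builds the PPOOptimizer exactly as in the diff above.
        optimizer = super().create_ppo_optimizer()
        # Hook point: wrap, log, or tweak the optimizer before the trainer uses it.
        return optimizer
```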
ml-agents/mlagents/trainers/sac/trainer.py (4 additions, 3 deletions)

@@ -312,6 +312,9 @@ def _update_reward_signals(self) -> None:
         for stat, stat_list in batch_update_stats.items():
             self._stats_reporter.add_stat(stat, np.mean(stat_list))
 
+    def create_sac_optimizer(self) -> SACOptimizer:
+        return SACOptimizer(cast(TFPolicy, self.policy), self.trainer_settings)
+
     def add_policy(
         self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
     ) -> None:

@@ -327,9 +330,7 @@ def add_policy(
             )
         self.policy = policy
         self.policies[parsed_behavior_id.behavior_id] = policy
-        self.optimizer = SACOptimizer(
-            cast(TFPolicy, self.policy), self.trainer_settings
-        )
+        self.optimizer = self.create_sac_optimizer()
         for _reward_signal in self.optimizer.reward_signals.keys():
             self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
         # Needed to resume loads properly
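The same factory extraction is applied to the SAC trainer. One practical benefit is that tests can stub the factory and exercise add_policy without building a TensorFlow graph; a hypothetical pytest-style sketch, not from the ml-agents test suite, where the sac_trainer, parsed_behavior_id, and policy fixtures are assumed and not shown.

```python
# Hypothetical test sketch; assumes a pre-built SACTrainer fixture plus matching
# behavior-id and policy fixtures.
from unittest import mock

from mlagents.trainers.sac.trainer import SACTrainer


def test_add_policy_uses_optimizer_factory(sac_trainer, parsed_behavior_id, policy):
    fake_optimizer = mock.Mock(reward_signals={})  # .keys() works on the empty dict
    with mock.patch.object(
        SACTrainer, "create_sac_optimizer", return_value=fake_optimizer
    ):
        sac_trainer.add_policy(parsed_behavior_id, policy)
    # add_policy should store whatever the factory returned.
    assert sac_trainer.optimizer is fake_optimizer
```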
ml-agents/mlagents/trainers/stats.py (14 additions, 37 deletions)

@@ -96,49 +96,26 @@ def write_stats(
             if stats_summary.mean > 0.0:
                 is_training = "Training."
 
+        elapsed_time = time.time() - self.training_start_time
+        log_info: List[str] = [category]
+        log_info.append(f"Step: {step}")
+        log_info.append(f"Time Elapsed: {elapsed_time:0.3f} s")
         if "Environment/Cumulative Reward" in values:
             stats_summary = values["Environment/Cumulative Reward"]
             if self.rank is not None:
-                logger.info(
-                    "Rank: {}."
-                    "{}: Step: {}. "
-                    "Time Elapsed: {:0.3f} s "
-                    "Mean "
-                    "Reward: {:0.3f}"
-                    ". Std of Reward: {:0.3f}. {}".format(
-                        self.rank(),
-                        category,
-                        step,
-                        time.time() - self.training_start_time,
-                        stats_summary.mean,
-                        stats_summary.std,
-                        is_training,
-                    )
-                )
-            else:
-                logger.info(
-                    "{}: Step: {}. "
-                    "Time Elapsed: {:0.3f} s "
-                    "Mean "
-                    "Reward: {:0.3f}"
-                    ". Std of Reward: {:0.3f}. {}".format(
-                        category,
-                        step,
-                        time.time() - self.training_start_time,
-                        stats_summary.mean,
-                        stats_summary.std,
-                        is_training,
-                    )
-                )
+                log_info.append(f"Rank: {self.rank}")
+
+            log_info.append(f"Mean Reward: {stats_summary.mean:0.3f}")
+            log_info.append(f"Std of Reward: {stats_summary.std:0.3f}")
+            log_info.append(is_training)
+
             if self.self_play and "Self-play/ELO" in values:
                 elo_stats = values["Self-play/ELO"]
-                logger.info(f"{category} ELO: {elo_stats.mean:0.3f}. ")
+                log_info.append(f"ELO: {elo_stats.mean:0.3f}")
         else:
-            logger.info(
-                "{}: Step: {}. No episode was completed since last summary. {}".format(
-                    category, step, is_training
-                )
-            )
+            log_info.append("No episode was completed since last summary")
+            log_info.append(is_training)
+        logger.info(". ".join(log_info))
 
     def add_property(
         self, category: str, property_type: StatsPropertyType, value: Any
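The refactor replaces the three separate logger.info format strings with one message assembled in log_info and emitted once at the end of write_stats. For example, with made-up values, the joined output looks like this:

```python
# Illustration of the new single-line console format (values are made up).
log_info = [
    "MyBehavior",
    "Step: 5000",
    "Time Elapsed: 12.345 s",
    "Rank: 0",               # only appended when self.rank is not None
    "Mean Reward: 1.234",
    "Std of Reward: 0.456",
    "Training.",
]
print(". ".join(log_info))
# MyBehavior. Step: 5000. Time Elapsed: 12.345 s. Rank: 0. Mean Reward: 1.234. Std of Reward: 0.456. Training.
```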
