Fix evaluation action distribution
Summary: Evaluation action distribution isn't computed correctly because it doesn't take possible_actions into account.

Reviewed By: kittipatv

Differential Revision: D15123599

fbshipit-source-id: f0429b9d82de23190ad65e6fa9f7c8e6838456e7
czxttkl authored and facebook-github-bot committed May 1, 2019
1 parent c9b4c94 commit 6e607b1
Showing 3 changed files with 30 additions and 20 deletions.
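In short: before this change, the post-training action distribution came from a plain argmax over optimal_q_values, so actions that were not in possible_actions could still be counted; after it, the argmax is restricted by possible_actions_mask. A minimal standalone sketch of the difference (plain PyTorch with made-up tensors, not the repo's API):

import torch

q_values = torch.tensor([[1.0, 3.0, 2.0],
                         [0.5, 2.5, 4.0]])
possible_actions_mask = torch.tensor([[1.0, 0.0, 1.0],
                                      [1.0, 1.0, 0.0]])

# Old behavior: ignore the mask; row 0 "chooses" action 1 even though it is unavailable.
unmasked_idxs = q_values.argmax(dim=1)                                  # tensor([1, 2])

# Fixed behavior: push unavailable actions to -inf before taking the argmax.
masked_q = q_values.masked_fill(possible_actions_mask == 0, float("-inf"))
masked_idxs = masked_q.argmax(dim=1)                                    # tensor([2, 1])

num_actions = q_values.shape[1]
action_distribution = {
    a: float((masked_idxs == a).sum()) / masked_idxs.shape[0]
    for a in range(num_actions)
}
# {0: 0.0, 1: 0.5, 2: 0.5} -- only actions that were actually available are counted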
7 changes: 7 additions & 0 deletions ml/rl/evaluation/evaluation_data_page.py
@@ -30,6 +30,7 @@ class EvaluationDataPage(NamedTuple):
model_values_for_logged_action: torch.Tensor
possible_actions_mask: torch.Tensor
optimal_q_values: Optional[torch.Tensor] = None
+ eval_action_idxs: Optional[torch.Tensor] = None
logged_values: Optional[torch.Tensor] = None
logged_metrics: Optional[torch.Tensor] = None
logged_metrics_values: Optional[torch.Tensor] = None
@@ -110,6 +111,8 @@ def create_from_tensors(
# as in discrete dqn model
model_values = trainer.q_network(possible_actions_state_concat).q_value
optimal_q_values = model_values
+ eval_action_idxs = None
+
assert (
model_values.shape[0] * model_values.shape[1]
== possible_actions_mask.shape[0] * possible_actions_mask.shape[1]
@@ -168,6 +171,9 @@ def create_from_tensors(
rewards = trainer.boost_rewards(rewards, actions)
model_values = trainer.q_network_cpe(states).q_values[:, 0:num_actions]
optimal_q_values = trainer.get_detached_q_values(states.state)[0]
+ eval_action_idxs = trainer.get_max_q_values(
+     optimal_q_values, possible_actions_mask
+ )[1]
model_propensities = masked_softmax(
optimal_q_values, possible_actions_mask, trainer.rl_temperature
)
@@ -287,6 +293,7 @@ def create_from_tensors(
logged_metrics_values=None,
possible_actions_mask=possible_actions_mask,
optimal_q_values=optimal_q_values,
+ eval_action_idxs=eval_action_idxs,
)

def append(self, edp):
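The new eval_action_idxs field holds, per state, the greedy action index computed under possible_actions_mask, via trainer.get_max_q_values(optimal_q_values, possible_actions_mask)[1]. Below is a rough sketch of what such a masked max-Q helper could look like; it is an assumption about get_max_q_values' semantics, not the trainer's actual implementation:

import torch
from typing import Tuple

def masked_max_q(q_values: torch.Tensor,
                 possible_actions_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    # Restrict the max to allowed actions: disallowed entries can never win.
    masked_q = q_values.masked_fill(possible_actions_mask == 0, float("-inf"))
    max_q, max_idxs = masked_q.max(dim=1, keepdim=True)
    return max_q, max_idxs

q = torch.tensor([[1.0, 3.0, 2.0]])
mask = torch.tensor([[1.0, 0.0, 1.0]])
_, eval_action_idxs = masked_max_q(q, mask)   # tensor([[2]]): action 1 is masked out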
34 changes: 18 additions & 16 deletions ml/rl/evaluation/evaluator.py
@@ -88,22 +88,24 @@ def evaluate_post_training(self, edp: EvaluationDataPage) -> CpeDetails:
metric, metric_reward_edp
)

- if self.action_names is not None and edp.optimal_q_values is not None:
-     value_means = edp.optimal_q_values.mean(dim=0)
-     cpe_details.q_value_means = {
-         action: float(value_means[i])
-         for i, action in enumerate(self.action_names)
-     }
-     value_stds = edp.optimal_q_values.std(dim=0)
-     cpe_details.q_value_stds = {
-         action: float(value_stds[i])
-         for i, action in enumerate(self.action_names)
-     }
-     max_q_idxs = edp.optimal_q_values.argmax(dim=1)
-     cpe_details.action_distribution = {
-         action: float((max_q_idxs == i).sum()) / max_q_idxs.shape[0]
-         for i, action in enumerate(self.action_names)
-     }
+ if self.action_names is not None:
+     if edp.optimal_q_values is not None:
+         value_means = edp.optimal_q_values.mean(dim=0)
+         cpe_details.q_value_means = {
+             action: float(value_means[i])
+             for i, action in enumerate(self.action_names)
+         }
+         value_stds = edp.optimal_q_values.std(dim=0)
+         cpe_details.q_value_stds = {
+             action: float(value_stds[i])
+             for i, action in enumerate(self.action_names)
+         }
+     if edp.eval_action_idxs is not None:
+         cpe_details.action_distribution = {
+             action: float((edp.eval_action_idxs == i).sum())
+             / edp.eval_action_idxs.shape[0]
+             for i, action in enumerate(self.action_names)
+         }

# Compute MC Loss on Aggregate Reward
cpe_details.mc_loss = float(
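With the evaluator change above, q_value_means and q_value_stds are still derived from optimal_q_values, while action_distribution is now built from the mask-aware eval_action_idxs and is skipped when those indices are absent. A small self-contained sketch of the new distribution computation (the names mirror the diff; the action names and data are made up):

import torch

action_names = ["left", "right", "noop"]        # hypothetical action names
eval_action_idxs = torch.tensor([2, 0, 2, 1])   # greedy actions under possible_actions_mask

action_distribution = {
    action: float((eval_action_idxs == i).sum()) / eval_action_idxs.shape[0]
    for i, action in enumerate(action_names)
}
# {'left': 0.25, 'right': 0.25, 'noop': 0.5}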
9 changes: 5 additions & 4 deletions ml/rl/training/dqn_trainer.py
Expand Up @@ -194,6 +194,10 @@ def train(self, training_batch):
)
possible_actions_mask *= action_on_policy

+ model_action_idxs = self.get_max_q_values(
+     self.all_action_scores,
+     possible_actions_mask if self.maxq_learning else learning_input.action,
+ )[1]
self.loss_reporter.report(
td_loss=self.loss,
reward_loss=reward_loss,
@@ -205,10 +209,7 @@
model_rewards=model_rewards,
model_values=self.all_action_scores,
model_values_on_logged_actions=None, # Compute at end of each epoch for CPE
- model_action_idxs=self.get_max_q_values(
-     self.all_action_scores,
-     possible_actions_mask if self.maxq_learning else learning_input.action,
- )[1],
+ model_action_idxs=model_action_idxs,
)

def calculate_cpes(
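The dqn_trainer.py change is mostly a refactor: model_action_idxs is computed once before loss_reporter.report and reused, with the selector depending on the training mode (possible_actions_mask for max-Q learning, the logged one-hot action otherwise). A hedged sketch of that selection, using standalone variables in place of the trainer's attributes and a masked argmax as a stand-in for get_max_q_values:

import torch

all_action_scores = torch.tensor([[0.2, 0.9, 1.1]])      # Q-values from the network
possible_actions_mask = torch.tensor([[1.0, 0.0, 1.0]])  # legal actions for max-Q learning
logged_action = torch.tensor([[0.0, 0.0, 1.0]])          # one-hot logged action for on-policy training
maxq_learning = True

selector = possible_actions_mask if maxq_learning else logged_action
masked_scores = all_action_scores.masked_fill(selector == 0, float("-inf"))
model_action_idxs = masked_scores.argmax(dim=1)          # tensor([2]) with the mask above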
