diff --git a/reinforcement_learning/actor_critic.py b/reinforcement_learning/actor_critic.py
index cebec8a92a..c415db9e6b 100644
--- a/reinforcement_learning/actor_critic.py
+++ b/reinforcement_learning/actor_critic.py
@@ -69,7 +69,7 @@ def finish_episode():
         R = r + args.gamma * R
         rewards.insert(0, R)
     rewards = torch.Tensor(rewards)
-    rewards = (rewards - rewards.mean()) / rewards.std()
+    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
     for (action, value), r in zip(saved_actions, rewards):
         action.reinforce(r - value.data.squeeze())
         value_loss += F.smooth_l1_loss(value, Variable(torch.Tensor([r])))
diff --git a/reinforcement_learning/reinforce.py b/reinforcement_learning/reinforce.py
index e7e04aa580..5dffa0f880 100644
--- a/reinforcement_learning/reinforce.py
+++ b/reinforcement_learning/reinforce.py
@@ -65,7 +65,7 @@ def finish_episode():
         R = r + args.gamma * R
         rewards.insert(0, R)
     rewards = torch.Tensor(rewards)
-    rewards = (rewards - rewards.mean()) / rewards.std()
+    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
     for action, r in zip(model.saved_actions, rewards):
         action.reinforce(r)
     optimizer.zero_grad()
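
The one-line change in both scripts guards the return normalization against a zero standard deviation. A minimal standalone sketch of the failure mode and the fix (assuming only that `torch` and `numpy` are available; not taken from the patched files themselves):

```python
import numpy as np
import torch

# Degenerate episode: every discounted return is identical, so std() == 0.
rewards = torch.Tensor([1.0, 1.0, 1.0])

bare = (rewards - rewards.mean()) / rewards.std()
print(bare)      # tensor([nan, nan, nan]) -- 0/0 poisons every downstream update

# The patched denominator stays strictly positive; eps is the float32
# machine epsilon (~1.19e-07).
eps = np.finfo(np.float32).eps
guarded = (rewards - rewards.mean()) / (rewards.std() + eps)
print(guarded)   # tensor([0., 0., 0.])
```

Because the mean is subtracted first, the numerator is already zero whenever the returns have no spread, so the guarded division degrades to a harmless all-zero advantage instead of filling `rewards` with NaNs.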