policy_loss = self.ppo_policy_loss(
    ModelUtils.list_to_tensor(batch["advantages"]),
    log_probs,
    ModelUtils.list_to_tensor(batch["action_probs"]),
    loss_masks,
)
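# (For reference: ppo_policy_loss presumably implements the standard PPO
# clipped surrogate, L = -mean(min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)),
# where r_t is the ratio of the new to the old action probabilities.)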

# Use the sum of entropy across actions, not the mean
entropy_sum = torch.sum(entropy, dim=1)
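# (Assuming `entropy` has shape [batch, num_action_branches], summing over
# dim=1 yields each sample's joint entropy: entropies of independent action
# branches add, so the sum keeps the bonus scale consistent with branch count.)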
loss = (
    policy_loss
    + 0.5 * value_loss
    - decay_bet * ModelUtils.masked_mean(entropy_sum, loss_masks)
)
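# (A hypothetical sketch of the masked-mean helper, assuming it averages only
# over timesteps whose mask is set, for example:
#     def masked_mean(tensor, masks):
#         return (tensor * masks).sum() / masks.float().sum().clamp(min=1.0)
# so padded or masked-out timesteps do not contribute to the loss.)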

# Set optimizer learning rate
ModelUtils.update_learning_rate(self.optimizer, decay_lr)
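# (Assumption about the helper: it likely performs the standard PyTorch
# param-group update, i.e. `for pg in self.optimizer.param_groups:
# pg["lr"] = decay_lr`, applying the decayed rate before the step.)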
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
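
# Collect loss magnitudes and the decayed hyperparameters for reporting. The
# policy loss is logged through torch.abs() since the clipped surrogate term
# is usually negative, and a positive magnitude is easier to read on a chart.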
update_stats = {
    "Losses/Policy Loss": torch.abs(policy_loss).item(),
    "Losses/Value Loss": value_loss.item(),
    "Policy/Learning Rate": decay_lr,
    "Policy/Epsilon": decay_eps,
    "Policy/Beta": decay_bet,
}
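
# (These stats are presumably returned by the surrounding update() method so
# the trainer can forward them to its stats reporter, e.g. TensorBoard.)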