Give entropy a proper batch dimension, and sum it across actions before applying the entropy bonus in PPO

4 年前 · e8431a6d
--- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
+++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
            ModelUtils.list_to_tensor(batch["action_probs"]),
            loss_masks,
        )
+        # Use the sum of entropy across actions, not the mean
+        entropy_sum = torch.sum(entropy, dim=1)
-            - decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
+            - decay_bet * ModelUtils.masked_mean(entropy_sum, loss_masks)
        )

        # Set optimizer learning rate
--- a/ml-agents/mlagents/trainers/torch/distributions.py
+++ b/ml-agents/mlagents/trainers/torch/distributions.py
        if self.conditional_sigma:
            log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
        else:
-            log_sigma = self.log_sigma
+            # Expand so that entropy matches batch size
+            log_sigma = self.log_sigma.expand(inputs.shape[0], -1)
        if self.tanh_squash:
            return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
        else: