
[bug-fix] Change entropy computation and loss reporting in Torch to match TF (#4538)

* Proper dimensions for entropy, sum before bonus in PPO

* Make entropy reporting same as TF

* Always use separate critic

* Revert to shared

* Remove unneeded extra line

* Change entropy shape in test

* Change another entropy shape

* Add entropy summing to evaluate_actions

* Add notes about torch.abs(policy_loss)
Branch: /MLA-1734-demo-provider
GitHub · 4 years ago
Current commit e0ef30a5
4 files changed, 18 insertions(+), 8 deletions(-)
  1. ml-agents/mlagents/trainers/policy/torch_policy.py (15 changes)
  2. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (4 changes)
  3. ml-agents/mlagents/trainers/tests/torch/test_policy.py (4 changes)
  4. ml-agents/mlagents/trainers/torch/distributions.py (3 changes)

ml-agents/mlagents/trainers/policy/torch_policy.py (15 changes)

             actions = actions[:, :, 0]
         else:
             actions = actions[:, 0, :]
-        return (actions, all_logs if all_log_probs else log_probs, entropies, memories)
+        # Use the sum of entropy across actions, not the mean
+        entropy_sum = torch.sum(entropies, dim=1)
+        return (
+            actions,
+            all_logs if all_log_probs else log_probs,
+            entropy_sum,
+            memories,
+        )

     def evaluate_actions(
         self,

         )
         action_list = [actions[..., i] for i in range(actions.shape[-1])]
         log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(action_list, dists)
-        return log_probs, entropies, value_heads
+        # Use the sum of entropy across actions, not the mean
+        entropy_sum = torch.sum(entropies, dim=1)
+        return log_probs, entropy_sum, value_heads

     @timed
     def evaluate(
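
The new return value sums the per-action entropies along dim=1, so a policy with multiple action branches reports a single entropy value per agent, as the TensorFlow trainer does. Below is a standalone sketch of the difference; the batch size, branch count, and Categorical setup are illustrative and not taken from the ML-Agents code.

import torch
from torch.distributions import Categorical

# Per-branch entropies for a batch of 64 agents with 3 discrete action branches
# of 5 choices each: shape (batch, num_branches).
logits = torch.randn(64, 3, 5)
entropies = Categorical(logits=logits).entropy()  # torch.Size([64, 3])

entropy_sum = torch.sum(entropies, dim=1)    # (64,): one value per agent, as returned after this change
entropy_mean = torch.mean(entropies, dim=1)  # (64,): the smaller per-branch average a plain mean would give

print(entropies.shape, entropy_sum.shape, entropy_mean.shape)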

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (4 changes)

         self.optimizer.step()
         update_stats = {
-            "Losses/Policy Loss": policy_loss.item(),
+            # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow.
+            # TODO: After PyTorch is default, change to something more correct.
+            "Losses/Policy Loss": torch.abs(policy_loss).item(),
             "Losses/Value Loss": value_loss.item(),
             "Policy/Learning Rate": decay_lr,
             "Policy/Epsilon": decay_eps,

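For context, a minimal sketch of how the summed entropy and the reported stats fit into a PPO update step; the variable names, loss weighting, and beta value below are placeholders rather than the actual ML-Agents optimizer code. Because PPO maximizes the clipped surrogate objective, the minimized policy_loss is typically negative; abs() only affects the logged number, not the gradient.

import torch

batch_size = 64
policy_loss = torch.tensor(-0.01)  # surrogate term being minimized; usually negative
value_loss = torch.tensor(0.25)
entropy = torch.rand(batch_size)   # per-agent entropy, already summed across action branches
beta = 5.0e-3                      # entropy bonus coefficient (placeholder value)

# The entropy bonus uses the batch mean of the per-agent (summed) entropy.
loss = policy_loss + 0.5 * value_loss - beta * torch.mean(entropy)

update_stats = {
    "Losses/Policy Loss": torch.abs(policy_loss).item(),  # abs() to match the TF trainer's stat
    "Losses/Value Loss": value_loss.item(),
}
print(loss.item(), update_stats)
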
ml-agents/mlagents/trainers/tests/torch/test_policy.py (4 changes)

         seq_len=policy.sequence_length,
     )
     assert log_probs.shape == (64, policy.behavior_spec.action_size)
-    assert entropy.shape == (64, policy.behavior_spec.action_size)
+    assert entropy.shape == (64,)
     for val in values.values():
         assert val.shape == (64,)

         )
     else:
         assert log_probs.shape == (64, policy.behavior_spec.action_shape)
-    assert entropies.shape == (64, policy.behavior_spec.action_size)
+    assert entropies.shape == (64,)
     if rnn:
         assert memories.shape == (1, 1, policy.m_size)
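
The updated assertions expect an entropy of shape (64,) regardless of how many action branches the policy has. Below is a self-contained check of that shape contract; the helper is illustrative and not part of the ML-Agents test suite.

import torch


def test_entropy_is_one_value_per_agent():
    # After torch.sum(entropies, dim=1), entropy is a single scalar per agent,
    # independent of the number of action branches.
    batch_size, num_branches = 64, 3
    entropies = torch.rand(batch_size, num_branches)
    entropy_sum = torch.sum(entropies, dim=1)
    assert entropy_sum.shape == (batch_size,)


test_entropy_is_one_value_per_agent()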

ml-agents/mlagents/trainers/torch/distributions.py (3 changes)

         if self.conditional_sigma:
             log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
         else:
-            log_sigma = self.log_sigma
+            # Expand so that entropy matches batch size
+            log_sigma = self.log_sigma.expand(inputs.shape[0], -1)
         if self.tanh_squash:
             return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
         else:
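
The expand matters because the Gaussian entropy used here depends only on sigma: with a state-independent log_sigma parameter of shape (1, act_size), the entropy tensor would have batch size 1 instead of matching the inputs. A simplified sketch follows; the gaussian_entropy helper is illustrative, not the ML-Agents distribution class.

import math

import torch
from torch import nn

act_size, batch_size = 2, 64
log_sigma = nn.Parameter(torch.zeros(1, act_size))  # state-independent, one row


def gaussian_entropy(log_sig: torch.Tensor) -> torch.Tensor:
    # Elementwise entropy of a diagonal Gaussian: 0.5 * log(2 * pi * e * sigma^2)
    return 0.5 * torch.log(2 * math.pi * math.e * torch.exp(log_sig) ** 2)


print(gaussian_entropy(log_sigma).shape)                         # torch.Size([1, 2])
print(gaussian_entropy(log_sigma.expand(batch_size, -1)).shape)  # torch.Size([64, 2])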
