
Use real clipping (as in TF)

/develop/torch-clip-scale
Ervin Teng, 4 years ago
Current commit
0548057d
3 files changed, 6 insertions(+), 5 deletions(-)
  1. ml-agents/mlagents/trainers/policy/torch_policy.py (4 changes)
  2. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (2 changes)
  3. ml-agents/mlagents/trainers/torch/distributions.py (5 changes)

ml-agents/mlagents/trainers/policy/torch_policy.py (4 changes)


      action, log_probs, entropy, memories = self.sample_actions(
          vec_obs, vis_obs, masks=masks, memories=memories
      )
-     run_out["action"] = ModelUtils.to_numpy(action)
+     clipped_action = torch.clamp(action, -3, 3) / 3
+     run_out["action"] = ModelUtils.to_numpy(clipped_action)
+     # Todo - make pre_action difference
      run_out["log_probs"] = ModelUtils.to_numpy(log_probs)
      run_out["entropy"] = ModelUtils.to_numpy(entropy)

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (2 changes)


  vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
  act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
  if self.policy.use_continuous_act:
-     actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
+     actions = ModelUtils.list_to_tensor(batch["actions_pre"]).unsqueeze(-1)
  else:
      actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)
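
Switching the continuous-action branch to "actions_pre" means the PPO update evaluates log-probabilities at the raw samples rather than at the clipped values sent to the environment. A sketch of why that matters, using torch.distributions.Normal for illustration (this is not the ml-agents code path):

import torch
from torch.distributions import Normal

dist = Normal(torch.tensor(2.5), torch.tensor(1.0))
raw = dist.sample()                        # what "actions_pre" stores
clipped = torch.clamp(raw, -3, 3) / 3      # what "actions" stores / the env sees
log_prob_raw = dist.log_prob(raw)          # correct term for the PPO ratio
log_prob_clipped = dist.log_prob(clipped)  # evaluated at a value the policy never sampled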

ml-agents/mlagents/trainers/torch/distributions.py (5 changes)


  def sample(self):
      sample = self.mean + torch.randn_like(self.mean) * self.std
-     return sample / 3
+     return sample

  def log_prob(self, value):
      ...
-     unscaled_val = value * 3  # Inverse of the clipping
      return (
-         -((unscaled_val - self.mean) ** 2) / (2 * var + EPSILON)
+         -((value - self.mean) ** 2) / (2 * var + EPSILON)
          - log_scale
          - math.log(math.sqrt(2 * math.pi))
      )
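
With the "* 3" unscaling removed, the method computes the plain Gaussian log-density: -(value - mean)^2 / (2 var) - log std - log sqrt(2 pi). A quick check of that formula against torch.distributions.Normal (EPSILON dropped; the mean, std, and value here are illustrative, not ml-agents state):

import math
import torch

mean, std = torch.tensor(0.5), torch.tensor(1.2)
value = torch.tensor(0.1)
var = std ** 2
log_scale = torch.log(std)
manual = (
    -((value - mean) ** 2) / (2 * var)
    - log_scale
    - math.log(math.sqrt(2 * math.pi))
)
reference = torch.distributions.Normal(mean, std).log_prob(value)
assert torch.allclose(manual, reference)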
