Compare commits

...
This merge request has changes that conflict with the target branch.
/ml-agents/mlagents/trainers/ppo/trainer.py
/ml-agents/mlagents/trainers/tests/torch/test_hybrid.py
/ml-agents/mlagents/trainers/torch/distributions.py
/ml-agents/mlagents/trainers/torch/components/bc/module.py

2 commits

Author         SHA1      Message                                                     Commit date
GitHub         3c1e98ca  Update ml-agents/mlagents/trainers/torch/distributions.py  4 years ago
vincentpierre  bcec7303  merging master                                              4 years ago
4 files changed, including 16 insertions and 14 deletions
  1. ml-agents/mlagents/trainers/ppo/trainer.py (1 change)
  2. ml-agents/mlagents/trainers/tests/torch/test_hybrid.py (10 changes)
  3. ml-agents/mlagents/trainers/torch/components/bc/module.py (1 change)
  4. ml-agents/mlagents/trainers/torch/distributions.py (18 changes)

ml-agents/mlagents/trainers/ppo/trainer.py (1 change)


            behavior_spec,
            self.trainer_settings,
            condition_sigma_on_obs=False,  # Faster training for PPO
            tanh_squash=True,
            separate_critic=True,  # Match network architecture with TF
        )
        return policy
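
For reference, a minimal sketch of what the tanh_squash flag passed above changes, written in plain PyTorch rather than the ml-agents TorchPolicy API: with squashing, raw Gaussian samples are passed through tanh so continuous actions always land in (-1, 1). The sample_action helper and the sizes below are illustrative only.

import torch

def sample_action(mu: torch.Tensor, log_sigma: torch.Tensor, tanh_squash: bool) -> torch.Tensor:
    # Draw an unbounded Gaussian sample, then optionally squash it with tanh.
    dist = torch.distributions.Normal(mu, torch.exp(log_sigma))
    raw = dist.sample()
    return torch.tanh(raw) if tanh_squash else raw

mu = torch.zeros(1, 2)
log_sigma = torch.zeros(1, 2)
print(sample_action(mu, log_sigma, tanh_squash=True))   # values in (-1, 1)
print(sample_action(mu, log_sigma, tanh_squash=False))  # unbounded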

ml-agents/mlagents/trainers/tests/torch/test_hybrid.py (10 changes)


        PPO_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=10000,
        max_steps=15000,
    )
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


@pytest.mark.parametrize("action_size", [(1, 1), (2, 2), (1, 2), (2, 1)])
def test_hybrid_sac(action_size):
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_size, step_size=0.8)
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_size, step_size=0.5)
    new_hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters,

    )
    config = attr.evolve(
        SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=2200
    )
    check_environment_trains(
        env, {BRAIN_NAME: config}, success_threshold=0.9, training_seed=1336
        SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=2500
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)

@pytest.mark.parametrize("num_visual", [1, 2])
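
These tests derive per-test trainer configs from a base config with attr.evolve, overriding only a few fields such as hyperparameters and max_steps. A hedged, self-contained sketch of that pattern follows; TrainerConfig and Hyperparameters are hypothetical stand-ins, not the real ml-agents settings classes.

import attr

@attr.s(auto_attribs=True, frozen=True)
class Hyperparameters:
    learning_rate: float = 3.0e-4
    buffer_size: int = 10240

@attr.s(auto_attribs=True, frozen=True)
class TrainerConfig:
    hyperparameters: Hyperparameters = Hyperparameters()
    max_steps: int = 10000

BASE_CONFIG = TrainerConfig()

# attr.evolve returns a copy with the named fields replaced, leaving the
# frozen base config untouched.
new_hyperparams = attr.evolve(BASE_CONFIG.hyperparameters, buffer_size=2048)
config = attr.evolve(BASE_CONFIG, hyperparameters=new_hyperparams, max_steps=2500)
print(config)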

ml-agents/mlagents/trainers/torch/components/bc/module.py (1 change)


        memories = []
        if self.policy.use_recurrent:
            memories = torch.zeros(1, self.n_sequences, self.policy.m_size)
        selected_actions, log_probs, _, _ = self.policy.sample_actions(
            tensor_obs,
            masks=act_masks,
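
A quick sketch of the memory placeholder built in this hunk: when the policy is recurrent, behavioral cloning feeds a zero initial memory shaped (1, n_sequences, m_size); otherwise an empty list is passed. The concrete sizes below are made up for illustration.

import torch

use_recurrent = True
n_sequences = 4   # hypothetical number of sequences in the BC batch
m_size = 128      # hypothetical memory size of the policy

memories = []
if use_recurrent:
    memories = torch.zeros(1, n_sequences, m_size)

print(memories.shape if use_recurrent else memories)  # torch.Size([1, 4, 128])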

ml-agents/mlagents/trainers/torch/distributions.py (18 changes)


        return squashed

    def _inverse_tanh(self, value):
        capped_value = torch.clamp(value, -1 + EPSILON, 1 - EPSILON)
        return 0.5 * torch.log((1 + capped_value) / (1 - capped_value) + EPSILON)
        return 0.5 * torch.log((1 + value) / (1 - value) + EPSILON)
        value = torch.clamp(value, -1 + EPSILON, 1 - EPSILON)
        return super().log_prob(unsquashed) - self.transform.log_abs_det_jacobian(
        result = super().log_prob(unsquashed) - self.transform.log_abs_det_jacobian(
        return torch.clamp(result, -20, 20)


class CategoricalDistInstance(DiscreteDistInstance):
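
The two numerical-stability tricks that appear in this hunk are: clamp the input of the inverse tanh slightly inside (-1, 1), since atanh diverges at the boundaries, and clamp the corrected log-probability of a tanh-squashed Gaussian sample so extreme values cannot poison the loss. A hedged sketch in plain PyTorch (not the ml-agents dist classes); the EPSILON value is an assumption, ml-agents defines its own constant.

import torch

EPSILON = 1e-7  # assumed value

def inverse_tanh(value: torch.Tensor) -> torch.Tensor:
    # atanh blows up at +/-1, so cap the input slightly inside (-1, 1).
    capped = torch.clamp(value, -1 + EPSILON, 1 - EPSILON)
    return 0.5 * torch.log((1 + capped) / (1 - capped) + EPSILON)

def squashed_log_prob(dist: torch.distributions.Normal, squashed: torch.Tensor) -> torch.Tensor:
    unsquashed = inverse_tanh(squashed)
    # Change-of-variables correction for a = tanh(u): log p(a) = log p(u) - log(1 - a^2).
    correction = torch.log(1 - squashed.pow(2) + EPSILON)
    result = dist.log_prob(unsquashed) - correction
    # Clamp so extreme samples cannot propagate infinities into the update.
    return torch.clamp(result, -20, 20)

dist = torch.distributions.Normal(torch.zeros(3), torch.ones(3))
action = torch.tanh(dist.sample())
print(squashed_log_prob(dist, action))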

            bias_init=Initialization.Zero,
        )
        self.tanh_squash = tanh_squash
        if conditional_sigma:
        if self.conditional_sigma:
            self.log_sigma = linear_layer(
                hidden_size,
                num_outputs,

            )
                torch.zeros(1, num_outputs, requires_grad=True)
                torch.ones(1, num_outputs, requires_grad=True)
            torch.nn.init.constant_(self.log_sigma.data, -1.1)
            # Note: we initialize the output of log_sigma around log(1/3)

    def forward(self, inputs: torch.Tensor) -> List[DistInstance]:
        mu = self.mu(inputs)
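
Two log-sigma variants appear in this hunk: a state-conditioned linear head when conditional_sigma is on, and a single state-independent learned parameter otherwise, initialized at -1.1, which is approximately log(1/3), so the initial standard deviation is about 0.33. A hedged sketch follows; hidden_size and num_outputs are made-up values and nn.Linear stands in for the repo's linear_layer helper.

import torch
from torch import nn

hidden_size, num_outputs = 64, 2
conditional_sigma = False

if conditional_sigma:
    # Sigma depends on the observation: one extra output head.
    log_sigma = nn.Linear(hidden_size, num_outputs)
else:
    # Sigma is state-independent: a single trainable vector.
    log_sigma = nn.Parameter(torch.zeros(1, num_outputs, requires_grad=True))
    nn.init.constant_(log_sigma.data, -1.1)
    print(torch.exp(log_sigma))  # ~0.33 per action dimension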

        # Expand so that entropy matches batch size. Note that we're using
        # mu*0 here to get the batch size implicitly since Barracuda 1.2.1
        # throws error on runtime broadcasting due to unknown reason. We
        # use this to replace torch.expand() becuase it is not supported in
        # use this to replace torch.expand() because it is not supported in
        log_sigma = mu * 0 + self.log_sigma
        log_sigma = mu * 0 + torch.clamp(self.log_sigma, -20, 2)
        if self.tanh_squash:
            return TanhGaussianDistInstance(mu, torch.exp(log_sigma))
        else:
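
To illustrate the broadcast-and-clamp line above: adding mu * 0 tiles the (1, num_outputs) log-sigma parameter to the batch size without torch.expand(), and clamping to [-20, 2] bounds the resulting standard deviation to roughly [2e-9, 7.4]. A hedged, standalone sketch; batch_size, num_outputs, and the stand-in tensors are made up.

import torch

batch_size, num_outputs = 5, 2
mu = torch.randn(batch_size, num_outputs)             # stand-in network output
log_sigma_param = torch.full((1, num_outputs), -1.1)  # stand-in learned parameter

# mu * 0 contributes nothing numerically but forces broadcasting to batch size.
log_sigma = mu * 0 + torch.clamp(log_sigma_param, -20, 2)
print(log_sigma.shape)          # torch.Size([5, 2]) -- matches the batch
print(torch.exp(log_sigma)[0])  # sigma ~= 0.33 per action dimension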
