/exp-tanh
vincentpierre, 4 years ago
Current commit
6f3ea7b8
8 files changed, 68 insertions and 125 deletions
  1. ml-agents/mlagents/trainers/policy/policy.py (2 changes)
  2. ml-agents/mlagents/trainers/policy/torch_policy.py (2 changes)
  3. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (105 changes)
  4. ml-agents/mlagents/trainers/tests/torch/test_hybrid.py (8 changes)
  5. ml-agents/mlagents/trainers/torch/action_model.py (8 changes)
  6. ml-agents/mlagents/trainers/torch/components/bc/module.py (1 change)
  7. ml-agents/mlagents/trainers/torch/distributions.py (45 changes)
  8. ml-agents/mlagents/trainers/torch/networks.py (22 changes)

ml-agents/mlagents/trainers/policy/policy.py (2 changes)


seed: int,
behavior_spec: BehaviorSpec,
trainer_settings: TrainerSettings,
tanh_squash: bool = True,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
):

ml-agents/mlagents/trainers/policy/torch_policy.py (2 changes)


seed: int,
behavior_spec: BehaviorSpec,
trainer_settings: TrainerSettings,
tanh_squash: bool = True,
tanh_squash: bool = False,
reparameterize: bool = False,
separate_critic: bool = True,
condition_sigma_on_obs: bool = True,
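
Both policy constructors flip the `tanh_squash` default. Tanh squashing bounds continuous actions by passing Gaussian samples through `tanh`, with a change-of-variables correction to the log-probability. A minimal sketch of the idea, independent of the ML-Agents classes (the function name below is illustrative):

import torch
from torch.distributions import Normal

# Illustrative sketch of tanh squashing, not the ML-Agents implementation.
def squashed_sample(mean: torch.Tensor, std: torch.Tensor):
    dist = Normal(mean, std)
    pre_tanh = dist.rsample()            # reparameterized Gaussian sample
    action = torch.tanh(pre_tanh)        # bounded to (-1, 1)
    # Change of variables: subtract log|d tanh(x)/dx| = log(1 - tanh(x)^2).
    log_prob = dist.log_prob(pre_tanh) - torch.log(1.0 - action.pow(2) + 1e-7)
    return action, log_prob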

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (105 changes)


:param num_sequences: Number of sequences to process.
:return: Results of update.
"""
with torch.autograd.detect_anomaly():
# Get decayed parameters
decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step())
decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
returns = {}
old_values = {}
for name in self.reward_signals:
old_values[name] = ModelUtils.list_to_tensor(
batch[f"{name}_value_estimates"]
)
returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])
# Get decayed parameters
decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step())
decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
returns = {}
old_values = {}
for name in self.reward_signals:
old_values[name] = ModelUtils.list_to_tensor(
batch[f"{name}_value_estimates"]
)
returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])
n_obs = len(self.policy.behavior_spec.observation_specs)
current_obs = ObsUtil.from_buffer(batch, n_obs)
# Convert to tensors
current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
n_obs = len(self.policy.behavior_spec.observation_specs)
current_obs = ObsUtil.from_buffer(batch, n_obs)
# Convert to tensors
current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
actions = AgentAction.from_dict(batch)
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
actions = AgentAction.from_dict(batch)
memories = [
ModelUtils.list_to_tensor(batch["memory"][i])
for i in range(0, len(batch["memory"]), self.policy.sequence_length)
]
if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
memories = [
ModelUtils.list_to_tensor(batch["memory"][i])
for i in range(0, len(batch["memory"]), self.policy.sequence_length)
]
if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
log_probs, entropy, values = self.policy.evaluate_actions(
current_obs,
masks=act_masks,
actions=actions,
memories=memories,
seq_len=self.policy.sequence_length,
)
old_log_probs = ActionLogProbs.from_dict(batch).flatten()
log_probs = log_probs.flatten()
loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
value_loss = self.ppo_value_loss(
values, old_values, returns, decay_eps, loss_masks
)
# print(log_probs)
policy_loss = self.ppo_policy_loss(
ModelUtils.list_to_tensor(batch["advantages"]),
log_probs,
old_log_probs,
loss_masks,
)
loss = (
policy_loss
+ 0.5 * value_loss
- decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
)
log_probs, entropy, values = self.policy.evaluate_actions(
current_obs,
masks=act_masks,
actions=actions,
memories=memories,
seq_len=self.policy.sequence_length,
)
old_log_probs = ActionLogProbs.from_dict(batch).flatten()
log_probs = log_probs.flatten()
loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
value_loss = self.ppo_value_loss(
values, old_values, returns, decay_eps, loss_masks
)
policy_loss = self.ppo_policy_loss(
ModelUtils.list_to_tensor(batch["advantages"]),
log_probs,
old_log_probs,
loss_masks,
)
loss = (
policy_loss
+ 0.5 * value_loss
- decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
)
# Set optimizer learning rate
ModelUtils.update_learning_rate(self.optimizer, decay_lr)
self.optimizer.zero_grad()
with torch.autograd.detect_anomaly():
loss.backward()
# Set optimizer learning rate
ModelUtils.update_learning_rate(self.optimizer, decay_lr)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
update_stats = {
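
One side of the diff runs the PPO update under `torch.autograd.detect_anomaly()`; when that context is active, PyTorch raises an error identifying the operation whose gradient turned into a NaN during `backward()`, at the cost of slower execution. A standalone sketch of the typical usage (a toy loss, not the optimizer above):

import torch

with torch.autograd.detect_anomaly():
    x = torch.tensor([1.0, -1.0], requires_grad=True)
    loss = torch.log(x).sum()   # log(-1) produces a NaN in the forward pass
    loss.backward()             # raises, naming the op whose gradient is NaN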

ml-agents/mlagents/trainers/tests/torch/test_hybrid.py (8 changes)


PPO_TORCH_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_network_settings,
max_steps=10000,
max_steps=15000,
)
check_environment_trains(
env, {BRAIN_NAME: config}, success_threshold=0.9

@pytest.mark.parametrize("action_size", [(1, 1), (2, 2), (1, 2), (2, 1)])
def test_hybrid_sac(action_size):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_size, step_size=0.8)
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_size, step_size=0.5)
new_hyperparams = attr.evolve(
SAC_TORCH_CONFIG.hyperparameters,

)
config = attr.evolve(
SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=2200
SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=2500
env, {BRAIN_NAME: config}, success_threshold=0.9, training_seed=1336
env, {BRAIN_NAME: config}, success_threshold=0.9
)
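
The test changes adjust `step_size`, `max_steps`, and the training seed through `attr.evolve`, which copies an attrs-decorated settings object with only the named fields overridden. A small self-contained sketch (the `Hyperparameters` class below is hypothetical, standing in for the real trainer settings):

import attr

# Hypothetical settings class standing in for the real trainer hyperparameters.
@attr.s(auto_attribs=True)
class Hyperparameters:
    buffer_size: int = 10240
    learning_rate: float = 3.0e-4

base = Hyperparameters()
# attr.evolve returns a copy with only the named fields replaced.
tuned = attr.evolve(base, learning_rate=1.0e-3)
assert base.learning_rate == 3.0e-4
assert tuned.learning_rate == 1.0e-3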

ml-agents/mlagents/trainers/torch/action_model.py (8 changes)


discrete_dist: Optional[List[DiscreteDistInstance]] = None
# This checks None because mypy complains otherwise
if self._continuous_distribution is not None:
if (torch.isnan(torch.mean(inputs))):
print("_get_dist inputs in action_model")
continuous_dist = self._continuous_distribution(inputs)
if self._discrete_distribution is not None:
discrete_dist = self._discrete_distribution(inputs, masks)

:params actions: The AgentAction
:return: An ActionLogProbs tuple and a torch tensor of the distribution entropies.
"""
if (torch.isnan(torch.mean(inputs))):
print("evaluate inputs in action_model")
dists = self._get_dists(inputs, masks)
log_probs, entropies = self._get_probs_and_entropy(actions, dists)
# Use the sum of entropy across actions, not the mean

:params masks: Action masks for discrete actions
:return: A tuple of torch tensors corresponding to the inference output
"""
if (torch.isnan(torch.mean(inputs))):
print("get_action_out inputs in action_model")
dists = self._get_dists(inputs, masks)
continuous_out, discrete_out, action_out_deprecated = None, None, None
if self.action_spec.continuous_size > 0 and dists.continuous is not None:

:return: Given the input, an AgentAction of the actions generated by the policy and the corresponding
ActionLogProbs and entropies.
"""
if (torch.isnan(torch.mean(inputs))):
print("forward inputs in action_model")
dists = self._get_dists(inputs, masks)
actions = self._sample_action(dists)
log_probs, entropies = self._get_probs_and_entropy(actions, dists)
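
Each entry point of the action model gains a debug guard that prints when its input activations contain NaNs. The same pattern, extracted into a helper for illustration (the helper name is not part of ML-Agents):

import torch

# Illustrative helper: the mean of a float tensor that contains any NaN is
# itself NaN, so this flags NaNs with a single reduction.
def warn_if_nan(tensor: torch.Tensor, where: str) -> None:
    if torch.isnan(torch.mean(tensor)):
        print(f"NaN detected in {where}")

# torch.isnan(tensor).any() is the more direct equivalent check.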

ml-agents/mlagents/trainers/torch/components/bc/module.py (1 change)


memories = []
if self.policy.use_recurrent:
memories = torch.zeros(1, self.n_sequences, self.policy.m_size)
selected_actions, log_probs, _, _ = self.policy.sample_actions(
tensor_obs,
masks=act_masks,

ml-agents/mlagents/trainers/torch/distributions.py (45 changes)


def __init__(self, mean, std):
super().__init__(mean, std)
self.transform = torch.distributions.transforms.TanhTransform(cache_size=1)
if torch.isnan(torch.mean(std)):
print("Nan in TanhGaussianDistInstance init, std")
def sample(self):
unsquashed_sample = super().sample()

return 0.5 * torch.log((1 + value) / (1 - value) + EPSILON)
def log_prob(self, value):
unsquashed = self._inverse_tanh(value * 0.95)
# unsquashed = self.transform.inv(value * 0.85)
# capped_unsqached = self.transform.inv(capped_value)
tmp= super().log_prob(unsquashed) - self.transform.log_abs_det_jacobian(
value = torch.clamp(value, -1 + EPSILON, 1 - EPSILON)
unsquashed = self.transform.inv(value)
result = super().log_prob(unsquashed) - self.transform.log_abs_det_jacobian(
# print("tmp decomposition", value, capped_value, unsquashed, super().log_prob(unsquashed) , self.transform.log_abs_det_jacobian(
# unsquashed, None
# ))
if torch.isnan(torch.mean(value)):
print("Nan in log_prob(self, value), value")
if torch.isnan(torch.mean(super().log_prob(unsquashed))):
print("Nan in log_prob(self, value), super().log_prob(unsquashed)")
if torch.isnan(torch.mean(self.transform.log_abs_det_jacobian(unsquashed, None ))):
print("Nan in log_prob(self, value), log_abs_det_jacobian")
return tmp
def exported_model_output(self):
return self.sample()
return torch.clamp(result, -1e6, 0)
class CategoricalDistInstance(DiscreteDistInstance):

bias_init=Initialization.Zero,
)
self.tanh_squash = tanh_squash
if conditional_sigma:
if self.conditional_sigma:
self.log_sigma = linear_layer(
hidden_size,
num_outputs,

)
torch.nn.init.constant_(self.log_sigma.bias.data, -1)
torch.nn.init.constant_(self.log_sigma.data, -1)
torch.nn.init.constant_(self.log_sigma.data, -1.1)
# Note: we initialize the output of log_sigma around log(1/3)
if torch.isnan(torch.mean(inputs)):
print("GaussianDistribution conditional log sigma inputs")
log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
else:
# Expand so that entropy matches batch size. Note that we're using

# the verified version of Barracuda (1.0.X).
log_sigma = mu * 0 + self.log_sigma
if torch.isnan(torch.mean(self.log_sigma)):
print("GaussianDistribution self.log_sigma")
if torch.isnan(torch.mean(mu)):
print("GaussianDistribution mu")
if torch.isnan(torch.mean(inputs)):
print("GaussianDistribution inputs")
if torch.isnan(torch.mean(log_sigma)):
print("GaussianDistribution log sigma NaN")
# log_sigma = mu * 0 + self.log_sigma
log_sigma = mu * 0 + torch.clamp(self.log_sigma, -20, 2)
if self.tanh_squash:
return TanhGaussianDistInstance(mu, torch.exp(log_sigma))
else:
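
The distribution changes target numerical stability of the tanh-squashed Gaussian: the squashed value is clamped away from ±1 before inverting the tanh, `log_sigma` is clamped into [-20, 2], and the resulting log-probability is capped. A minimal sketch of that computation, assuming an `EPSILON` constant like the one already used in the file (not the exact ML-Agents code):

import torch
from torch.distributions import Normal
from torch.distributions.transforms import TanhTransform

EPSILON = 1e-7  # assumed constant, mirroring the EPSILON used in the diff

def tanh_gaussian_log_prob(mean, log_sigma, value):
    # Clamp log_sigma so exp(log_sigma) neither vanishes nor explodes.
    log_sigma = torch.clamp(log_sigma, -20, 2)
    dist = Normal(mean, torch.exp(log_sigma))
    transform = TanhTransform(cache_size=1)
    # Pull the squashed action away from +/-1, where atanh diverges.
    value = torch.clamp(value, -1 + EPSILON, 1 - EPSILON)
    unsquashed = transform.inv(value)
    log_prob = dist.log_prob(unsquashed) - transform.log_abs_det_jacobian(
        unsquashed, value
    )
    # Cap the result, as the diff does, so saturated actions cannot blow up the loss.
    return torch.clamp(log_prob, -1e6, 0)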

ml-agents/mlagents/trainers/torch/networks.py (22 changes)


def forward(
self,
inputs_: List[torch.Tensor],
inputs: List[torch.Tensor],
actions: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

obs_input = inputs_[idx]
obs_input = inputs[idx]
processed_obs = processor(obs_input)
encodes.append(processed_obs)

encoding, memories = self.lstm(encoding, memories)
encoding = encoding.reshape([-1, self.m_size // 2])
if torch.isnan(torch.mean(encoding)):
print("NaN in Netowrk Body :")
print(torch.mean(inputs_[0]), torch.mean(self.processors[0](inputs_[0])))
print(self.processors[0].conv_layers[0].weight.data)
print(self.processors[0].conv_layers[2].weight.data)
print("\n\n\n\n\n")
raise _
return encoding, memories

encoding, memories = self.network_body(
inputs, memories=memories, sequence_length=sequence_length
)
if torch.isnan(torch.mean(encoding)):
print("SimpleActor encoding in get_action_stats")
action, log_probs, entropies = self.action_model(encoding, masks)
return action, log_probs, entropies, memories

encoding, memories = self.network_body(
inputs, memories=memories, sequence_length=sequence_length
)
if torch.isnan(torch.mean(encoding)):
print("SharedActorCritic, get_stats_and_value, encoding")
log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
value_outputs = self.value_heads(encoding)
return log_probs, entropies, value_outputs

sequence_length: int = 1,
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
for i in inputs:
if torch.isnan(torch.mean(i)):
print("Nan input to network body in SeparateActorCritic")
if torch.isnan(torch.mean(encoding)):
print("SeparateActorCritic, get_stats_and_value, encoding")
log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
value_outputs, critic_mem_outs = self.critic(
inputs, memories=critic_mem, sequence_length=sequence_length
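
The network changes add the same kind of NaN guards on the encoder inputs and the resulting encoding, and abort the update when the encoding goes bad. A sketch of that guard as a standalone helper, with a `RuntimeError` standing in for the branch's `raise _` placeholder (names are illustrative):

import torch
from typing import List

# Sketch of the guard around the network body's output, not ML-Agents code.
def check_encoding(encoding: torch.Tensor, inputs: List[torch.Tensor]) -> torch.Tensor:
    if torch.isnan(torch.mean(encoding)):
        print("NaN in network body encoding")
        print([torch.mean(obs).item() for obs in inputs])
        raise RuntimeError("NaN encoding, aborting the update")
    return encoding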
