
might be right

/develop/coma-noact
Andrew Cohen, 4 years ago
Current commit: 6e1826f8

6 files changed, 99 insertions(+), 51 deletions(-)
  1. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (71 lines changed)
  2. ml-agents/mlagents/trainers/policy/torch_policy.py (5 lines changed)
  3. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (30 lines changed)
  4. ml-agents/mlagents/trainers/ppo/trainer.py (5 lines changed)
  5. ml-agents/mlagents/trainers/torch/agent_action.py (19 lines changed)
  6. ml-agents/mlagents/trainers/torch/networks.py (20 lines changed)

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (71 lines changed)


         next_critic_obs: List[List[np.ndarray]],
         done: bool,
     ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
         next_obs = ObsUtil.from_buffer_next(batch, n_obs)
+        team_obs = TeamObsUtil.from_buffer(batch, n_obs)
+        next_team_obs = TeamObsUtil.from_buffer_next(batch, n_obs)
         memory = torch.zeros([1, 1, self.policy.m_size])
+        team_obs = [
+            [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
+            for _teammate_obs in team_obs
+        ]
+        next_team_obs = [
+            [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
+            for _teammate_obs in next_team_obs
+        ]
-        next_obs = [obs.unsqueeze(0) for obs in next_obs]
+        next_actions = AgentAction.from_dict_next(batch)
+        team_actions = AgentAction.from_team_dict(batch)
+        next_team_actions = AgentAction.from_team_dict_next(batch)
-        critic_obs = TeamObsUtil.from_buffer(batch, n_obs)
-        critic_obs = [
-            [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
-            for _teammate_obs in critic_obs
-        ]
-        next_critic_obs = [
-            ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_critic_obs
-        ]
+        # next_obs = [obs.unsqueeze(0) for obs in next_obs]
+        # critic_obs = TeamObsUtil.from_buffer(batch, n_obs)
+        # critic_obs = [
+        #     [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
+        #     for _teammate_obs in critic_obs
+        # ]
+        # next_critic_obs = [
+        #     ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_critic_obs
+        # ]
-        next_critic_obs = [
-            [_obs.unsqueeze(0) for _obs in _list_obs] for _list_obs in next_critic_obs
-        ]
+        # next_critic_obs = [
+        #     [_obs.unsqueeze(0) for _obs in _list_obs] for _list_obs in next_critic_obs
+        # ]
-        value_estimates, marg_val_estimates, next_memory = self.policy.actor_critic.critic_pass(
+        value_estimates, marg_val_estimates, mem = self.policy.actor_critic.critic_pass(
-            critic_obs=critic_obs,
+            team_obs=team_obs,
+            team_act=team_actions,
         )
+        next_value_estimates, next_marg_val_estimates, next_mem = self.policy.actor_critic.critic_pass(
+            next_obs,
+            next_actions,
+            memory,
+            sequence_length=batch.num_experiences,
+            team_obs=next_team_obs,
+            team_act=next_team_actions,
+        )
-        # # Actions is a hack here, we need the next actions

-        # These aren't used in COMAttention
-        next_value_estimate, next_marg_val_estimate = {}, {}
-            next_value_estimate[name] = 0.0
+        for name, estimate in next_value_estimates.items():
+            next_value_estimates[name] = ModelUtils.to_numpy(estimate)
-            next_marg_val_estimate[name] = 0.0
+        for name, estimate in next_marg_val_estimates.items():
+            next_marg_val_estimates[name] = ModelUtils.to_numpy(estimate)
-            for k in next_value_estimate:
+            for k in next_value_estimates:
-                next_value_estimate[k] = 0.0
+                next_value_estimates[k] = 0.0
-            next_value_estimate,
-            next_marg_val_estimate,
+            next_value_estimates,
+            next_marg_val_estimates,
         )
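The hunk above appears to switch from zero-filled next-step estimates to an actual critic pass over the next observations and next team actions, with the results converted to numpy and zeroed when the trajectory terminates. A minimal, self-contained sketch of that post-processing pattern follows; the function and variable names are illustrative, not the trainer's API.

    from typing import Dict

    import numpy as np
    import torch


    def postprocess_next_estimates(
        next_value_estimates: Dict[str, torch.Tensor],
        next_marg_value_estimates: Dict[str, torch.Tensor],
        done: bool,
    ):
        # Detach each per-reward-signal estimate and convert it to numpy.
        next_values = {k: v.detach().numpy() for k, v in next_value_estimates.items()}
        next_marg_values = {
            k: v.detach().numpy() for k, v in next_marg_value_estimates.items()
        }
        # A terminal step has no successor state, so the bootstrap values are zero.
        if done:
            next_values = {k: np.zeros_like(v) for k, v in next_values.items()}
            next_marg_values = {k: np.zeros_like(v) for k, v in next_marg_values.items()}
        return next_values, next_marg_values


    # Toy usage with a single reward signal.
    vals = {"extrinsic": torch.tensor([0.5])}
    margs = {"extrinsic": torch.tensor([0.3])}
    print(postprocess_next_estimates(vals, margs, done=True))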

ml-agents/mlagents/trainers/policy/torch_policy.py (5 lines changed)


         masks: Optional[torch.Tensor] = None,
         memories: Optional[torch.Tensor] = None,
         seq_len: int = 1,
-        critic_obs: Optional[List[List[torch.Tensor]]] = None,
+        team_obs: Optional[List[List[torch.Tensor]]] = None,
+        team_act: Optional[List[AgentAction]] = None,

-            obs, actions, masks, memories, seq_len, critic_obs
+            obs, actions, masks, memories, seq_len, team_obs, team_act
         )
         return log_probs, entropies, value_heads, marg_vals

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (30 lines changed)


         value_losses = []
         for name, head in values.items():
             old_val_tensor = old_values[name]
-            returns_tensor = returns[name]
-            clipped_value_estimate = old_val_tensor + torch.clamp(
-                head - old_val_tensor, -1 * epsilon, epsilon
-            )
-            v_opt_a = (returns_tensor - head) ** 2
-            v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
-            value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
+            returns_tensor = returns[name] + 0.99 * old_val_tensor
+            # clipped_value_estimate = old_val_tensor + torch.clamp(
+            #     head - old_val_tensor, -1 * epsilon, epsilon
+            # )
+            value_loss = (returns_tensor - head) ** 2
+            # v_opt_a = (returns_tensor - head) ** 2
+            # v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
+            # value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
             value_losses.append(value_loss)
         value_loss = torch.mean(torch.stack(value_losses))
         return value_loss
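For reference, the removed lines are PPO's standard clipped (trust-region) value loss, while the added lines regress each value head directly onto the stored returns plus 0.99 times the stored estimate, which now holds the next-step value, so 0.99 looks like a hard-coded discount. A self-contained sketch of both variants for a single head; the function names and the defaults are mine, not the trainer's.

    import torch


    def clipped_value_loss(head, old_val, returns, loss_masks, epsilon=0.2):
        # PPO-style trust-region value loss: penalize whichever of the raw or
        # clipped estimate is farther from the return, averaged over valid steps.
        clipped = old_val + torch.clamp(head - old_val, -epsilon, epsilon)
        v_opt_a = (returns - head) ** 2
        v_opt_b = (returns - clipped) ** 2
        return (torch.max(v_opt_a, v_opt_b) * loss_masks).sum() / loss_masks.sum()


    def bootstrapped_mse_loss(head, next_val, returns, gamma=0.99):
        # The variant introduced in the hunk above: regress the head toward
        # returns + gamma * next-step value, with a plain (unmasked) mean.
        target = returns + gamma * next_val
        return ((target - head) ** 2).mean()


    # Dummy batch of 5 value predictions.
    head = torch.rand(5)
    old_val = torch.rand(5)
    returns = torch.rand(5)
    masks = torch.ones(5)
    print(clipped_value_loss(head, old_val, returns, masks))
    print(bootstrapped_mse_loss(head, old_val, returns))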

         old_marg_values = {}
         for name in self.reward_signals:
             old_values[name] = ModelUtils.list_to_tensor(
-                batch[f"{name}_value_estimates"]
+                batch[f"{name}_value_estimates_next"]
-                batch[f"{name}_marginalized_value_estimates"]
+                batch[f"{name}_marginalized_value_estimates_next"]
             )
             returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])

         # Convert to tensors
         current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
-        critic_obs = TeamObsUtil.from_buffer(batch, n_obs)
-        critic_obs = [
+        team_obs = TeamObsUtil.from_buffer(batch, n_obs)
+        team_obs = [
-            for _teammate_obs in critic_obs
+            for _teammate_obs in team_obs
-        next_team_actions = AgentAction.from_team_dict_next(batch)
+        # next_team_actions = AgentAction.from_team_dict_next(batch)
         memories = [
             ModelUtils.list_to_tensor(batch["memory"][i])

             masks=act_masks,
             actions=actions,
             memories=memories,
-            critic_obs=critic_obs,
+            team_obs=team_obs,
+            team_act=team_actions,
             seq_len=self.policy.sequence_length,
         )
         old_log_probs = ActionLogProbs.from_dict(batch).flatten()
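As in the optimizer hunk above, the per-teammate observations pulled from the buffer are nested lists of numpy arrays that get converted to tensors one sensor at a time before being handed to evaluate_actions as team_obs. A small self-contained sketch of that conversion; to_tensor stands in for ModelUtils.list_to_tensor and the shapes are arbitrary.

    import numpy as np
    import torch


    def to_tensor(arr) -> torch.Tensor:
        # Stand-in for ModelUtils.list_to_tensor: wrap an array as a float tensor.
        return torch.as_tensor(np.asarray(arr), dtype=torch.float32)


    # Buffer-style data: 2 teammates, each with one sensor of shape (batch=3, obs=4).
    team_obs_np = [[np.zeros((3, 4))] for _ in range(2)]

    # The same nested comprehension pattern used in the hunk above:
    # outer list over teammates, inner list over that teammate's sensors.
    team_obs = [
        [to_tensor(obs) for obs in _teammate_obs] for _teammate_obs in team_obs_np
    ]
    print(len(team_obs), team_obs[0][0].shape)  # 2 torch.Size([3, 4])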

ml-agents/mlagents/trainers/ppo/trainer.py (5 lines changed)


                 f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
                 np.mean(v),
             )
+        for name, v in value_next.items():
+            agent_buffer_trajectory[f"{name}_value_estimates_next"].extend(v)
+            agent_buffer_trajectory[f"{name}_marginalized_value_estimates_next"].extend(
+                marg_value_next[name]
+            )
         # Evaluate all reward functions
         self.collected_rewards["environment"][agent_id] += np.sum(
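These added lines store the next-step value and marginalized value estimates in the trajectory buffer under the same per-reward-signal keys that the PPO optimizer above reads back. A toy sketch of that bookkeeping, with a plain dict of lists standing in for the real AgentBuffer.

    from collections import defaultdict

    import numpy as np

    # A plain dict of lists standing in for the trajectory AgentBuffer.
    agent_buffer_trajectory = defaultdict(list)

    # Per-reward-signal next-step estimates, as a critic pass might produce them.
    value_next = {"extrinsic": np.array([0.4, 0.2])}
    marg_value_next = {"extrinsic": np.array([0.1, 0.0])}

    for name, v in value_next.items():
        agent_buffer_trajectory[f"{name}_value_estimates_next"].extend(v)
        agent_buffer_trajectory[f"{name}_marginalized_value_estimates_next"].extend(
            marg_value_next[name]
        )

    print(dict(agent_buffer_trajectory))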

ml-agents/mlagents/trainers/torch/agent_action.py (19 lines changed)


         return AgentAction(continuous, discrete)

+    @staticmethod
+    def from_dict_next(buff: Dict[str, np.ndarray]) -> "AgentAction":
+        """
+        A static method that accesses the next continuous and discrete action fields in an
+        AgentBuffer and constructs the corresponding AgentAction from the retrieved np arrays.
+        """
+        continuous: torch.Tensor = None
+        discrete: List[torch.Tensor] = None  # type: ignore
+        if "next_continuous_action" in buff:
+            continuous = ModelUtils.list_to_tensor(buff["next_continuous_action"])
+        if "next_discrete_action" in buff:
+            discrete_tensor = ModelUtils.list_to_tensor(
+                buff["next_discrete_action"], dtype=torch.long
+            )
+            discrete = [
+                discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
+            ]
+        return AgentAction(continuous, discrete)

     @staticmethod
     def _from_team_dict(
         buff: Dict[str, np.ndarray], cont_action_key: str, disc_action_key: str
     ):
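The new from_dict_next mirrors the existing from_dict but reads the next-step action keys. A standalone sketch of the same lookup against a plain dict; this re-implementation is for illustration only and does not use the real AgentBuffer or ModelUtils.

    from typing import Dict, List, Optional

    import numpy as np
    import torch


    def next_actions_from_buffer(buff: Dict[str, np.ndarray]):
        # Read the next-step action arrays, if present, and turn them into tensors.
        continuous: Optional[torch.Tensor] = None
        discrete: Optional[List[torch.Tensor]] = None
        if "next_continuous_action" in buff:
            continuous = torch.as_tensor(
                buff["next_continuous_action"], dtype=torch.float32
            )
        if "next_discrete_action" in buff:
            discrete_tensor = torch.as_tensor(
                buff["next_discrete_action"], dtype=torch.long
            )
            # One tensor per discrete action branch.
            discrete = [
                discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
            ]
        return continuous, discrete


    buff = {"next_discrete_action": np.array([[1, 0], [2, 1]])}
    cont, disc = next_actions_from_buffer(buff)
    print(cont, [d.tolist() for d in disc])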

ml-agents/mlagents/trainers/torch/networks.py (20 lines changed)


         masks: Optional[torch.Tensor] = None,
         memories: Optional[torch.Tensor] = None,
         sequence_length: int = 1,
-        critic_obs: Optional[List[List[torch.Tensor]]] = None,
+        team_obs: Optional[List[List[torch.Tensor]]] = None,
+        team_act: Optional[List[List[torch.Tensor]]] = None,
     ) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
         encoding, memories = self.network_body(
             inputs, memories=memories, sequence_length=sequence_length

         masks: Optional[torch.Tensor] = None,
         memories: Optional[torch.Tensor] = None,
         sequence_length: int = 1,
-        critic_obs: Optional[List[List[torch.Tensor]]] = None,
+        team_obs: Optional[List[List[torch.Tensor]]] = None,
+        team_act: Optional[List[List[torch.Tensor]]] = None,
     ) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
         actor_mem, critic_mem = self._get_actor_critic_mem(memories)
         encoding, actor_mem_outs = self.network_body(

-        all_net_inputs = [inputs]
-        if critic_obs is not None and critic_obs:
-            all_net_inputs.extend(critic_obs)
-        critic_obs = []
-        mar_value_outputs, _ = self.critic(
-            all_net_inputs, [], [], memories=critic_mem, sequence_length=sequence_length
-        )
-        value_outputs, critic_mem_outs = self.critic(
-            [inputs],
-            critic_obs,
+        value_outputs, mar_value_outputs, _ = self.critic_pass(
+            inputs,
+            team_obs=team_obs,
+            team_act=team_act,
         )
         return log_probs, entropies, value_outputs, mar_value_outputs
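The mar_value_outputs head is what connects this branch to COMA: judging by the names, the critic produces, alongside the usual value estimate, a baseline that marginalizes over the agent's own action while conditioning on the team. For background only, a heavily simplified counterfactual baseline for one agent with a discrete action space is sketched below; this is the textbook COMA formulation, not what critic_pass computes in these networks.

    import torch


    def counterfactual_baseline(
        q_values: torch.Tensor, policy_probs: torch.Tensor
    ) -> torch.Tensor:
        # COMA-style marginalization: average the per-action Q-values under the
        # agent's own policy, with teammates' actions held fixed inside q_values.
        # q_values:     (batch, num_actions) critic outputs for this agent's actions
        # policy_probs: (batch, num_actions) this agent's action probabilities
        return (policy_probs * q_values).sum(dim=-1)


    q = torch.tensor([[1.0, 2.0, 3.0]])
    pi = torch.tensor([[0.2, 0.3, 0.5]])
    print(counterfactual_baseline(q, pi))  # tensor([2.3000])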
