
New buffer layout, TeamObsUtil, pad dead agents

/develop/centralizedcritic/counterfact
Ervin Teng, 4 years ago
Current commit: 9c3da1b6
4 files changed, 84 insertions(+), 9 deletions(-)
  1. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (8 changes)
  2. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (8 changes)
  3. ml-agents/mlagents/trainers/torch/networks.py (7 changes)
  4. ml-agents/mlagents/trainers/trajectory.py (70 changes)

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (8 changes)


 import numpy as np
 from mlagents.trainers.buffer import AgentBuffer
-from mlagents.trainers.trajectory import ObsUtil
+from mlagents.trainers.trajectory import ObsUtil, TeamObsUtil
 from mlagents.trainers.torch.components.bc.module import BCModule
 from mlagents.trainers.torch.components.reward_providers import create_reward_provider

 next_obs = [obs.unsqueeze(0) for obs in next_obs]
-critic_obs_np = AgentBuffer.obs_list_list_to_obs_batch(batch["critic_obs"])
-critic_obs = [
-    ModelUtils.list_to_tensor_list(_agent_obs) for _agent_obs in critic_obs_np
-]
+critic_obs = TeamObsUtil.from_buffer(batch, n_obs)
+critic_obs = [
+    [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
+    for _teammate_obs in critic_obs
+]
 next_critic_obs = [
     ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_critic_obs
 ]
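With the new buffer layout, each teammate's observations are tensorized one list at a time. A minimal sketch of the resulting nesting, using hypothetical shapes (3 teammates, 2 obs specs, 5 timesteps) and torch.as_tensor standing in for ModelUtils.list_to_tensor:

import numpy as np
import torch

# Hypothetical output of TeamObsUtil.from_buffer: a List (one per teammate)
# of Lists (one per obs spec) of arrays whose leading axis is time/batch.
critic_obs = [
    [np.zeros((5, 8), dtype=np.float32), np.zeros((5, 4), dtype=np.float32)]
    for _ in range(3)
]
# Tensorize each obs array individually, mirroring the nested comprehension above.
critic_obs_t = [
    [torch.as_tensor(obs) for obs in _teammate_obs]
    for _teammate_obs in critic_obs
]
print(len(critic_obs_t), len(critic_obs_t[0]), tuple(critic_obs_t[0][0].shape))
# -> 3 2 (5, 8)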

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (8 changes)


 from mlagents.trainers.torch.agent_action import AgentAction
 from mlagents.trainers.torch.action_log_probs import ActionLogProbs
 from mlagents.trainers.torch.utils import ModelUtils
-from mlagents.trainers.trajectory import ObsUtil
+from mlagents.trainers.trajectory import ObsUtil, TeamObsUtil

 class TorchPPOOptimizer(TorchOptimizer):

 current_obs = ObsUtil.from_buffer(batch, n_obs)
 # Convert to tensors
 current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
-critic_obs_np = AgentBuffer.obs_list_list_to_obs_batch(batch["critic_obs"])
-critic_obs = [
-    ModelUtils.list_to_tensor_list(_agent_obs) for _agent_obs in critic_obs_np
-]
+critic_obs = TeamObsUtil.from_buffer(batch, n_obs)
+critic_obs = [
+    [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
+    for _teammate_obs in critic_obs
+]
 act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
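The PPO optimizer picks up the same change: instead of one "critic_obs" buffer key holding a list of lists, the new layout stores one key per observation index. A sketch of the naming scheme, mirroring TeamObsUtil.get_name_at from trajectory.py below:

# Mirrors TeamObsUtil.get_name_at in trajectory.py.
def get_name_at(index: int) -> str:
    return f"team_obs_{index}"

# With e.g. 3 obs specs, the buffer gains three keys in place of "critic_obs".
print([get_name_at(i) for i in range(3)])
# -> ['team_obs_0', 'team_obs_1', 'team_obs_2']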

ml-agents/mlagents/trainers/torch/networks.py (7 changes)


 self_encodes.append(processed_obs)
 x_self = torch.cat(self_encodes, dim=-1)
+# Get attention masks by grabbing an arbitrary obs across all the agents
+# Since these are raw obs, the padded values are still 0
+only_first_obs = [_all_obs[0] for _all_obs in all_net_inputs]
+obs_for_mask = torch.stack(only_first_obs, dim=1)
 # Get the self encoding separately, but keep it in the entities
 concat_encoded_obs = [x_self]
 for inputs in all_net_inputs[1:]:

 encoded_entity = self.entity_encoder(x_self, [concat_entites])
 encoded_state = self.self_attn(
-    encoded_entity, EntityEmbeddings.get_masks([concat_entites])
+    encoded_entity, EntityEmbeddings.get_masks([obs_for_mask])
 )
 if len(concat_encoded_obs) == 0:
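The comment in the hunk explains the trick: because the stacked values are raw (pre-encoder) observations, any agent that was zero-padded out stays exactly zero, so an attention mask can be recovered from the raw tensor. A rough sketch of that idea, not the actual EntityEmbeddings.get_masks implementation:

import torch

# Hypothetical: batch of 2, 3 agents, obs size 4; the third agent is padding.
obs_for_mask = torch.ones(2, 3, 4)
obs_for_mask[:, 2, :] = 0.0  # dead/absent agent padded with zeros
# An entity whose raw obs is all zeros is treated as padding (masked out).
mask = (obs_for_mask.abs().sum(dim=-1) == 0).float()
print(mask)
# -> tensor([[0., 0., 1.],
#            [0., 0., 1.]])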

ml-agents/mlagents/trainers/trajectory.py (70 changes)


 from typing import List, NamedTuple
+import itertools
 import attr
 import numpy as np

 return result

+class TeamObsUtil:
+    @staticmethod
+    def get_name_at(index: int) -> str:
+        """
+        Returns the name of the observation given the index of the observation
+        """
+        return f"team_obs_{index}"
+
+    @staticmethod
+    def _padded_time_to_batch(
+        agent_buffer_field: AgentBuffer.AgentBufferField,
+    ) -> List[np.ndarray]:
+        """
+        Convert an AgentBufferField of Lists of obs, where one of the dimensions is time and the other is number (e.g.
+        in the case of a variable number of critic observations), to a List of obs, where time is in the batch dimension
+        of the obs, and the List is the variable number of agents. For cases where there is a varying number of agents,
+        pad the non-existent agents with 0.
+        """
+        # Find the first observation. This should usually be O(1)
+        obs_shape = None
+        for _team_obs in agent_buffer_field:
+            if _team_obs:
+                obs_shape = _team_obs[0].shape
+                break
+        # If there were no critic obs at all
+        if obs_shape is None:
+            return []
+        new_list = list(
+            map(
+                lambda x: np.asanyarray(x),
+                itertools.zip_longest(
+                    *agent_buffer_field, fillvalue=np.zeros(obs_shape)
+                ),
+            )
+        )
+        return new_list
+
+    @staticmethod
+    def _transpose_list_of_lists(
+        list_list: List[List[np.ndarray]],
+    ) -> List[List[np.ndarray]]:
+        return list(map(list, zip(*list_list)))
+
+    @staticmethod
+    def from_buffer(batch: AgentBuffer, num_obs: int) -> List[np.array]:
+        """
+        Creates the list of observations from an AgentBuffer
+        """
+        separated_obs: List[np.array] = []
+        for i in range(num_obs):
+            separated_obs.append(
+                TeamObsUtil._padded_time_to_batch(batch[TeamObsUtil.get_name_at(i)])
+            )
+        # separated_obs contains a List(num_obs) of Lists(num_agents); we want to flip
+        # that and get a List(num_agents) of Lists(num_obs)
+        result = TeamObsUtil._transpose_list_of_lists(separated_obs)
+        return result
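Taken together, _padded_time_to_batch and _transpose_list_of_lists turn a time-major field with a variable number of teammates per step into fixed-size, agent-major arrays, zero-padding steps where a teammate was dead. A self-contained sketch of the padding step with hypothetical data:

import itertools
import numpy as np

obs_shape = (2,)
# Hypothetical AgentBufferField: 3 timesteps; at t=1 one teammate is dead.
field = [
    [np.array([1.0, 1.0]), np.array([2.0, 2.0])],      # t=0: two teammates
    [np.array([3.0, 3.0])],                            # t=1: one teammate
    [np.array([4.0, 4.0]), np.array([5.0, 5.0])],      # t=2: two teammates
]
# zip_longest flips time-major to agent-major and fills missing agents with zeros.
padded = [
    np.asanyarray(x)
    for x in itertools.zip_longest(*field, fillvalue=np.zeros(obs_shape))
]
print(len(padded), padded[0].shape)  # -> 2 (3, 2): one (time, obs) array per agent
print(padded[1][1])                  # -> [0. 0.]: the dead agent at t=1 is padded

from_buffer then applies this per obs index and transposes, so the result is indexed [teammate][obs_index] rather than [obs_index][teammate].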
 class Trajectory(NamedTuple):
     steps: List[AgentExperience]
     next_obs: List[np.ndarray]

 for i in range(num_obs):
     agent_buffer_trajectory[ObsUtil.get_name_at(i)].append(obs[i])
     agent_buffer_trajectory[ObsUtil.get_name_at_next(i)].append(next_obs[i])
-agent_buffer_trajectory["critic_obs"].append(exp.collab_obs)
+for i in range(num_obs):
+    ith_team_obs = []
+    for _team_obs in exp.collab_obs:
+        # Assume teammates have the same obs space
+        ith_team_obs.append(_team_obs[i])
+    agent_buffer_trajectory[TeamObsUtil.get_name_at(i)].append(ith_team_obs)
 if exp.memory is not None:
     agent_buffer_trajectory["memory"].append(exp.memory)
