from typing import Dict, cast, List, Tuple, Optional

import numpy as np

from mlagents.torch_utils import torch

from mlagents_envs.base_env import ObservationSpec, ActionSpec
from mlagents.trainers.buffer import AgentBuffer, BufferKey, RewardSignalUtil
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
from mlagents.trainers.settings import TrainerSettings, PPOSettings, NetworkSettings
from mlagents.trainers.torch.networks import MultiAgentNetworkBody, Critic
from mlagents.trainers.torch.decoders import ValueHeads
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.trajectory import ObsUtil, GroupObsUtil


class TorchCOMAOptimizer(TorchOptimizer):
    class COMAValueNetwork(torch.nn.Module, Critic):
        def __init__(
            self,
            stream_names: List[str],
            observation_specs: List[ObservationSpec],
            network_settings: NetworkSettings,
            action_spec: ActionSpec,
        ):
            torch.nn.Module.__init__(self)
            self.network_body = MultiAgentNetworkBody(
                observation_specs, network_settings, action_spec
            )
            if network_settings.memory is not None:
                encoding_size = network_settings.memory.memory_size // 2
            else:
                encoding_size = network_settings.hidden_units
            # A single value output per reward stream.
            self.value_heads = ValueHeads(stream_names, encoding_size, 1)

        @property
        def memory_size(self) -> int:
            return self.network_body.memory_size

        def update_normalization(self, buffer: AgentBuffer) -> None:
            self.network_body.update_normalization(buffer)

        def baseline(
            self,
            self_obs: List[List[torch.Tensor]],
            obs: List[List[torch.Tensor]],
            actions: List[AgentAction],
            memories: Optional[torch.Tensor] = None,
            sequence_length: int = 1,
        ) -> Tuple[torch.Tensor, torch.Tensor]:
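            """
            Baseline value estimate for a single agent: the encoder receives this
            agent's observations without its action (``obs_only``) and the
            groupmates' observations together with their actions, so the estimate
            does not condition on the agent's own action.
            """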
|
|
|
            encoding, memories = self.network_body(
                obs_only=self_obs,
                obs=obs,
                actions=actions,
                memories=memories,
                sequence_length=sequence_length,
            )
            value_outputs, critic_mem_out = self.forward(
                encoding, memories, sequence_length
            )
            return value_outputs, critic_mem_out

        def critic_pass(
            self,
            obs: List[List[torch.Tensor]],
            memories: Optional[torch.Tensor] = None,
            sequence_length: int = 1,
        ) -> Tuple[torch.Tensor, torch.Tensor]:
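            """
            Centralized value estimate: all observations (this agent's and the
            groupmates') are passed through ``obs_only`` with no actions, giving
            the joint state value rather than the per-agent baseline.
            """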
|
|
|
            encoding, memories = self.network_body(
                obs_only=obs,
                obs=None,
                actions=None,
                memories=memories,
                sequence_length=sequence_length,
            )
            value_outputs, critic_mem_out = self.forward(
                encoding, memories, sequence_length
            )
            return value_outputs, critic_mem_out

        def forward(
            self,
            encoding: torch.Tensor,
            memories: Optional[torch.Tensor] = None,
            sequence_length: int = 1,
        ) -> Tuple[torch.Tensor, torch.Tensor]:
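            # Maps an already-computed encoding to one value per reward stream;
            # memories are returned unchanged so callers can thread them through.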
|
|
|
            output = self.value_heads(encoding)
            return output, memories

    def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):
        """
        Takes a Policy and a Dict of trainer parameters and creates an Optimizer
        around the policy.
        """
        super().__init__(policy, trainer_settings)

        reward_signal_configs = trainer_settings.reward_signals
        reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]

        self._critic = TorchCOMAOptimizer.COMAValueNetwork(
            reward_signal_names,
            policy.behavior_spec.observation_specs,
            network_settings=trainer_settings.network_settings,
            action_spec=policy.behavior_spec.action_spec,
        )

        params = list(self.policy.actor.parameters()) + list(self.critic.parameters())
        self.hyperparameters: PPOSettings = cast(
            PPOSettings, trainer_settings.hyperparameters
        )

        value_loss = torch.mean(torch.stack(value_losses))
        return value_loss
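
    # Sketch of the PPO clipped surrogate objective that ppo_policy_loss below is
    # expected to compute, with ratio r_t = exp(log_probs - old_log_probs) and
    # advantages A_t:
    #   L_policy = -mean(min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t))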
|
|
|
|
|
|
|
    def ppo_policy_loss(
        self,
        advantages: torch.Tensor,
        log_probs: torch.Tensor,
|
|
|
        decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
        returns = {}
        old_values = {}
        old_baseline_values = {}
        for name in self.reward_signals:
            old_values[name] = ModelUtils.list_to_tensor(
                batch[RewardSignalUtil.value_estimates_key(name)]
            )
            old_baseline_values[name] = ModelUtils.list_to_tensor(
                batch[RewardSignalUtil.baseline_estimates_key(name)]
            )
            returns[name] = ModelUtils.list_to_tensor(
                batch[RewardSignalUtil.returns_key(name)]
            )

        n_obs = len(self.policy.behavior_spec.observation_specs)
        current_obs = ObsUtil.from_buffer(batch, n_obs)
        current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]

        group_obs = GroupObsUtil.from_buffer(batch, n_obs)
        group_obs = [
            [ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs]
            for _groupmate_obs in group_obs
        ]
|
|
|
        group_actions = AgentAction.group_from_buffer(batch)
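        # The groupmates' observations and actions gathered above feed the
        # centralized critic (critic_pass) and the counterfactual baseline below.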
|
|
|
|
|
|
|
        memories = [
            ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])
            for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
        ]
|
|
|
            memories=memories,
            seq_len=self.policy.sequence_length,
        )
        all_obs = [current_obs] + group_obs
        values, _ = self.critic.critic_pass(
            all_obs, memories=memories, sequence_length=self.policy.sequence_length
        )
        baselines, _ = self.critic.baseline(
            [current_obs],
            group_obs,
            group_actions,
            memories=memories,
            sequence_length=self.policy.sequence_length,
        )
        baseline_loss = self.coma_value_loss(
            baselines, old_baseline_values, returns, decay_eps, loss_masks
        )
        value_loss = self.coma_value_loss(
            values, old_values, returns, decay_eps, loss_masks
        )
        policy_loss = self.ppo_policy_loss(

        )
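        # Total loss: the PPO policy surrogate, plus a half-weighted sum of the
        # centralized value loss and the counterfactual baseline loss, minus a
        # decaying entropy bonus.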
|
|
|
        loss = (
            policy_loss
            + 0.5 * (value_loss + baseline_loss)
            - decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
        )

|
|
|
|
|
|
|
        for reward_provider in self.reward_signals.values():
            modules.update(reward_provider.get_modules())
        return modules

    def get_trajectory_value_estimates(
        self,
        batch: AgentBuffer,
        next_obs: List[np.ndarray],
        next_group_obs: List[List[np.ndarray]],
        done: bool,
    ) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray], Dict[str, np.ndarray]]:
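        """
        Runs the centralized critic and the per-agent baseline over a whole
        trajectory, and also evaluates the critic on the trajectory's successor
        observations, returning (value_estimates, baseline_estimates,
        next_value_estimates) with one entry per reward signal.
        """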
|
|
|
|
|
|
|
        n_obs = len(self.policy.behavior_spec.observation_specs)

        current_obs = ObsUtil.from_buffer(batch, n_obs)
        team_obs = GroupObsUtil.from_buffer(batch, n_obs)

        current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
        team_obs = [
            [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
            for _teammate_obs in team_obs
        ]

        team_actions = AgentAction.group_from_buffer(batch)
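        # Teammates' actions are only needed for the baseline, which conditions on
        # what the groupmates did while leaving out this agent's own action.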
|
|
|
|
|
|
|
        next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
        next_obs = [obs.unsqueeze(0) for obs in next_obs]

|
|
|
|
        next_group_obs = [
            ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_group_obs
        ]
        # Expand dimensions of next critic obs
        next_group_obs = [
            [_obs.unsqueeze(0) for _obs in _list_obs] for _list_obs in next_group_obs
        ]

|
|
|
|
        memory = torch.zeros([1, 1, self.policy.m_size])
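        # The trajectory is evaluated as a single sequence from a zero initial
        # memory; sequence_length is set to the number of experiences in the batch.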
|
|
|
        all_obs = [current_obs] + team_obs if team_obs is not None else [current_obs]
        value_estimates, mem = self.critic.critic_pass(
            all_obs, memory, sequence_length=batch.num_experiences
        )

        baseline_estimates, mem = self.critic.baseline(
            [current_obs],
            team_obs,
            team_actions,
            memory,
            sequence_length=batch.num_experiences,
        )

        all_next_obs = (
            [next_obs] + next_group_obs if next_group_obs is not None else [next_obs]
        )

        next_value_estimates, mem = self.critic.critic_pass(
            all_next_obs, mem, sequence_length=batch.num_experiences
        )
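
        # next_value_estimates bootstrap the returns for the final step of the
        # trajectory; they are zeroed below when the episode terminated, unless a
        # reward signal is configured to ignore the done flag.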
|
|
|
|
|
|
|
        for name, estimate in baseline_estimates.items():
            baseline_estimates[name] = ModelUtils.to_numpy(estimate)

|
|
|
        for name, estimate in value_estimates.items():
            value_estimates[name] = ModelUtils.to_numpy(estimate)

        # The baseline and V should not be on the same done flag.
|
|
|
        for name, estimate in next_value_estimates.items():
            next_value_estimates[name] = ModelUtils.to_numpy(estimate)

        if done:
            for k in next_value_estimates:
                if not self.reward_signals[k].ignore_done:
                    next_value_estimates[k][-1] = 0.0

|
|
|
        return (value_estimates, baseline_estimates, next_value_estimates)