
Back out trainer changes

/develop/coma2/samenet
Ervin Teng, 4 years ago
Current commit 0bde7598
9 files changed, with 88 additions and 807 deletions
  1. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (109 changes)
  2. ml-agents/mlagents/trainers/policy/torch_policy.py (10 changes)
  3. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (66 changes)
  4. ml-agents/mlagents/trainers/ppo/trainer.py (143 changes)
  5. ml-agents/mlagents/trainers/torch/attention.py (89 changes)
  6. ml-agents/mlagents/trainers/torch/components/bc/module.py (2 changes)
  7. ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py (3 changes)
  8. ml-agents/mlagents/trainers/torch/encoders.py (25 changes)
  9. ml-agents/mlagents/trainers/torch/networks.py (448 changes)

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (109 changes)


from typing import Dict, Optional, Tuple, List
from mlagents.torch_utils import torch
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.trajectory import ObsUtil, TeamObsUtil
from mlagents.trainers.trajectory import ObsUtil
from mlagents.trainers.torch.components.bc.module import BCModule
from mlagents.trainers.torch.components.reward_providers import create_reward_provider

)
def get_trajectory_value_estimates(
self,
batch: AgentBuffer,
next_obs: List[np.ndarray],
next_critic_obs: List[List[np.ndarray]],
done: bool,
all_dones: bool,
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
team_obs = TeamObsUtil.from_buffer(batch, n_obs)
# next_obs = ObsUtil.from_buffer_next(batch, n_obs)
# next_team_obs = TeamObsUtil.from_buffer_next(batch, n_obs)
team_obs = [
[ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
for _teammate_obs in team_obs
]
# next_team_obs = [
# [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
# for _teammate_obs in next_team_obs
# ]
actions = AgentAction.from_dict(batch)
team_actions = AgentAction.from_team_dict(batch)
# next_actions = AgentAction.from_dict_next(batch)
# next_team_actions = AgentAction.from_team_dict_next(batch)
next_obs = [obs.unsqueeze(0) for obs in next_obs]
# critic_obs = TeamObsUtil.from_buffer(batch, n_obs)
# critic_obs = [
# [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
# for _teammate_obs in critic_obs
# ]
next_critic_obs = [
ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_critic_obs
]
# Expand dimensions of next critic obs
next_critic_obs = [
[_obs.unsqueeze(0) for _obs in _list_obs] for _list_obs in next_critic_obs
]
baseline_estimates, _ = self.policy.actor_critic.critic_pass(
current_obs,
actions,
memory,
sequence_length=batch.num_experiences,
team_obs=team_obs,
team_act=team_actions,
)
next_obs = [obs.unsqueeze(0) for obs in next_obs]
value_estimates, mem = self.policy.actor_critic.target_critic_value(
current_obs,
memory,
sequence_length=batch.num_experiences,
team_obs=team_obs,
value_estimates, next_memory = self.policy.actor_critic.critic_pass(
current_obs, memory, sequence_length=batch.num_experiences
boot_value_estimates, mem = self.policy.actor_critic.target_critic_value(
next_obs,
memory,
sequence_length=batch.num_experiences,
team_obs=next_critic_obs,
next_value_estimate, _ = self.policy.actor_critic.critic_pass(
next_obs, next_memory, sequence_length=1
# next_value_estimates, next_marg_val_estimates, next_mem = self.policy.actor_critic.target_critic_pass(
# next_obs,
# next_actions,
# memory,
# sequence_length=batch.num_experiences,
# team_obs=next_team_obs,
# team_act=next_team_actions,
# )
# # Actions is a hack here, we need the next actions
# next_value_estimate, next_marg_val_estimate, _ = self.policy.actor_critic.critic_pass(
# next_obs, actions, next_memory, sequence_length=1, critic_obs=next_critic_obs
# )
# These aren't used in COMAttention
for name, estimate in baseline_estimates.items():
baseline_estimates[name] = ModelUtils.to_numpy(estimate)
next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name])
# the baseline and V should not be on the same done flag
for name, estimate in boot_value_estimates.items():
boot_value_estimates[name] = ModelUtils.to_numpy(estimate)
if all_dones:
for k in boot_value_estimates:
if done:
for k in next_value_estimate:
boot_value_estimates[k][-1] = 0.0
# else:
# print(len(next_critic_obs))
# print(baseline_estimates)
# print(value_estimates)
# print(boot_value_baseline[k][-1])
# if done and not all_dones:
# print("agent finished but team going")
# elif all_dones:
# print("alldone")
# else:
# print("neither")
# print("final", boot_value_estimates)
# print("value", value_estimates)
# print("base", baseline_estimates)
next_value_estimate[k] = 0.0
return (value_estimates, baseline_estimates, boot_value_estimates)
return value_estimates, next_value_estimate
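
The interleaved hunks above differ mainly in how the bootstrap value is zeroed at the end of a trajectory: the multi-agent variant zeroes the last bootstrap estimate once all teammates are done, while the reverted variant zeroes the next-step value when this agent alone is done. A minimal sketch of that idea, with hypothetical names, assuming the estimates are stored per reward stream as numpy arrays:

    import numpy as np
    from typing import Dict

    def zero_terminal_bootstrap(
        boot_values: Dict[str, np.ndarray], episode_done: bool
    ) -> Dict[str, np.ndarray]:
        # A true terminal state has zero future value, so the last bootstrap
        # entry is zeroed; interrupted (max-step) trajectories keep their estimate.
        if episode_done:
            for name in boot_values:
                boot_values[name][-1] = 0.0
        return boot_values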

ml-agents/mlagents/trainers/policy/torch_policy.py (10 changes)


masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
seq_len: int = 1,
critic_obs: Optional[List[List[torch.Tensor]]] = None,
) -> Tuple[AgentAction, ActionLogProbs, torch.Tensor, torch.Tensor]:
"""
:param obs: List of observations.

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
seq_len: int = 1,
team_obs: Optional[List[List[torch.Tensor]]] = None,
team_act: Optional[List[AgentAction]] = None,
log_probs, entropies, baseline, values = self.actor_critic.get_stats_and_value(
obs, actions, masks, memories, seq_len, team_obs, team_act
log_probs, entropies, value_heads = self.actor_critic.get_stats_and_value(
obs, actions, masks, memories, seq_len
return log_probs, entropies, baseline, values
return log_probs, entropies, value_heads
@timed
def evaluate(

memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze(
0
)
run_out = {}
with torch.no_grad():
action, log_probs, entropy, memories = self.sample_actions(

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (66 changes)


from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.trajectory import ObsUtil, TeamObsUtil
from mlagents.trainers.trajectory import ObsUtil
class TorchPPOOptimizer(TorchOptimizer):

)
self.optimizer = torch.optim.Adam(
params,
lr=self.trainer_settings.hyperparameters.learning_rate,
weight_decay=1e-6,
params, lr=self.trainer_settings.hyperparameters.learning_rate
)
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",

self.stream_names = list(self.reward_signals.keys())
# ModelUtils.soft_update(
# self.policy.actor_critic.critic, self.policy.actor_critic.target, 1.0
# )
def ppo_value_loss(
self,
values: Dict[str, torch.Tensor],

value_loss = torch.mean(torch.stack(value_losses))
return value_loss
def coma_regularizer_loss(
self, values: Dict[str, torch.Tensor], baseline_values: Dict[str, torch.Tensor]
):
reg_losses = []
for name, head in values.items():
reg_loss = torch.nn.functional.mse_loss(head, baseline_values[name])
reg_losses.append(reg_loss)
value_loss = torch.mean(torch.stack(reg_losses))
return value_loss
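
coma_regularizer_loss averages a per-stream mean-squared error between the value heads and the baseline heads; written out, with S the set of reward streams:

\[
\mathcal{L}_{\text{reg}} = \frac{1}{|S|} \sum_{s \in S} \operatorname{MSE}\big(V_s, B_s\big)
\]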
def ppo_policy_loss(
self,
advantages: torch.Tensor,

decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step())
decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
# returns_b = {}
returns_v = {}
returns = {}
old_marg_values = {}
old_marg_values[name] = ModelUtils.list_to_tensor(
batch[f"{name}_baseline_estimates"]
)
returns_v[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns_v"])
# returns_b[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns_b"])
#
returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])
n_obs = len(self.policy.behavior_spec.observation_specs)
current_obs = ObsUtil.from_buffer(batch, n_obs)

team_obs = TeamObsUtil.from_buffer(batch, n_obs)
team_obs = [
[ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
for _teammate_obs in team_obs
]
team_actions = AgentAction.from_team_dict(batch)
# next_team_actions = AgentAction.from_team_dict_next(batch)
memories = [
ModelUtils.list_to_tensor(batch["memory"][i])

memories = torch.stack(memories).unsqueeze(0)
log_probs, entropy, baseline_vals, values = self.policy.evaluate_actions(
log_probs, entropy, values = self.policy.evaluate_actions(
team_obs=team_obs,
team_act=team_actions,
# q_loss = self.ppo_value_loss(qs, old_values, returns_q, decay_eps, loss_masks)
# Use trust region from value, not baseline
baseline_loss = self.ppo_value_loss(
baseline_vals, old_marg_values, returns_v, decay_eps, loss_masks
)
values, old_values, returns_v, decay_eps, loss_masks
values, old_values, returns, decay_eps, loss_masks
# Regularizer loss reduces bias between the baseline and values. Other
# regularizers are possible here.
# regularizer_loss = self.coma_regularizer_loss(values, baseline_vals)
policy_loss = self.ppo_policy_loss(
ModelUtils.list_to_tensor(batch["advantages"]),
log_probs,

loss = (
policy_loss
+ 0.5 * (value_loss + 0.5 * baseline_loss)
# + 0.25 * regularizer_loss
+ 0.5 * value_loss
- decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
)
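
Reading the interleaved lines as the two sides of this revert, the combined objectives are (with the decayed entropy coefficient written as beta and the regularizer term commented out in both):

\[
\mathcal{L}_{\text{COMA-side}} = \mathcal{L}_{\text{policy}} + 0.5\,\big(\mathcal{L}_{\text{value}} + 0.5\,\mathcal{L}_{\text{baseline}}\big) - \beta\,\bar{\mathcal{H}}[\pi]
\]
\[
\mathcal{L}_{\text{PPO-side}} = \mathcal{L}_{\text{policy}} + 0.5\,\mathcal{L}_{\text{value}} - \beta\,\bar{\mathcal{H}}[\pi]
\]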

loss.backward()
self.optimizer.step()
# ModelUtils.soft_update(
# self.policy.actor_critic.critic, self.policy.actor_critic.target, 1.0
# )
# "Losses/Q Loss": q_loss.item(),
"Losses/Baseline Value Loss": baseline_loss.item(),
# "Losses/Regularization Loss": regularizer_loss.item(),
"Policy/Learning Rate": decay_lr,
"Policy/Epsilon": decay_eps,
"Policy/Beta": decay_bet,

ml-agents/mlagents/trainers/ppo/trainer.py (143 changes)


self.policy.update_normalization(agent_buffer_trajectory)
# Get all value estimates
(
value_estimates,
baseline_estimates,
value_next,
) = self.optimizer.get_trajectory_value_estimates(
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
trajectory.next_collab_obs,
trajectory.teammate_dones_reached
and trajectory.done_reached
and not trajectory.interrupted,
agent_buffer_trajectory[f"{name}_baseline_estimates"].extend(
baseline_estimates[name]
)
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Baseline Estimate",
np.mean(baseline_estimates[name]),
)
np.mean(value_estimates[name]),
np.mean(v),
# Evaluate all reward functions
self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]
)

tmp_advantages = []
tmp_returns = []
for name in self.optimizer.reward_signals:
bootstrap_value = value_next[name]
baseline_estimates = agent_buffer_trajectory[
f"{name}_baseline_estimates"
local_value_estimates = agent_buffer_trajectory[
f"{name}_value_estimates"
v_estimates = agent_buffer_trajectory[f"{name}_value_estimates"].get_batch()
# next_value_estimates = agent_buffer_trajectory[
# f"{name}_value_estimates_next"
# ].get_batch()
# next_m_value_estimates = agent_buffer_trajectory[
# f"{name}_marginalized_value_estimates_next"
# ].get_batch()
returns_v, returns_b = get_team_returns(
local_advantage = get_gae(
baseline_estimates=baseline_estimates,
v_estimates=v_estimates,
value_next=value_next[name],
value_estimates=local_value_estimates,
value_next=bootstrap_value,
# print("loc", local_rewards[-1])
# print("tdlam", returns_v)
# local_advantage = get_team_gae(
# rewards=local_rewards,
# value_estimates=v_estimates,
# baseline=baseline_estimates,
# value_next=value_next[name],
# gamma=self.optimizer.reward_signals[name].gamma,
# lambd=self.hyperparameters.lambd,
# )
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} TD Lam",
np.mean(returns_v),
)
# local_advantage = np.array(returns_v) - baseline_estimates
local_advantage = np.array(returns_v) - np.array(baseline_estimates)
# self._stats_reporter.add_stat(
# f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} GAE Advantage Estimate",
# np.mean(gae_advantage),
# )
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} TD Advantage Estimate",
np.mean(local_advantage),
)
local_return = local_advantage + baseline_estimates
# local_return = local_advantage + q_estimates
local_return = local_advantage + local_value_estimates
# agent_buffer_trajectory[f"{name}_returns"].set(local_return)
agent_buffer_trajectory[f"{name}_returns_b"].set(returns_v)
agent_buffer_trajectory[f"{name}_returns_v"].set(returns_v)
agent_buffer_trajectory[f"{name}_returns"].set(local_return)
agent_buffer_trajectory[f"{name}_advantage"].set(local_advantage)
tmp_advantages.append(local_advantage)
tmp_returns.append(local_return)

)
global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
agent_buffer_trajectory["advantages"].set(global_advantages)
agent_buffer_trajectory["discounted_returns"].set(global_returns)
# Append to update buffer
agent_buffer_trajectory.resequence_and_append(

n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
# Normalize advantages
advantages = np.array(self.update_buffer["advantages"].get_batch())
advantages = self.update_buffer["advantages"].get_batch()
list((advantages - advantages.mean()) / (advantages.std() + 1e-10))
(advantages - advantages.mean()) / (advantages.std() + 1e-10)
)
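
Both variants apply the same advantage normalization over the update buffer before the PPO update:

\[
\hat{A}_i = \frac{A_i - \mu_A}{\sigma_A + 10^{-10}}
\]

where \(\mu_A\) and \(\sigma_A\) are the mean and standard deviation of the buffered advantages.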
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)

return discounted_r
def lambd_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
returns = np.zeros_like(r)
returns[-1] = r[-1] + gamma * value_next
for t in reversed(range(0, r.size - 1)):
returns[t] = (
gamma * lambd * returns[t + 1]
+ r[t]
+ (1 - lambd) * gamma * value_estimates[t + 1]
)
return returns
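
lambd_return implements the backward lambda-return recursion; written out, matching the code above:

\[
G_T = r_T + \gamma V_{\text{next}}, \qquad
G_t = r_t + \gamma\big[(1-\lambda)\,V(s_{t+1}) + \lambda\,G_{t+1}\big] \quad \text{for } t < T
\]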
def get_team_gae(
rewards, value_estimates, baseline, value_next=0.0, gamma=0.99, lambd=0.95
):
"""
Computes generalized advantage estimate for use in updating policy.
:param rewards: list of rewards for time-steps t to T.
:param value_next: Value estimate for time-step T+1.
:param value_estimates: list of value estimates for time-steps t to T.
:param gamma: Discount factor.
:param lambd: GAE weighing factor.
:return: list of advantage estimates for time-steps t to T.
"""
value_estimates = np.append(value_estimates, value_next)
delta_t = rewards + gamma * value_estimates[1:] - baseline
advantage = discount_rewards(r=delta_t, gamma=gamma * lambd)
return advantage
def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
"""
Computes generalized advantage estimate for use in updating policy.

delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1]
advantage = discount_rewards(r=delta_t, gamma=gamma * lambd)
return advantage
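
get_gae computes standard GAE, and get_team_gae is the same computation with the per-agent baseline substituted for \(V(s_t)\) in the TD error:

\[
\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t), \qquad
A_t^{\text{GAE}} = \sum_{l \ge 0} (\gamma\lambda)^{l}\,\delta_{t+l}
\]
\[
\delta_t^{\text{team}} = r_t + \gamma V(s_{t+1}) - B_t
\]

where \(B_t\) is the baseline estimate at step \(t\).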
def get_team_returns(
rewards,
baseline_estimates,
v_estimates,
value_next=0.0,
died=False,
gamma=0.99,
lambd=0.8,
):
"""
Computes generalized advantage estimate for use in updating policy.
:param rewards: list of rewards for time-steps t to T.
:param value_next: Value estimate for time-step T+1.
:param value_estimates: list of value estimates for time-steps t to T.
:param gamma: Discount factor.
:param lambd: GAE weighing factor.
:return: list of advantage estimates for time-steps t to T.
"""
rewards = np.array(rewards)
returns_b = lambd_return(
rewards, baseline_estimates, gamma=gamma, lambd=lambd, value_next=value_next
)
returns_v = lambd_return(
rewards, v_estimates, gamma=gamma, lambd=lambd, value_next=value_next
)
return returns_v, returns_b
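
A minimal usage sketch of the helpers above, with made-up rewards and estimates, to show the shapes involved:

    import numpy as np

    rewards = np.array([0.0, 0.0, 1.0], dtype=np.float32)
    baseline_estimates = np.array([0.4, 0.5, 0.6], dtype=np.float32)
    v_estimates = np.array([0.5, 0.6, 0.7], dtype=np.float32)

    # Both outputs have the same length as rewards; returns_v bootstraps off the
    # centralized value estimates, returns_b off the counterfactual baseline.
    returns_v, returns_b = get_team_returns(
        rewards, baseline_estimates, v_estimates, value_next=0.0, gamma=0.99, lambd=0.8
    )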

ml-agents/mlagents/trainers/torch/attention.py (89 changes)


class MultiHeadAttention(torch.nn.Module):
"""
Multi Head Attention module. We do not use the regular Torch implementation since
Barracuda does not support some operators it uses.
Takes as input to the forward method 3 tensors:
- query: of dimensions (batch_size, number_of_queries, embedding_size)
- key: of dimensions (batch_size, number_of_keys, embedding_size)
- value: of dimensions (batch_size, number_of_keys, embedding_size)
The forward method will return 2 tensors:
- The output: (batch_size, number_of_queries, embedding_size)
- The attention matrix: (batch_size, num_heads, number_of_queries, number_of_keys)
"""
"""
Multi Head Attention module. We do not use the regular Torch implementation since
Barracuda does not support some operators it uses.
Takes as input to the forward method 3 tensors:
- query: of dimensions (batch_size, number_of_queries, embedding_size)
- key: of dimensions (batch_size, number_of_keys, embedding_size)
- value: of dimensions (batch_size, number_of_keys, embedding_size)
The forward method will return 2 tensors:
- The output: (batch_size, number_of_queries, embedding_size)
- The attention matrix: (batch_size, num_heads, number_of_queries, number_of_keys)
:param embedding_size: The size of the embeddings that will be generated (should be
divisible by num_heads)
:param total_max_elements: The maximum total number of entities that can be passed to
the module
:param num_heads: The number of heads of the attention module
"""
super().__init__()
self.n_heads = num_heads
self.head_size: int = embedding_size // self.n_heads

return value_attention, att
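
Aside from the key masking (and any Barracuda-specific workarounds), the forward pass computes standard scaled dot-product attention per head, with the per-head outputs concatenated back to embedding_size:

\[
\operatorname{Attention}(Q, K, V) = \operatorname{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_{\text{head}}}}\right) V,
\qquad d_{\text{head}} = \frac{\text{embedding\_size}}{\text{num\_heads}}
\]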
class EntityEmbedding(torch.nn.Module):
class EntityEmbeddings(torch.nn.Module):
"""
A module used to embed entities before passing them to a self-attention block.
Used in conjunction with ResidualSelfAttention to encode information about a self
and additional entities. Can also concatenate self to entities for ego-centric self-
attention. Inspired by architecture used in https://arxiv.org/pdf/1909.07528.pdf.
"""
entity_size: int,
entity_num_max_elements: Optional[int],
entity_sizes: List[int],
entity_num_max_elements: Optional[List[int]] = None,
:param entity_size: Size of other entity.
:param entity_num_max_elements: Maximum elements for a given entity, None for unrestricted.
:param entity_sizes: List of sizes for other entities. Should be of length
equivalent to the number of entities.
:param embedding_size: Embedding size for entity encoders.
:param entity_num_max_elements: Maximum elements in an entity, None for unrestricted.
:param embedding_size: Embedding size for the entity encoder.
:param concat_self: Whether to concatenate x_self to entities. Set True for ego-centric
self.entity_size: int = entity_size
self.entity_num_max_elements: int = -1
self.entity_sizes: List[int] = entity_sizes
self.entity_num_max_elements: List[int] = [-1] * len(entity_sizes)
if entity_num_max_elements is not None:
self.entity_num_max_elements = entity_num_max_elements

)
self.embedding_norm = LayerNorm()
def forward(self, x_self: torch.Tensor, entities: torch.Tensor) -> torch.Tensor:
def forward(
self, x_self: torch.Tensor, entities: List[torch.Tensor]
) -> Tuple[torch.Tensor, int]:
num_entities = self.entity_num_max_elements
if num_entities < 0:
if exporting_to_onnx.is_exporting():
raise UnityTrainerException(
"Trying to export an attention mechanism that doesn't have a set max \
number of elements."
)
num_entities = entities.shape[1]
expanded_self = x_self.reshape(-1, 1, self.self_size)
expanded_self = torch.cat([expanded_self] * num_entities, dim=1)
# Concatenate all observations with self
self_and_ent: List[torch.Tensor] = []
for num_entities, ent in zip(self.entity_num_max_elements, entities):

def __init__(
self,
embedding_size: int,
entity_num_max_elements: Optional[int] = None,
entity_num_max_elements: Optional[List[int]] = None,
num_heads: int = 4,
):
"""

super().__init__()
self.max_num_ent: Optional[int] = None
if entity_num_max_elements is not None:
self.max_num_ent = entity_num_max_elements
_entity_num_max_elements = entity_num_max_elements
self.max_num_ent = sum(_entity_num_max_elements)
self.attention = MultiHeadAttention(
num_heads=num_heads, embedding_size=embedding_size

def forward(self, inp: torch.Tensor, key_masks: List[torch.Tensor]) -> torch.Tensor:
# Gather the maximum number of entities information
mask = torch.cat(key_masks, dim=1)
inp = self.embedding_norm(inp)
# Feed to self attention
query = self.fc_q(inp) # (b, n_q, emb)
key = self.fc_k(inp) # (b, n_k, emb)

denominator = torch.sum(1 - mask, dim=1, keepdim=True) + self.EPSILON
output = numerator / denominator
return output
@staticmethod
def get_masks(observations: List[torch.Tensor]) -> List[torch.Tensor]:
"""
Takes a List of Tensors and returns a List of mask Tensor with 1 if the input was
all zeros (on dimension 2) and 0 otherwise. This is used in the Attention
layer to mask the padding observations.
"""
with torch.no_grad():
# Generate the masking tensors for each entity tensor (mask only if all zeros)
key_masks: List[torch.Tensor] = [
(torch.sum(ent ** 2, axis=2) < 0.01).type(torch.FloatTensor)
for ent in observations
]
return key_masks
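
A minimal, self-contained illustration of the convention get_masks encodes, using a hypothetical padded entity tensor:

    import torch

    # One batch, three entity slots of size 2; the last slot is zero padding.
    entities = torch.tensor([[[1.0, 2.0], [0.5, 0.0], [0.0, 0.0]]])
    # Same test as get_masks above: an entity is masked (1.0) iff it is all zeros.
    key_mask = (torch.sum(entities ** 2, dim=2) < 0.01).type(torch.FloatTensor)
    # key_mask == tensor([[0., 0., 1.]])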

ml-agents/mlagents/trainers/torch/components/bc/module.py (2 changes)


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.buffer import AgentBuffer
class BCModule:

ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py (3 changes)


class ExtrinsicRewardProvider(BaseRewardProvider):
def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
return np.array(mini_batch["average_team_reward"], dtype=np.float32)
# return np.array(mini_batch["environment_rewards"], dtype=np.float32)
return np.array(mini_batch["environment_rewards"], dtype=np.float32)
def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:
return {}
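
The two return statements show the reward source switching between an averaged team reward and the plain per-agent environment reward. A sketch of the averaging this implies (the exact semantics of "average_team_reward" are assumed from its name; the values are hypothetical):

    import numpy as np

    # Hypothetical per-step rewards for two teammates over three steps.
    agent_rewards = np.array([[1.0, 0.0, 0.5],
                              [0.0, 0.0, 0.5]], dtype=np.float32)
    average_team_reward = agent_rewards.mean(axis=0)  # -> [0.5, 0.0, 0.5]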

ml-agents/mlagents/trainers/torch/encoders.py (25 changes)


return height, width
class InputProcessor:
def copy_normalization(self, other_input: "InputProcessor") -> None:
pass
def update_normalization(self, inputs: torch.Tensor) -> None:
pass
class VectorInput(nn.Module, InputProcessor):
class VectorInput(nn.Module):
def __init__(self, input_size: int, normalize: bool = False):
super().__init__()
self.normalizer: Optional[Normalizer] = None

inputs = self.normalizer(inputs)
return inputs
def copy_normalization(self, other_input: "InputProcessor") -> None:
if isinstance(other_input, VectorInput):
if self.normalizer is not None and other_input.normalizer is not None:
self.normalizer.copy_from(other_input.normalizer)
def copy_normalization(self, other_input: "VectorInput") -> None:
if self.normalizer is not None and other_input.normalizer is not None:
self.normalizer.copy_from(other_input.normalizer)
def update_normalization(self, inputs: torch.Tensor) -> None:
if self.normalizer is not None:

class SmallVisualEncoder(nn.Module, InputProcessor):
class SmallVisualEncoder(nn.Module):
"""
CNN architecture used by King in their Candy Crush predictor
https://www.researchgate.net/publication/328307928_Human-Like_Playtesting_with_Deep_Learning

return self.dense(hidden)
class SimpleVisualEncoder(nn.Module, InputProcessor):
class SimpleVisualEncoder(nn.Module):
def __init__(
self, height: int, width: int, initial_channels: int, output_size: int
):

return self.dense(hidden)
class NatureVisualEncoder(nn.Module, InputProcessor):
class NatureVisualEncoder(nn.Module):
def __init__(
self, height: int, width: int, initial_channels: int, output_size: int
):

return input_tensor + self.layers(input_tensor)
class ResNetVisualEncoder(nn.Module, InputProcessor):
class ResNetVisualEncoder(nn.Module):
def __init__(
self, height: int, width: int, initial_channels: int, output_size: int
):

ml-agents/mlagents/trainers/torch/networks.py (448 changes)


from mlagents.trainers.torch.encoders import VectorInput
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trajectory import ObsUtil
from mlagents.trainers.torch.attention import ResidualSelfAttention, EntityEmbedding
ActivationFunction = Callable[[torch.Tensor], torch.Tensor]

inputs = torch.cat(encodes + [actions], dim=-1)
else:
inputs = torch.cat(encodes, dim=-1)
encoding = self.linear_encoder(inputs)
if self.use_lstm:
# Resize to (batch, sequence length, encoding size)
encoding = encoding.reshape([-1, sequence_length, self.h_size])
encoding, memories = self.lstm(encoding, memories)
encoding = encoding.reshape([-1, self.m_size // 2])
return encoding, memories
# NOTE: this class will be replaced with a multi-head attention when the time comes
class MultiInputNetworkBody(nn.Module):
def __init__(
self,
sensor_specs: List[SensorSpec],
network_settings: NetworkSettings,
action_spec: ActionSpec,
):
super().__init__()
self.normalize = network_settings.normalize
self.use_lstm = network_settings.memory is not None
# Scale network depending on num agents
self.h_size = network_settings.hidden_units
self.m_size = (
network_settings.memory.memory_size
if network_settings.memory is not None
else 0
)
self.processors, _input_size = ModelUtils.create_input_processors(
sensor_specs,
self.h_size,
network_settings.vis_encode_type,
normalize=self.normalize,
)
self.action_spec = action_spec
# Modules for self-attention
obs_only_ent_size = sum(_input_size)
q_ent_size = (
sum(_input_size)
+ sum(self.action_spec.discrete_branches)
+ self.action_spec.continuous_size
)
self.obs_encoder = EntityEmbedding(
0, obs_only_ent_size, None, self.h_size, concat_self=False
)
self.obs_action_encoder = EntityEmbedding(
0, q_ent_size, None, self.h_size, concat_self=False
)
self.self_attn = ResidualSelfAttention(self.h_size)
encoder_input_size = self.h_size
self.linear_encoder = LinearEncoder(
encoder_input_size, network_settings.num_layers, self.h_size
)
if self.use_lstm:
self.lstm = LSTM(self.h_size, self.m_size)
else:
self.lstm = None # type: ignore
@property
def memory_size(self) -> int:
return self.lstm.memory_size if self.use_lstm else 0
def update_normalization(self, buffer: AgentBuffer) -> None:
obs = ObsUtil.from_buffer(buffer, len(self.processors))
for vec_input, enc in zip(obs, self.processors):
if isinstance(enc, VectorInput):
enc.update_normalization(torch.as_tensor(vec_input))
def copy_normalization(self, other_network: "NetworkBody") -> None:
if self.normalize:
for n1, n2 in zip(self.processors, other_network.processors):
if isinstance(n1, VectorInput) and isinstance(n2, VectorInput):
n1.copy_normalization(n2)
def _get_masks_from_nans(self, obs_tensors: List[torch.Tensor]) -> torch.Tensor:
"""
Get attention masks by grabbing an arbitrary obs across all the agents
Since these are raw obs, the padded values are still NaN
"""
only_first_obs = [_all_obs[0] for _all_obs in obs_tensors]
obs_for_mask = torch.stack(only_first_obs, dim=1)
# Get the mask from nans
attn_mask = torch.any(obs_for_mask.isnan(), dim=2).type(torch.FloatTensor)
return attn_mask
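
_get_masks_from_nans builds the attention mask from NaN padding in the raw teammate observations, rather than the all-zeros test used in attention.py. A self-contained sketch of the same computation on hypothetical tensors:

    import torch

    # Two agents; the second agent's observation slot is NaN padding (absent agent).
    obs_agent_0 = torch.tensor([[1.0, 2.0, 3.0]])
    obs_agent_1 = torch.full((1, 3), float("nan"))
    obs_for_mask = torch.stack([obs_agent_0, obs_agent_1], dim=1)  # (batch, agents, obs)
    attn_mask = torch.any(obs_for_mask.isnan(), dim=2).type(torch.FloatTensor)
    # attn_mask == tensor([[0., 1.]]): 1 marks padded agents.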
def q_net(
self,
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
self_attn_masks = []
concat_f_inp = []
for inputs, action in zip(obs, actions):
encodes = []
for idx, processor in enumerate(self.processors):
obs_input = inputs[idx]
obs_input[obs_input.isnan()] = 0.0 # Remove NaNs
processed_obs = processor(obs_input)
encodes.append(processed_obs)
cat_encodes = [
torch.cat(encodes, dim=-1),
action.to_flat(self.action_spec.discrete_branches),
]
concat_f_inp.append(torch.cat(cat_encodes, dim=1))
f_inp = torch.stack(concat_f_inp, dim=1)
self_attn_masks.append(self._get_masks_from_nans(obs))
encoding, memories = self.forward(
f_inp,
None,
self_attn_masks,
memories=memories,
sequence_length=sequence_length,
)
return encoding, memories
def baseline(
self,
self_obs: List[List[torch.Tensor]],
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
self_attn_masks = []
f_inp = None
concat_f_inp = []
for inputs, action in zip(obs, actions):
encodes = []
for idx, processor in enumerate(self.processors):
obs_input = inputs[idx]
obs_input[obs_input.isnan()] = 0.0 # Remove NaNs
processed_obs = processor(obs_input)
encodes.append(processed_obs)
cat_encodes = [
torch.cat(encodes, dim=-1),
action.to_flat(self.action_spec.discrete_branches),
]
concat_f_inp.append(torch.cat(cat_encodes, dim=1))
if concat_f_inp:
f_inp = torch.stack(concat_f_inp, dim=1)
self_attn_masks.append(self._get_masks_from_nans(obs))
concat_encoded_obs = []
encodes = []
for idx, processor in enumerate(self.processors):
obs_input = self_obs[idx]
obs_input[obs_input.isnan()] = 0.0 # Remove NaNs
processed_obs = processor(obs_input)
encodes.append(processed_obs)
concat_encoded_obs.append(torch.cat(encodes, dim=-1))
g_inp = torch.stack(concat_encoded_obs, dim=1)
# Get the mask from nans
self_attn_masks.append(self._get_masks_from_nans([self_obs]))
encoding, memories = self.forward(
f_inp,
g_inp,
self_attn_masks,
memories=memories,
sequence_length=sequence_length,
)
return encoding, memories
def value(
self,
obs: List[List[torch.Tensor]],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
self_attn_masks = []
concat_encoded_obs = []
for inputs in obs:
encodes = []
for idx, processor in enumerate(self.processors):
obs_input = inputs[idx]
obs_input[obs_input.isnan()] = 0.0 # Remove NaNs
processed_obs = processor(obs_input)
encodes.append(processed_obs)
concat_encoded_obs.append(torch.cat(encodes, dim=-1))
g_inp = torch.stack(concat_encoded_obs, dim=1)
# Get the mask from nans
self_attn_masks.append(self._get_masks_from_nans(obs))
encoding, memories = self.forward(
None,
g_inp,
self_attn_masks,
memories=memories,
sequence_length=sequence_length,
)
return encoding, memories
def forward(
self,
f_enc: torch.Tensor,
g_enc: torch.Tensor,
self_attn_masks: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
self_attn_inputs = []
if f_enc is not None:
self_attn_inputs.append(self.obs_action_encoder(None, f_enc))
if g_enc is not None:
self_attn_inputs.append(self.obs_encoder(None, g_enc))
encoded_entity = torch.cat(self_attn_inputs, dim=1)
encoded_state = self.self_attn(encoded_entity, self_attn_masks)
inputs = encoded_state
encoding = self.linear_encoder(inputs)
if self.use_lstm:

return output, memories
class CentralizedValueNetwork(ValueNetwork):
def __init__(
self,
stream_names: List[str],
observation_shapes: List[SensorSpec],
network_settings: NetworkSettings,
action_spec: ActionSpec,
outputs_per_stream: int = 1,
):
# This is not a typo, we want to call __init__ of nn.Module
nn.Module.__init__(self)
self.network_body = MultiInputNetworkBody(
observation_shapes, network_settings, action_spec=action_spec
)
if network_settings.memory is not None:
encoding_size = network_settings.memory.memory_size // 2
else:
encoding_size = network_settings.hidden_units
self.value_heads = ValueHeads(stream_names, encoding_size, outputs_per_stream)
def q_net(
self,
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
encoding, memories = self.network_body.q_net(
obs, actions, memories, sequence_length
)
output = self.value_heads(encoding)
return output, memories
def value(
self,
obs: List[List[torch.Tensor]],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
encoding, memories = self.network_body.value(obs, memories, sequence_length)
output = self.value_heads(encoding)
return output, memories
def baseline(
self,
self_obs: List[List[torch.Tensor]],
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
encoding, memories = self.network_body.baseline(
self_obs, obs, actions, memories, sequence_length
)
output = self.value_heads(encoding)
return output, memories
def forward(
self,
value_inputs: List[List[torch.Tensor]],
q_inputs: List[List[torch.Tensor]],
q_actions: List[AgentAction] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
encoding, memories = self.network_body(
value_inputs, q_inputs, q_actions, memories, sequence_length
)
output = self.value_heads(encoding)
return output, memories
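
CentralizedValueNetwork routes each head through the shared MultiInputNetworkBody and the per-stream value heads. A hypothetical call pattern, assuming `critic` is an already-constructed instance and the observation/action lists are batched as in the optimizer code above:

    # values, _   = critic.value(all_agent_obs)                         # V over joint obs
    # qs, _       = critic.q_net(all_agent_obs, all_agent_actions)      # Q over joint obs + actions
    # baseline, _ = critic.baseline(self_obs, teammate_obs, teammate_acts)  # counterfactual baseline
    # Each call returns a dict keyed by reward-stream name plus the updated critic memories.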
class Actor(abc.ABC):
@abc.abstractmethod
def update_normalization(self, buffer: AgentBuffer) -> None:

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
critic_obs: Optional[List[List[torch.Tensor]]] = None,
) -> Tuple[
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
team_obs: Optional[List[List[torch.Tensor]]] = None,
team_act: Optional[List[List[torch.Tensor]]] = None,
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
encoding, memories = self.network_body(
inputs, memories=memories, sequence_length=sequence_length

actor_mem = None
return actor_mem, critic_mem
def target_critic_value(
self,
inputs: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
team_obs: List[List[torch.Tensor]] = None,
) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], torch.Tensor]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
all_obs = [inputs]
if team_obs is not None and team_obs:
all_obs.extend(team_obs)
value_outputs, critic_mem_out = self.critic.value(
all_obs, memories=critic_mem, sequence_length=sequence_length
)
# if mar_value_outputs is None:
# mar_value_outputs = value_outputs
if actor_mem is not None:
# Make memories with the actor mem unchanged
memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
else:
memories_out = None
return value_outputs, memories_out
def critic_value(
self,
inputs: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
team_obs: List[List[torch.Tensor]] = None,
) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], torch.Tensor]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
all_obs = [inputs]
if team_obs is not None and team_obs:
all_obs.extend(team_obs)
value_outputs, critic_mem_out = self.critic.value(
all_obs, memories=critic_mem, sequence_length=sequence_length
)
# if mar_value_outputs is None:
# mar_value_outputs = value_outputs
if actor_mem is not None:
# Make memories with the actor mem unchanged
memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
else:
memories_out = None
return value_outputs, memories_out
def target_critic_pass(
self,
inputs: List[torch.Tensor],
actions: AgentAction,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
team_obs: List[List[torch.Tensor]] = None,
team_act: List[AgentAction] = None,
) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], torch.Tensor]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
all_obs = [inputs]
if team_obs is not None and team_obs:
all_obs.extend(team_obs)
all_acts = [actions]
if team_act is not None and team_act:
all_acts.extend(team_act)
baseline_outputs, _ = self.critic.baseline(
inputs,
team_obs,
team_act,
memories=critic_mem,
sequence_length=sequence_length,
)
value_outputs, critic_mem_out = self.critic.q_net(
all_obs, all_acts, memories=critic_mem, sequence_length=sequence_length
)
# if mar_value_outputs is None:
# mar_value_outputs = value_outputs
if actor_mem is not None:
# Make memories with the actor mem unchanged
memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
else:
memories_out = None
return value_outputs, baseline_outputs, memories_out
actions: AgentAction,
team_obs: List[List[torch.Tensor]] = None,
team_act: List[AgentAction] = None,
all_obs = [inputs]
if team_obs is not None and team_obs:
all_obs.extend(team_obs)
all_acts = [actions]
if team_act is not None and team_act:
all_acts.extend(team_act)
baseline_outputs, critic_mem_out = self.critic.baseline(
inputs,
team_obs,
team_act,
memories=critic_mem,
sequence_length=sequence_length,
value_outputs, critic_mem_out = self.critic(
inputs, memories=critic_mem, sequence_length=sequence_length
# q_out, critic_mem_out = self.critic.q_net(
# all_obs, all_acts, memories=critic_mem, sequence_length=sequence_length
# )
# if mar_value_outputs is None:
# mar_value_outputs = value_outputs
return baseline_outputs, memories_out
return value_outputs, memories_out
def get_stats_and_value(
self,

memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
team_obs: Optional[List[List[torch.Tensor]]] = None,
team_act: Optional[List[List[torch.Tensor]]] = None,
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
encoding, actor_mem_outs = self.network_body(

baseline_outputs, _ = self.critic_pass(
inputs,
actions,
memories=critic_mem,
sequence_length=sequence_length,
team_obs=team_obs,
team_act=team_act,
)
value_outputs, _ = self.target_critic_value(
inputs,
memories=critic_mem,
sequence_length=sequence_length,
team_obs=team_obs,
value_outputs, critic_mem_outs = self.critic(
inputs, memories=critic_mem, sequence_length=sequence_length
return log_probs, entropies, baseline_outputs, value_outputs
return log_probs, entropies, value_outputs
def get_action_stats(
self,

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
critic_obs: Optional[List[List[torch.Tensor]]] = None,
) -> Tuple[
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:

)
action, log_probs, entropies = self.action_model(encoding, masks)
all_net_inputs = [inputs]
if critic_obs is not None:
all_net_inputs.extend(critic_obs)
all_net_inputs, memories=critic_mem, sequence_length=sequence_length
inputs, memories=critic_mem, sequence_length=sequence_length
)
if self.use_lstm:
mem_out = torch.cat([actor_mem_outs, critic_mem_outs], dim=-1)
