from collections import defaultdict
from typing import Dict, List, Union

import numpy as np

from mlagents_envs.base_env import DecisionStep, TerminalStep, ActionTuple
from mlagents_envs.side_channel.stats_side_channel import (
    StatsAggregationMethod,
    EnvironmentStats,
)
from mlagents.trainers.trajectory import TeammateStatus, Trajectory, AgentExperience
from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo, ActionInfoOutputs
from mlagents.trainers.torch.action_log_probs import LogProbsTuple

# last_group_obs is used to collect the last seen obs of all the agents in the same group,
# and assemble the collab_obs.
self.last_group_obs: Dict[str, Dict[str, List[np.ndarray]]] = defaultdict(
    lambda: defaultdict(list)
)
# current_group_rewards is used to collect the last seen rewards of all the agents in the same group.
self.current_group_rewards: Dict[str, Dict[str, float]] = defaultdict(
    lambda: defaultdict(float)
)
# teammate_status collects the last seen TeammateStatus (obs, reward and action) of all the
# agents in the same group.
self.teammate_status: Dict[str, Dict[str, TeammateStatus]] = defaultdict(
    lambda: defaultdict(None)
)
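# All three dictionaries are keyed first by team_manager_id and then by the agent's global id,
# e.g. self.teammate_status[step.team_manager_id][global_id].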
# last_take_action_outputs stores the action a_t taken before the current observation s_(t+1), while
# grabbing previous_action from the policy grabs the action PRIOR to that, a_(t-1).
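# Concretely, when the step carrying s_(t+1) arrives for an agent:
#   self.last_take_action_outputs[global_id]          -> a_t     (the action paired with s_t)
#   self.policy.retrieve_previous_action([global_id]) -> a_(t-1) (stored as prev_action)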

def _process_step(
    self, step: Union[TerminalStep, DecisionStep], global_id: str
) -> None:
    stored_decision_step, idx = self.last_step_result.get(global_id, (None, None))
    stored_take_action_outputs = self.last_take_action_outputs.get(global_id, None)
    if stored_decision_step is not None and stored_take_action_outputs is not None:
        # Record this agent's latest obs, reward and action in the group bookkeeping.
        self.last_group_obs[step.team_manager_id][
            global_id
        ] = stored_decision_step.obs
        stored_actions = stored_take_action_outputs["action"]
        action_tuple = ActionTuple(
            continuous=stored_actions.continuous[idx],
            discrete=stored_actions.discrete[idx],
        )
        teammate_status = TeammateStatus(
            obs=stored_decision_step.obs,
            reward=step.reward,
            action=action_tuple,
        )
        self.teammate_status[step.team_manager_id][global_id] = teammate_status
        self.current_group_rewards[step.team_manager_id][
            global_id
        ] = step.reward
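        # The TeammateStatus recorded here is read back below when teammate_statuses is
        # assembled for the other agents in the same group.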

    # When this agent's episode ends, its entries are removed from the group bookkeeping dictionaries.
    self._delete_in_nested_dict(self.last_group_obs, global_id)
    self._delete_in_nested_dict(self.current_group_rewards, global_id)
    self._delete_in_nested_dict(self.teammate_status, global_id)

def _delete_in_nested_dict(self, nested_dict, key):
    # Delete the entry for `key` (an agent's global id) from every team manager's inner dict.
    for _manager_id, _team_group in nested_dict.items():
        if key in _team_group:
            del _team_group[key]

prev_action = self.policy.retrieve_previous_action([global_id])[0, :]

# Assemble the teammate observations, statuses and rewards.
# If none were saved, these will be empty lists.
collab_obs = []
for _id, _obs in self.last_group_obs[step.team_manager_id].items():
    if _id != global_id:
        collab_obs.append(_obs)
teammate_statuses = []
for _id, _status in self.teammate_status[step.team_manager_id].items():
    if _id != global_id:
        teammate_statuses.append(_status)
teammate_rewards = []
for _id, _rew in self.current_group_rewards[step.team_manager_id].items():
    if _id != global_id:
        teammate_rewards.append(_rew)

experience = AgentExperience(
    collab_obs=collab_obs,
    teammate_status=teammate_statuses,
    team_rewards=teammate_rewards,
    done=done,
    action=action_tuple,
    action_probs=log_probs_tuple,