
Get next critic observations into value estimate

/comms-grad
Ervin Teng, 4 years ago
Current commit
56dcd75a
6 changed files with 55 additions and 13 deletions
  1. ml-agents/mlagents/trainers/agent_processor.py (27 changed lines)
  2. ml-agents/mlagents/trainers/buffer.py (17 changed lines)
  3. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (17 changed lines)
  4. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (5 changed lines)
  5. ml-agents/mlagents/trainers/ppo/trainer.py (1 changed line)
  6. ml-agents/mlagents/trainers/trajectory.py (1 changed line)

ml-agents/mlagents/trainers/agent_processor.py (27 changed lines)


import sys
import numpy as np
from typing import List, Dict, TypeVar, Generic, Tuple, Any, Union
from collections import defaultdict, Counter
import queue

self.experience_buffers: Dict[str, List[AgentExperience]] = defaultdict(list)
self.last_experience: Dict[str, AgentExperience] = {}
self.last_step_result: Dict[str, Tuple[DecisionStep, int]] = {}
# current_obs collects the last seen obs of all agents and is used to assemble next_collab_obs.
self.current_obs: Dict[str, List[np.ndarray]] = {}
# last_take_action_outputs stores the action a_t taken before the current observation s_(t+1), while
# previous_action from the policy is the action PRIOR to that, a_(t-1).
self.last_take_action_outputs: Dict[str, ActionInfoOutputs] = {}

for terminal_step in terminal_steps.values():
local_id = terminal_step.agent_id
global_id = get_global_agent_id(worker_id, local_id)
self._assemble_trajectory(
terminal_step, global_id, terminal_steps.agent_id_to_index[local_id]
)
self._assemble_trajectory(terminal_step, global_id)
self.current_obs.clear()
# Clean the last experience dictionary for terminal steps
for terminal_step in terminal_steps.values():
local_id = terminal_step.agent_id

for ongoing_step in decision_steps.values():
local_id = ongoing_step.agent_id
global_id = get_global_agent_id(worker_id, local_id)
self._assemble_trajectory(
ongoing_step, global_id, decision_steps.agent_id_to_index[local_id]
)
self._assemble_trajectory(ongoing_step, global_id)
self.current_obs.clear()
for _gid in action_global_agent_ids:
# If the ID doesn't have a last step result, the agent just reset,

interrupted=interrupted,
memory=memory,
)
self.current_obs[global_id] = step.obs
self, step: Union[TerminalStep, DecisionStep], global_id: str, index: int
self, step: Union[TerminalStep, DecisionStep], global_id: str
# Add remaining obs to AgentExperience
# Add remaining shared obs to AgentExperience
for _id, _exp in self.last_experience.items():
if _id == global_id:
continue

or terminated
):
next_obs = step.obs
next_collab_obs = []
for _id, _exp in self.current_obs.items():
if _id == global_id:
continue
else:
next_collab_obs.append(_exp)
next_collab_obs=next_collab_obs,
behavior_id=self.behavior_id,
)
for traj_queue in self.trajectory_queues:
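For context on the assembly step above: the processor keeps each agent's last seen obs in current_obs and, when building a trajectory for one agent, gathers every other agent's obs into next_collab_obs. Below is a minimal sketch of that gathering as a standalone function; assemble_next_collab_obs is a hypothetical helper used only for illustration and is not part of this diff.

from typing import Dict, List

import numpy as np


def assemble_next_collab_obs(
    current_obs: Dict[str, List[np.ndarray]], global_id: str
) -> List[List[np.ndarray]]:
    """Collect the last seen obs of every agent except global_id.

    Mirrors the loop in _assemble_trajectory: the agent's own obs become
    next_obs, while the other agents' obs become next_collab_obs.
    """
    next_collab_obs = []
    for _id, _obs in current_obs.items():
        if _id == global_id:
            continue
        next_collab_obs.append(_obs)
    return next_collab_obs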

ml-agents/mlagents/trainers/buffer.py (17 changed lines)


# Transpose the List of Lists and stack each obs over time into a single batched array
new_list = list(map(lambda x: np.asanyarray(list(x)), zip(*obs_list)))
return new_list
@staticmethod
def obs_list_list_to_obs_batch(
obs_list_list: List[List[List[np.ndarray]]]
) -> List[List[np.ndarray]]:
"""
Convert a List of Lists of obs, where one dimension is time and the other is the number of agents
(e.g. a variable number of critic observations), to a List of obs where time is folded into the batch
dimension of each obs and the outer List runs over the variable number of agents.
"""
new_list = list(
map(
lambda x: AgentBuffer.obs_list_to_obs_batch(list(x)),
zip(*obs_list_list),
)
)
return new_list
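To make the transpose concrete, here is a small standalone shape check. It re-implements the two helpers locally rather than importing AgentBuffer, and the 3-timestep / 2-teammate / (4,)-dim shapes are illustrative assumptions.

from typing import List

import numpy as np


def obs_list_to_obs_batch(obs_list: List[List[np.ndarray]]) -> List[np.ndarray]:
    # Transpose (time, obs_spec) -> (obs_spec, time) and stack each obs over time.
    return list(map(lambda x: np.asanyarray(list(x)), zip(*obs_list)))


def obs_list_list_to_obs_batch(
    obs_list_list: List[List[List[np.ndarray]]],
) -> List[List[np.ndarray]]:
    # The outer zip swaps the (time, agent) dimensions; each agent's obs are then batched.
    return list(map(lambda x: obs_list_to_obs_batch(list(x)), zip(*obs_list_list)))


# 3 timesteps, 2 teammate agents, each with one (4,)-shaped vector obs.
critic_obs = [[[np.zeros(4)], [np.zeros(4)]] for _ in range(3)]
batched = obs_list_list_to_obs_batch(critic_obs)
print(len(batched), batched[0][0].shape)  # 2 agents, each obs batched to (3, 4)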

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (17 changed lines)


)
def get_trajectory_value_estimates(
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
self,
batch: AgentBuffer,
next_obs: List[np.ndarray],
next_critic_obs: List[List[np.ndarray]],
done: bool,
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
obs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(batch["obs"])
)

# This line doesn't work
critic_obs = [
ModelUtils.list_to_tensor_list(AgentBuffer.obs_list_to_obs_batch(agent_obs))
for agent_obs in batch["critic_obs"]
]
critic_obs_np = AgentBuffer.obs_list_list_to_obs_batch(batch["critic_obs"])
critic_obs = [
ModelUtils.list_to_tensor_list(_agent_obs) for _agent_obs in critic_obs_np
]
next_critic_obs = [
ModelUtils.list_to_tensor_list(_obs) for _obs in next_critic_obs
]
memory = torch.zeros([1, 1, self.policy.m_size])

next_value_estimate, _ = self.policy.actor_critic.critic_pass(
next_obs, next_memory, sequence_length=1
next_obs, next_memory, sequence_length=1, critic_obs=next_critic_obs
)
for name, estimate in value_estimates.items():
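As a rough shape check for the new next_critic_obs argument, the sketch below builds the expected List-of-List structure and converts it to tensors. torch.as_tensor stands in for ModelUtils.list_to_tensor_list, and the single teammate with an (8,)-dim vector obs is an assumption.

import numpy as np
import torch

# One teammate, whose final observation is a single (8,)-dim vector.
next_critic_obs = [[np.zeros(8, dtype=np.float32)]]

# Mirrors the conversion in get_trajectory_value_estimates: each teammate's
# obs list becomes a list of tensors before being handed to critic_pass.
next_critic_obs_t = [
    [torch.as_tensor(_o) for _o in _obs] for _obs in next_critic_obs
]
print(len(next_critic_obs_t), next_critic_obs_t[0][0].shape)  # 1 torch.Size([8])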

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (5 changed lines)


obs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(batch["obs"])
)
critic_obs = [
ModelUtils.list_to_tensor_list(AgentBuffer.obs_list_to_obs_batch(agent_obs))
for agent_obs in batch["critic_obs"]
]
critic_obs_np = AgentBuffer.obs_list_list_to_obs_batch(batch["critic_obs"])
critic_obs = [
ModelUtils.list_to_tensor_list(_agent_obs) for _agent_obs in critic_obs_np
]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:

ml-agents/mlagents/trainers/ppo/trainer.py (1 changed line)


value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
agent_buffer_trajectory,
trajectory.next_obs,
trajectory.next_collab_obs,
trajectory.done_reached and not trajectory.interrupted,
)

ml-agents/mlagents/trainers/trajectory.py (1 changed line)


next_obs: List[np.ndarray]  # Observation following the trajectory, for bootstrapping
next_collab_obs: List[List[np.ndarray]]
agent_id: str
behavior_id: str
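Tying the pieces together, here is a minimal sketch of how the new field travels from the trajectory to the bootstrap decision. TrajectorySketch is a stand-in NamedTuple containing only the fields used here, not the real Trajectory class, and the shapes are assumptions.

from typing import List, NamedTuple

import numpy as np


class TrajectorySketch(NamedTuple):
    next_obs: List[np.ndarray]  # the agent's own obs after the trajectory
    next_collab_obs: List[List[np.ndarray]]  # other agents' obs at that same step
    done_reached: bool
    interrupted: bool


traj = TrajectorySketch(
    next_obs=[np.zeros(8)],
    next_collab_obs=[[np.zeros(8)]],  # one teammate
    done_reached=True,
    interrupted=False,
)

# Mirrors the trainer call site: treat the trajectory as done only when the
# episode truly terminated rather than being interrupted (e.g. max steps).
done = traj.done_reached and not traj.interrupted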
