The Unity Machine Learning Agents Toolkit (ML-Agents) is an open-source project that enables games and simulations to serve as environments for training intelligent agents.
from typing import List, Dict
from collections import defaultdict
import numpy as np
from mlagents.trainers.buffer import AgentBuffer, BufferException
from mlagents.envs.policy import Policy
from mlagents.envs.exception import UnityException
from mlagents.envs.brain import BrainInfo
from mlagents.envs.action_info import ActionInfoOutputs


class AgentProcessorException(UnityException):
    """
    Related to errors with the AgentProcessor.
    """

    pass


class AgentProcessor:
    """
    AgentProcessor contains a ProcessingBuffer of AgentBuffers, indexed by agent_id.
    The experiences collected here are later appended to an update_buffer, which is the buffer used when updating the model.
    """

    def __init__(self, policy: Policy):
        self.processing_buffer = ProcessingBuffer()
        self.stats: Dict[str, List] = defaultdict(list)
        # Note: this is needed until we switch to AgentExperiences as the data input type.
        # We still need some info from the policy (memories, previous actions)
        # that really should be gathered by the env-manager.
        self.policy = policy
        self.episode_steps: Dict[str, int] = {}

    def __str__(self):
        return "local_buffers :\n{0}".format(
            "\n".join(
                [
                    "\tagent {0} :{1}".format(k, str(self.processing_buffer[k]))
                    for k in self.processing_buffer.keys()
                ]
            )
        )

    def reset_local_buffers(self) -> None:
        """
        Resets all the local AgentBuffers.
        """
        agent_ids = list(self.processing_buffer.keys())
        for k in agent_ids:
            self.processing_buffer[k].reset_agent()

    def add_experiences(
        self,
        curr_info: BrainInfo,
        next_info: BrainInfo,
        take_action_outputs: ActionInfoOutputs,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param curr_info: current BrainInfo.
        :param next_info: next BrainInfo.
        :param take_action_outputs: The outputs of the Policy's get_action method.
        """
        if take_action_outputs:
            self.stats["Policy/Entropy"].append(take_action_outputs["entropy"].mean())
            self.stats["Policy/Learning Rate"].append(
                take_action_outputs["learning_rate"]
            )
            for name, values in take_action_outputs["value_heads"].items():
                self.stats[name].append(np.mean(values))
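        # Remember the most recent BrainInfo and policy outputs for each agent;
        # the loop below pairs this stored step with next_info to form experiences.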
        for agent_id in curr_info.agents:
            self.processing_buffer[agent_id].last_brain_info = curr_info
            self.processing_buffer[
                agent_id
            ].last_take_action_outputs = take_action_outputs

        # Store the environment reward
        tmp_environment = np.array(next_info.rewards)

        for agent_id in next_info.agents:
            stored_info = self.processing_buffer[agent_id].last_brain_info
            stored_take_action_outputs = self.processing_buffer[
                agent_id
            ].last_take_action_outputs
            if stored_info is not None:
                idx = stored_info.agents.index(agent_id)
                next_idx = next_info.agents.index(agent_id)
                if not stored_info.local_done[idx]:
                    for i, _ in enumerate(stored_info.visual_observations):
                        self.processing_buffer[agent_id]["visual_obs%d" % i].append(
                            stored_info.visual_observations[i][idx]
                        )
                        self.processing_buffer[agent_id][
                            "next_visual_obs%d" % i
                        ].append(next_info.visual_observations[i][next_idx])
                    if self.policy.use_vec_obs:
                        self.processing_buffer[agent_id]["vector_obs"].append(
                            stored_info.vector_observations[idx]
                        )
                        self.processing_buffer[agent_id]["next_vector_in"].append(
                            next_info.vector_observations[next_idx]
                        )
                    if self.policy.use_recurrent:
                        self.processing_buffer[agent_id]["memory"].append(
                            self.policy.retrieve_memories([agent_id])[0, :]
                        )
                    self.processing_buffer[agent_id]["masks"].append(1.0)
                    self.processing_buffer[agent_id]["done"].append(
                        next_info.local_done[next_idx]
                    )
                    # Add the outputs of the last eval
                    self.add_policy_outputs(stored_take_action_outputs, agent_id, idx)
                    # Store action masks if necessary. Eventually these will be
                    # None for continuous actions
                    if stored_info.action_masks[idx] is not None:
                        self.processing_buffer[agent_id]["action_mask"].append(
                            stored_info.action_masks[idx], padding_value=1
                        )
                    # TODO: This should be done by the env_manager, and put it in
                    # the AgentExperience
                    self.processing_buffer[agent_id]["prev_action"].append(
                        self.policy.retrieve_previous_action([agent_id])[0, :]
                    )
                    values = stored_take_action_outputs["value_heads"]
                    # Add the value outputs if needed
                    self.processing_buffer[agent_id]["environment_rewards"].append(
                        tmp_environment
                    )
                    for name, value in values.items():
                        self.processing_buffer[agent_id][
                            "{}_value_estimates".format(name)
                        ].append(value[idx][0])
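                # Count the steps this agent has taken in its current episode.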
                if not next_info.local_done[next_idx]:
                    if agent_id not in self.episode_steps:
                        self.episode_steps[agent_id] = 0
                    self.episode_steps[agent_id] += 1
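        # Remember the actions just taken so they can be retrieved as
        # "prev_action" on the next step.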
        self.policy.save_previous_action(
            curr_info.agents, take_action_outputs["action"]
        )

    def add_policy_outputs(
        self, take_action_outputs: ActionInfoOutputs, agent_id: str, agent_idx: int
    ) -> None:
        """
        Takes the output of the last action and stores it into the training buffer.
        """
        actions = take_action_outputs["action"]
        if self.policy.use_continuous_act:
            actions_pre = take_action_outputs["pre_action"]
            self.processing_buffer[agent_id]["actions_pre"].append(
                actions_pre[agent_idx]
            )
            epsilons = take_action_outputs["random_normal_epsilon"]
            self.processing_buffer[agent_id]["random_normal_epsilon"].append(
                epsilons[agent_idx]
            )
        a_dist = take_action_outputs["log_probs"]
        # value is a dictionary from name of reward to value estimate of the value head
        self.processing_buffer[agent_id]["actions"].append(actions[agent_idx])
        self.processing_buffer[agent_id]["action_probs"].append(a_dist[agent_idx])

    def process_experiences(self):
        pass


class ProcessingBuffer(dict):
    """
    ProcessingBuffer is a dictionary of AgentBuffers, indexed by agent_id.
    """

    def __str__(self):
        return "local_buffers :\n{0}".format(
            "\n".join(["\tagent {0} :{1}".format(k, str(self[k])) for k in self.keys()])
        )
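    # Looking up an unknown agent_id lazily creates an empty AgentBuffer for it,
    # so callers can append to self[agent_id] without checking for existence first.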
    def __getitem__(self, key):
        if key not in self.keys():
            self[key] = AgentBuffer()
        return super().__getitem__(key)

    def reset_local_buffers(self) -> None:
        """
        Resets all the local AgentBuffers.
        """
        for buf in self.values():
            buf.reset_agent()

    def append_to_update_buffer(
        self,
        update_buffer: AgentBuffer,
        agent_id: str,
        key_list: List[str] = None,
        batch_size: int = None,
        training_length: int = None,
    ) -> None:
        """
        Appends the buffer of an agent to the update buffer.
        :param update_buffer: A reference to an AgentBuffer to append the agent's buffer to.
        :param agent_id: The id of the agent whose data will be appended.
        :param key_list: The fields that must be added. If None, all fields will be appended.
        :param batch_size: The number of elements that must be appended. If None, all of them will be.
        :param training_length: The length of the samples that must be appended. If None, only takes one element.
        """
        if key_list is None:
            key_list = self[agent_id].keys()
        if not self[agent_id].check_length(key_list):
            raise BufferException(
                "The fields {0} for agent {1} are not all of the same length".format(
                    key_list, agent_id
                )
            )
        for field_key in key_list:
            update_buffer[field_key].extend(
                self[agent_id][field_key].get_batch(
                    batch_size=batch_size, training_length=training_length
                )
            )

    def append_all_agent_batch_to_update_buffer(
        self,
        update_buffer: AgentBuffer,
        key_list: List[str] = None,
        batch_size: int = None,
        training_length: int = None,
    ) -> None:
        """
        Appends the buffers of all agents to the update buffer.
        :param key_list: The fields that must be added. If None, all fields will be appended.
        :param batch_size: The number of elements that must be appended. If None, all of them will be.
        :param training_length: The length of the samples that must be appended. If None, only takes one element.
        """
        for agent_id in self.keys():
            self.append_to_update_buffer(
                update_buffer, agent_id, key_list, batch_size, training_length
            )
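# Example usage (a minimal sketch): it assumes a concrete Policy implementation
# `policy`, and that `policy.get_action(curr_info)` returns an ActionInfo whose
# `.outputs` dict matches what add_experiences expects (per its docstring).
#
#     processor = AgentProcessor(policy)
#     action_info = policy.get_action(curr_info)
#     # ... step the environment to obtain next_info ...
#     processor.add_experiences(curr_info, next_info, action_info.outputs)
#
#     # Once enough experiences are collected, copy them into an update buffer:
#     update_buffer = AgentBuffer()
#     processor.processing_buffer.append_all_agent_batch_to_update_buffer(
#         update_buffer,
#         batch_size=None,
#         training_length=policy.sequence_length,
#     )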